Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 31 additions & 14 deletions helm/bundles/cortex-nova/alerts/nova.alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ groups:

- alert: CortexNovaCommittedResourceRejectionRateTooHigh
expr: |
rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[5m])
sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[5m]))
/ sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[5m])) > 0.5
for: 5m
labels:
Expand Down Expand Up @@ -486,7 +486,10 @@ groups:

# Committed Resource Syncer Alerts
- alert: CortexNovaCommittedResourceSyncerNotRunning
expr: increase(cortex_committed_resource_syncer_runs_total{service="cortex-nova-metrics"}[2h]) == 0
expr: |
increase(cortex_committed_resource_syncer_runs_total{service="cortex-nova-metrics"}[2h]) == 0
or
absent(cortex_committed_resource_syncer_runs_total{service="cortex-nova-metrics"})
for: 5m
labels:
context: committed-resource-syncer
Expand All @@ -497,8 +500,10 @@ groups:
annotations:
summary: "Committed Resource syncer not running"
description: >
The committed resource syncer has not run in the last 2 hours. This indicates
that the syncer may have stopped or is encountering errors. Check the syncer logs for errors.
The committed resource syncer has not run in the last 2 hours or the metric is missing.
This indicates that the syncer may have stopped, is encountering errors, or the feature
is not enabled. Check the syncer logs for errors or verify the commitments-sync-task is
in the enabledTasks configuration.

- alert: CortexNovaCommittedResourceSyncerErrorsHigh
expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3
Expand All @@ -517,8 +522,11 @@ groups:

- alert: CortexNovaCommittedResourceSyncerUnitMismatchRateHigh
expr: |
rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h])
/ rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) > 0.05
(
sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h]))
/ sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]))
) > 0.05
and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0
for: 15m
labels:
context: committed-resource-syncer
Expand All @@ -537,8 +545,11 @@ groups:

- alert: CortexNovaCommittedResourceSyncerUnknownFlavorGroupRateHigh
expr: |
rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h])
/ rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) > 0
(
sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h]))
/ sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]))
) > 0
and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0
for: 15m
labels:
context: committed-resource-syncer
Expand All @@ -557,10 +568,13 @@ groups:
- alert: CortexNovaCommittedResourceSyncerLocalChangeRateHigh
expr: |
(
rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) +
rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) +
rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h])
) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0.01
(
rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) +
rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) +
rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h])
) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h])
) > 0.01
and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0
for: 15m
labels:
context: committed-resource-syncer
Expand All @@ -578,8 +592,11 @@ groups:

- alert: CortexNovaCommittedResourceSyncerRepairRateHigh
expr: |
rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h])
/ rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0
(
rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h])
/ rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h])
) > 0
and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0
for: 15m
labels:
context: committed-resource-syncer
Expand Down
28 changes: 28 additions & 0 deletions helm/bundles/cortex-nova/templates/pipelines_kvm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,20 @@ spec:
from the nova scheduler request spec. It supports filtering by host and
by aggregates. Aggregates use AND logic between list elements, with
comma-separated UUIDs within an element using OR logic.
- name: filter_committed_resource_bookkeeping
description: |
Bookkeeping for committed resource (CR) reservations. Note that unlocking
of CR capacity happens in filter_has_enough_capacity when project ID and
resource group (hw_version) match. This filter handles additional tasks:
tracking which VMs are expected to land on which CR reservations by
updating reservation spec allocations. In the future, this filter will
also enforce that VMs use available CR reservation slots when sufficient
slots exist among candidates.
params:
# Enable updating CR reservation allocations with VM assignments
- {key: updateReservationAllocations, boolValue: true}
# Future: enforce reservation slots (not yet implemented)
- {key: enforceReservationSlots, boolValue: false}
weighers:
- name: kvm_prefer_smaller_hosts
params:
Expand Down Expand Up @@ -241,6 +255,20 @@ spec:
from the nova scheduler request spec. It supports filtering by host and
by aggregates. Aggregates use AND logic between list elements, with
comma-separated UUIDs within an element using OR logic.
- name: filter_committed_resource_bookkeeping
description: |
Bookkeeping for committed resource (CR) reservations. Note that unlocking
of CR capacity happens in filter_has_enough_capacity when project ID and
resource group (hw_version) match. This filter handles additional tasks:
tracking which VMs are expected to land on which CR reservations by
updating reservation spec allocations. In the future, this filter will
also enforce that VMs use available CR reservation slots when sufficient
slots exist among candidates.
params:
# Enable updating CR reservation allocations with VM assignments
- {key: updateReservationAllocations, boolValue: true}
# Future: enforce reservation slots (not yet implemented)
- {key: enforceReservationSlots, boolValue: false}
weighers:
- name: kvm_prefer_smaller_hosts
params:
Expand Down
Loading
Loading