Skip to content

Commit 08cdf71

Browse files
committed
add rule for manila
1 parent a170411 commit 08cdf71

2 files changed

Lines changed: 88 additions & 76 deletions

File tree

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
---
22
apiVersion: v1
33
name: prometheus-openstack-exporter
4-
version: 0.5.1
4+
version: 0.5.2
55
appVersion: v1.7.0

charts/prometheus-openstack-exporter/values.yaml

Lines changed: 87 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ promethuesRules:
7676
rules:
7777
- alert: KeystoneDown
7878
for: 5m
79-
expr: 'openstack_identity_up != 1'
79+
expr: openstack_identity_up != 1
8080
labels:
8181
severity: critical
8282
annotations:
@@ -88,7 +88,7 @@ promethuesRules:
8888
rules:
8989
- alert: GlanceDown
9090
for: 5m
91-
expr: 'openstack_glance_up != 1'
91+
expr: openstack_glance_up != 1
9292
labels:
9393
severity: critical
9494
annotations:
@@ -100,7 +100,7 @@ promethuesRules:
100100
rules:
101101
- alert: CinderDown
102102
for: 5m
103-
expr: 'openstack_cinder_up != 1'
103+
expr: openstack_cinder_up != 1
104104
labels:
105105
severity: critical
106106
annotations:
@@ -109,68 +109,65 @@ promethuesRules:
109109

110110
- alert: CinderAgentDown
111111
for: 5m
112-
expr: |
113-
openstack_cinder_agent_state{adminState="enabled"} != 1
112+
expr: openstack_cinder_agent_state{adminState="enabled"} != 1
114113
labels:
115114
severity: critical
116115
annotations:
117-
summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down"
116+
summary: "`{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` down"
118117
description: >
119118
The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}`
120-
is being reported as down for 5 minutes. This can affect volume operations so it must
119+
is being reported as down for 5 minutes. This can affect volume operations so it must
121120
be resolved as quickly as possible.
122121
123122
- alert: CinderAgentDisabled
124123
for: 1h
125-
expr: |
126-
openstack_cinder_agent_state{adminState!="enabled"}
124+
expr: openstack_cinder_agent_state{adminState!="enabled"}
127125
labels:
128126
severity: warning
129127
annotations:
130-
summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled"
128+
summary: "`{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` disabled"
131129
description: >
132130
The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}`
133-
has been disabled for 60 minutes. This can affect volume operations so it must be resolved
131+
has been disabled for 60 minutes. This can affect volume operations so it must be resolved
134132
as quickly as possible.
135133
136134
- alert: CinderVolumeInError
137-
for: 24h
138-
expr: |
139-
openstack_cinder_volume_status{status=~"error.*"}
135+
for: 15m
136+
expr: openstack_cinder_volume_status{status=~"error.*"}
140137
labels:
141-
severity: warning
138+
severity: critical
142139
annotations:
143-
summary: "[`{{`{{$labels.id}}`}}`] Volume in ERROR state"
140+
summary: "`{{`{{$labels.name}}`}}` Volume in ERROR state"
144141
description: >
145-
The volume `{{`{{$labels.id}}`}}` has been in ERROR state for over 24 hours. It must
146-
be cleaned up or removed in order to provide a consistent customer experience.
142+
The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes.
143+
It must be cleaned up or removed in order to provide a consistent customer experience.
147144
148145
- alert: CinderVolumeInDeleting
149-
expr: 'openstack_cinder_volume_status == 7'
150-
for: 10m
146+
expr: openstack_cinder_volume_status == 7
147+
for: 15m
151148
labels:
152-
severity: warning
149+
severity: warning
153150
annotations:
154-
summary: "[`{{`{{$labels.id}}`}}`] Volume in deleting state"
155-
description: >
156-
The volume `{{`{{$labels.id}}`}}` is stuck in deleting status for more than 10 minutes"
151+
summary: "`{{`{{$labels.name}}`}}` Volume in DELETING state"
152+
description: >
153+
The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes.
157154
158155
- alert: CinderVolumeInCreating
159-
expr: 'openstack_cinder_volume_status == 0'
160-
for: 10m
156+
expr: openstack_cinder_volume_status == 0
157+
for: 15m
161158
labels:
162159
severity: warning
163160
annotations:
164-
summary: "[`{{`{{$labels.id}}`}}`] Volume in creating state"
161+
summary: "`{{`{{$labels.name}}`}}` Volume in CREATING state"
165162
description: >
166-
The volume `{{`{{$labels.id}}`}}` is stuck in deleting status for more than 10 minutes"
163+
The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes.
167164
168165
neutron:
169166
enabled: true
170167
rules:
171168
- alert: NeutronDown
172169
for: 5m
173-
expr: 'openstack_neutron_up != 1'
170+
expr: openstack_neutron_up != 1
174171
labels:
175172
severity: critical
176173
annotations:
@@ -179,37 +176,34 @@ promethuesRules:
179176

180177
- alert: NeutronAgentDown
181178
for: 5m
182-
expr: |
183-
openstack_neutron_agent_state{adminState="up"} != 1
179+
expr: openstack_neutron_agent_state{adminState="up"} != 1
184180
labels:
185181
severity: critical
186182
annotations:
187-
summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down"
183+
summary: "`{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` down"
188184
description: >
189185
The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}`
190186
is being reported as down for 5 minutes. This can affect network operations so it must
191187
be resolved as quickly as possible.
192188
193189
- alert: NeutronAgentDisabled
194190
for: 1h
195-
expr: |
196-
openstack_neutron_agent_state{adminState!="up"}
191+
expr: openstack_neutron_agent_state{adminState!="up"}
197192
labels:
198193
severity: warning
199194
annotations:
200-
summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled"
195+
summary: "`{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` disabled"
201196
description: >
202197
The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}`
203-
has been disabled for 60 minutes. This can affect network operations so it must be resolved
198+
has been disabled for 60 minutes. This can affect network operations so it must be resolved
204199
as quickly as possible.
205200
206201
- alert: NeutronBindingFailedPorts
207-
expr: |
208-
openstack_neutron_port{binding_vif_type="binding_failed"} != 0
202+
expr: openstack_neutron_port{binding_vif_type="binding_failed"} != 0
209203
labels:
210204
severity: warning
211205
annotations:
212-
summary: "[`{{`{{$labels.device_owner}}`}}`] `{{`{{$labels.mac_address}}`}}` binding failed"
206+
summary: "`{{`{{$labels.device_owner}}`}}` `{{`{{$labels.mac_address}}`}}` binding failed"
213207
description: >
214208
The NIC `{{`{{$labels.mac_address}}`}}` of `{{`{{$labels.device_owner}}`}}`
215209
has binding failed port now.
@@ -220,18 +214,18 @@ promethuesRules:
220214
labels:
221215
severity: warning
222216
annotations:
223-
summary: "[`{{`{{$labels.network_name}}`}}`] `{{`{{$labels.subnet_name}}`}}` running out of IPs"
217+
summary: "`{{`{{$labels.network_name}}`}}` `{{`{{$labels.subnet_name}}`}}` running out of IPs"
224218
description: >
225219
The subnet `{{`{{$labels.subnet_name}}`}}` within `{{`{{$labels.network_name}}`}}`
226-
is currently at `{{`{{$value}}`}}`% utilization. If the IP addresses run out, it will
220+
is currently at `{{`{{$value}}`}}`% utilization. If the IP addresses run out, it will
227221
impact the provisioning of new ports.
228222
229223
nova:
230224
enabled: true
231225
rules:
232226
- alert: NovaDown
233227
for: 5m
234-
expr: 'openstack_nova_up != 1'
228+
expr: openstack_nova_up != 1
235229
labels:
236230
severity: critical
237231
annotations:
@@ -240,91 +234,88 @@ promethuesRules:
240234

241235
- alert: NovaAgentDown
242236
for: 5m
243-
expr: |
244-
openstack_nova_agent_state{adminState="enabled"} != 1
237+
expr: openstack_nova_agent_state{adminState="enabled"} != 1
245238
labels:
246239
severity: critical
247240
annotations:
248-
summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down"
241+
summary: "`{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` down"
249242
description: >
250243
The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}`
251-
is being reported as down. This can affect compute operations so it must be resolved
244+
is being reported as down. This can affect compute operations so it must be resolved
252245
as quickly as possible.
253246
254247
- alert: NovaAgentDisabled
255248
for: 1h
256-
expr: |
257-
openstack_nova_agent_state{adminState!="enabled"}
249+
expr: openstack_nova_agent_state{adminState!="enabled"}
258250
labels:
259251
severity: warning
260252
annotations:
261-
summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled"
253+
summary: "`{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` disabled"
262254
description: >
263255
The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}`
264-
has been disabled for 60 minutes. This can affect compute operations so it must be resolved
256+
has been disabled for 60 minutes. This can affect compute operations so it must be resolved
265257
as quickly as possible.
266258
267259
- alert: NovaInstanceInError
268260
for: 10m
269-
expr: |
270-
openstack_nova_server_status{status="ERROR"}
261+
expr: openstack_nova_server_status{status="ERROR"}
271262
labels:
272263
severity: critical
273264
annotations:
274-
summary: "[`{{`{{$labels.id}}`}}`] Instance in ERROR state"
265+
summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state"
275266
description: >
276-
The instance `{{`{{$labels.id}}`}}` has been in ERROR state for over 10 minutes. It must
277-
be cleaned up or removed in order to provide a consistent customer experience.
267+
The instance `{{`{{$labels.id}}`}}` on host `{{`{{$labels.hypervisor_hostname}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 10 minutes.
268+
It must be cleaned up or removed in order to provide a consistent customer experience.
278269
279270
- alert: NovaInstanceInBuilding
280271
for: 15m
281-
expr: 'openstack_nova_server_status == 1'
272+
expr: openstack_nova_server_status == 1
282273
labels:
283274
severity: critical
284275
annotations:
285-
summary: "[`{{`{{$labels.id}}`}}`] Instance in BUILD state"
276+
summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state"
286277
description: >
287-
The instance `{{`{{$labels.id}}`}}` has been in BUILD state for over 15 minutes.
278+
The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes.
288279
289280
- alert: NovaInstanceInRESIZE
290281
for: 15m
291-
expr: 'openstack_nova_server_status == 10'
282+
expr: openstack_nova_server_status == 10
292283
labels:
293284
severity: critical
294285
annotations:
295-
summary: "[`{{`{{$labels.id}}`}}`] Instance in RESIZE state"
286+
summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state"
296287
description: >
297-
The instance `{{`{{$labels.id}}`}}` has been in RESIZE state for over 15 minutes.
288+
The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes.
298289
299290
- alert: NovaInstanceInUNKNOWN
300291
for: 15m
301-
expr: 'openstack_nova_server_status == 13'
292+
expr: openstack_nova_server_status == 13
302293
labels:
303294
severity: critical
304295
annotations:
305-
summary: "[`{{`{{$labels.id}}`}}`] Instance in UNKNOWN state"
296+
summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state"
306297
description: >
307-
The instance `{{`{{$labels.id}}`}}` has been in UNKNOWN state for over 15 minutes.
298+
The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes.
308299
309300
- alert: NovaInstanceInVERIFY_RESIZE
310301
for: 15m
311-
expr: 'openstack_nova_server_status == 14'
302+
expr: openstack_nova_server_status == 14
312303
labels:
313304
severity: critical
314305
annotations:
315-
summary: "[`{{`{{$labels.id}}`}}`] Instance in VERIFY_RESIZE state"
306+
summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state"
316307
description: >
317-
The instance `{{`{{$labels.id}}`}}` has been in VERIFY_RESIZE state for over 15 minutes.
308+
The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes.
318309
319310
- alert: NovaInstanceInMIGRATING
320311
for: 30m
321-
expr: 'openstack_nova_server_status == 15'
312+
expr: openstack_nova_server_status == 15
322313
labels:
323314
severity: critical
324315
annotations:
325-
summary: "[`{{`{{$labels.id}}`}}`] Instance in MIGRATING state"
316+
summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state"
326317
description: >
327-
The instance `{{`{{$labels.id}}`}}` has been in MIGRATING state for over 30 minutes.
318+
The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 30 minutes.
328319
329320
- alert: NovaFailureRisk
330321
for: 6h
@@ -337,7 +328,7 @@ promethuesRules:
337328
description: >
338329
The cloud capacity will be at `{{`{{$value}}`}}` in the event of the failure of a single
339330
hypervisor which puts the cloud at risk of not being able to recover should any hypervisor
340-
failures occur. Please ensure that adequate amount of infrastructure is assigned to this
331+
failures occur. Please ensure that adequate amount of infrastructure is assigned to this
341332
deployment to prevent this.
342333
343334
- alert: NovaCapacityNearFull
@@ -379,15 +370,15 @@ promethuesRules:
379370
summary: "[nova] Capacity risk"
380371
description: >
381372
The cloud capacity is currently at `{{`{{$value}}`}}` which means there is a risk of running
382-
out of capacity due to the timeline required to add new nodes. Please ensure that adequate
373+
out of capacity due to the timeline required to add new nodes. Please ensure that adequate
383374
amount of infrastructure is assigned to this deployment to prevent this.
384375
385376
octavia:
386377
enabled: true
387378
rules:
388379
- alert: LoadbalancerDown
389380
for: 5m
390-
expr: 'openstack_loadbalancer_up != 1'
381+
expr: openstack_loadbalancer_up != 1
391382
labels:
392383
severity: critical
393384
annotations:
@@ -401,7 +392,7 @@ promethuesRules:
401392
severity: critical
402393
annotations:
403394
summary: "OpenStack loadbalancer `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE"
404-
description: "OpenStack loadbalancer `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE"
395+
description: "OpenStack loadbalancer `{{`{{$labels.id}}`}}` provisioning status is `{{`{{$labels.provisioning_status}}`}}`"
405396

406397
- alert: LoadbalancerPoolNotActive
407398
for: 5m
@@ -410,4 +401,25 @@ promethuesRules:
410401
severity: critical
411402
annotations:
412403
summary: "OpenStack loadbalancer pool `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE"
413-
description: "OpenStack loadbalancer pool `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE"
404+
description: "OpenStack loadbalancer pool `{{`{{$labels.id}}`}}` provisioning status is `{{`{{$labels.provisioning_status}}`}}`"
405+
406+
manila:
407+
enabled: true
408+
rules:
409+
- alert: ManilaDown
410+
for: 5m
411+
expr: openstack_sharev2_up != 1
412+
labels:
413+
severity: critical
414+
annotations:
415+
summary: "OpenStack shared file system (Manila) service down"
416+
description: "OpenStack shared file system (Manila) service down"
417+
418+
- alert: ManilaStatusNotActive
419+
for: 5m
420+
expr: openstack_sharev2_share_status{status!="available"}
421+
labels:
422+
severity: critical
423+
annotations:
424+
summary: "OpenStack Share `{{`{{$labels.name}}`}}` status is not available"
425+
description: "OpenStack Share `{{`{{$labels.id}}`}}` status is `{{`{{$labels.status}}`}}`"

0 commit comments

Comments
 (0)