@@ -76,7 +76,7 @@ promethuesRules:
7676 rules :
7777 - alert : KeystoneDown
7878 for : 5m
79- expr : ' openstack_identity_up != 1'
79+ expr : openstack_identity_up != 1
8080 labels :
8181 severity : critical
8282 annotations :
@@ -88,7 +88,7 @@ promethuesRules:
8888 rules :
8989 - alert : GlanceDown
9090 for : 5m
91- expr : ' openstack_glance_up != 1'
91+ expr : openstack_glance_up != 1
9292 labels :
9393 severity : critical
9494 annotations :
@@ -100,7 +100,7 @@ promethuesRules:
100100 rules :
101101 - alert : CinderDown
102102 for : 5m
103- expr : ' openstack_cinder_up != 1'
103+ expr : openstack_cinder_up != 1
104104 labels :
105105 severity : critical
106106 annotations :
@@ -109,68 +109,65 @@ promethuesRules:
109109
110110 - alert : CinderAgentDown
111111 for : 5m
112- expr : |
113- openstack_cinder_agent_state{adminState="enabled"} != 1
112+ expr : openstack_cinder_agent_state{adminState="enabled"} != 1
114113 labels :
115114 severity : critical
116115 annotations :
117- summary : " [ `{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down"
116+ summary : " `{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` down"
118117 description : >
119118 The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}`
120- is being reported as down for 5 minutes. This can affect volume operations so it must
119+ is being reported as down for 5 minutes. This can affect volume operations so it must
121120 be resolved as quickly as possible.
122121
123122 - alert : CinderAgentDisabled
124123 for : 1h
125- expr : |
126- openstack_cinder_agent_state{adminState!="enabled"}
124+ expr : openstack_cinder_agent_state{adminState!="enabled"}
127125 labels :
128126 severity : warning
129127 annotations :
130- summary : " [ `{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled"
128+ summary : " `{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` disabled"
131129 description : >
132130 The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}`
133- has been disabled for 60 minutes. This can affect volume operations so it must be resolved
131+ has been disabled for 60 minutes. This can affect volume operations so it must be resolved
134132 as quickly as possible.
135133
136134 - alert : CinderVolumeInError
137- for : 24h
138- expr : |
139- openstack_cinder_volume_status{status=~"error.*"}
135+ for : 15m
136+ expr : openstack_cinder_volume_status{status=~"error.*"}
140137 labels :
141- severity : warning
138+ severity : critical
142139 annotations :
143- summary : " [ `{{`{{$labels.id }}`}}`] Volume in ERROR state"
140+ summary : " `{{`{{$labels.name }}`}}` Volume in ERROR state"
144141 description : >
145- The volume `{{`{{$labels.id}}`}}` has been in ERROR state for over 24 hours. It must
146- be cleaned up or removed in order to provide a consistent customer experience.
142+ The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes.
143+ It must be cleaned up or removed in order to provide a consistent customer experience.
147144
148145 - alert : CinderVolumeInDeleting
149- expr : ' openstack_cinder_volume_status == 7'
150- for : 10m
146+ expr : openstack_cinder_volume_status == 7
147+ for : 15m
151148 labels :
152- severity : warning
149+ severity : warning
153150 annotations :
154- summary : " [ `{{`{{$labels.id }}`}}`] Volume in deleting state"
155- description : >
156- The volume `{{`{{$labels.id}}`}}` is stuck in deleting status for more than 10 minutes"
151+ summary : " `{{`{{$labels.name }}`}}` Volume in DELETING state"
152+ description : >
153+ The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes.
157154
158155 - alert : CinderVolumeInCreating
159- expr : ' openstack_cinder_volume_status == 0'
160- for : 10m
156+ expr : openstack_cinder_volume_status == 0
157+ for : 15m
161158 labels :
162159 severity : warning
163160 annotations :
164- summary : " [ `{{`{{$labels.id }}`}}`] Volume in creating state"
161+ summary : " `{{`{{$labels.name }}`}}` Volume in CREATING state"
165162 description : >
166- The volume `{{`{{$labels.id}}`}}` is stuck in deleting status for more than 10 minutes"
163+ The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes.
167164
168165 neutron :
169166 enabled : true
170167 rules :
171168 - alert : NeutronDown
172169 for : 5m
173- expr : ' openstack_neutron_up != 1'
170+ expr : openstack_neutron_up != 1
174171 labels :
175172 severity : critical
176173 annotations :
@@ -179,37 +176,34 @@ promethuesRules:
179176
180177 - alert : NeutronAgentDown
181178 for : 5m
182- expr : |
183- openstack_neutron_agent_state{adminState="up"} != 1
179+ expr : openstack_neutron_agent_state{adminState="up"} != 1
184180 labels :
185181 severity : critical
186182 annotations :
187- summary : " [ `{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down"
183+ summary : " `{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` down"
188184 description : >
189185 The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}`
190186 is being reported as down for 5 minutes. This can affect network operations so it must
191187 be resolved as quickly as possible.
192188
193189 - alert : NeutronAgentDisabled
194190 for : 1h
195- expr : |
196- openstack_neutron_agent_state{adminState!="up"}
191+ expr : openstack_neutron_agent_state{adminState!="up"}
197192 labels :
198193 severity : warning
199194 annotations :
200- summary : " [ `{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled"
195+ summary : " `{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` disabled"
201196 description : >
202197 The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}`
203- has been disabled for 60 minutes. This can affect network operations so it must be resolved
198+ has been disabled for 60 minutes. This can affect network operations so it must be resolved
204199 as quickly as possible.
205200
206201 - alert : NeutronBindingFailedPorts
207- expr : |
208- openstack_neutron_port{binding_vif_type="binding_failed"} != 0
202+ expr : openstack_neutron_port{binding_vif_type="binding_failed"} != 0
209203 labels :
210204 severity : warning
211205 annotations :
212- summary : " [ `{{`{{$labels.device_owner}}`}}`] `{{`{{$labels.mac_address}}`}}` binding failed"
206+ summary : " `{{`{{$labels.device_owner}}`}}` `{{`{{$labels.mac_address}}`}}` binding failed"
213207 description : >
214208 The NIC `{{`{{$labels.mac_address}}`}}` of `{{`{{$labels.device_owner}}`}}`
215209 has binding failed port now.
@@ -220,18 +214,18 @@ promethuesRules:
220214 labels :
221215 severity : warning
222216 annotations :
223- summary : " [ `{{`{{$labels.network_name}}`}}`] `{{`{{$labels.subnet_name}}`}}` running out of IPs"
217+ summary : " `{{`{{$labels.network_name}}`}}` `{{`{{$labels.subnet_name}}`}}` running out of IPs"
224218 description : >
225219 The subnet `{{`{{$labels.subnet_name}}`}}` within `{{`{{$labels.network_name}}`}}`
226- is currently at `{{`{{$value}}`}}`% utilization. If the IP addresses run out, it will
220+ is currently at `{{`{{$value}}`}}`% utilization. If the IP addresses run out, it will
227221 impact the provisioning of new ports.
228222
229223 nova :
230224 enabled : true
231225 rules :
232226 - alert : NovaDown
233227 for : 5m
234- expr : ' openstack_nova_up != 1'
228+ expr : openstack_nova_up != 1
235229 labels :
236230 severity : critical
237231 annotations :
@@ -240,91 +234,88 @@ promethuesRules:
240234
241235 - alert : NovaAgentDown
242236 for : 5m
243- expr : |
244- openstack_nova_agent_state{adminState="enabled"} != 1
237+ expr : openstack_nova_agent_state{adminState="enabled"} != 1
245238 labels :
246239 severity : critical
247240 annotations :
248- summary : " [ `{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down"
241+ summary : " `{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` down"
249242 description : >
250243 The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}`
251- is being reported as down. This can affect compute operations so it must be resolved
244+ is being reported as down. This can affect compute operations so it must be resolved
252245 as quickly as possible.
253246
254247 - alert : NovaAgentDisabled
255248 for : 1h
256- expr : |
257- openstack_nova_agent_state{adminState!="enabled"}
249+ expr : openstack_nova_agent_state{adminState!="enabled"}
258250 labels :
259251 severity : warning
260252 annotations :
261- summary : " [ `{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled"
253+ summary : " `{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` disabled"
262254 description : >
263255 The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}`
264- has been disabled for 60 minutes. This can affect compute operations so it must be resolved
256+ has been disabled for 60 minutes. This can affect compute operations so it must be resolved
265257 as quickly as possible.
266258
267259 - alert : NovaInstanceInError
268260 for : 10m
269- expr : |
270- openstack_nova_server_status{status="ERROR"}
261+ expr : openstack_nova_server_status{status="ERROR"}
271262 labels :
272263 severity : critical
273264 annotations :
274- summary : " [ `{{`{{$labels.id }}`}}`] Instance in ERROR state"
265+ summary : " `{{`{{$labels.name }}`}}` Instance in `{{`{{$labels.status}}`}}` state"
275266 description : >
276- The instance `{{`{{$labels.id}}`}}` has been in ERROR state for over 10 minutes. It must
277- be cleaned up or removed in order to provide a consistent customer experience.
267+ The instance `{{`{{$labels.id}}`}}` on host `{{`{{$labels.hypervisor_hostname}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 10 minutes.
268+ It must be cleaned up or removed in order to provide a consistent customer experience.
278269
279270 - alert : NovaInstanceInBuilding
280271 for : 15m
281- expr : ' openstack_nova_server_status == 1'
272+ expr : openstack_nova_server_status == 1
282273 labels :
283274 severity : critical
284275 annotations :
285- summary : " [ `{{`{{$labels.id }}`}}`] Instance in BUILD state"
276+ summary : " `{{`{{$labels.name }}`}}` Instance in `{{`{{$labels.status}}`}}` state"
286277 description : >
287- The instance `{{`{{$labels.id}}`}}` has been in BUILD state for over 15 minutes.
278+ The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes.
288279
289280 - alert : NovaInstanceInRESIZE
290281 for : 15m
291- expr : ' openstack_nova_server_status == 10'
282+ expr : openstack_nova_server_status == 10
292283 labels :
293284 severity : critical
294285 annotations :
295- summary : " [ `{{`{{$labels.id }}`}}`] Instance in RESIZE state"
286+ summary : " `{{`{{$labels.name }}`}}` Instance in `{{`{{$labels.status}}`}}` state"
296287 description : >
297- The instance `{{`{{$labels.id}}`}}` has been in RESIZE state for over 15 minutes.
288+ The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes.
298289
299290 - alert : NovaInstanceInUNKNOWN
300291 for : 15m
301- expr : ' openstack_nova_server_status == 13'
292+ expr : openstack_nova_server_status == 13
302293 labels :
303294 severity : critical
304295 annotations :
305- summary : " [ `{{`{{$labels.id }}`}}`] Instance in UNKNOWN state"
296+ summary : " `{{`{{$labels.name }}`}}` Instance in `{{`{{$labels.status}}`}}` state"
306297 description : >
307- The instance `{{`{{$labels.id}}`}}` has been in UNKNOWN state for over 15 minutes.
298+ The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes.
308299
309300 - alert : NovaInstanceInVERIFY_RESIZE
310301 for : 15m
311- expr : ' openstack_nova_server_status == 14'
302+ expr : openstack_nova_server_status == 14
312303 labels :
313304 severity : critical
314305 annotations :
315- summary : " [ `{{`{{$labels.id }}`}}`] Instance in VERIFY_RESIZE state"
306+ summary : " `{{`{{$labels.name }}`}}` Instance in `{{`{{$labels.status}}`}}` state"
316307 description : >
317- The instance `{{`{{$labels.id}}`}}` has been in VERIFY_RESIZE state for over 15 minutes.
308+ The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes.
318309
319310 - alert : NovaInstanceInMIGRATING
320311 for : 30m
321- expr : ' openstack_nova_server_status == 15'
312+ expr : openstack_nova_server_status == 15
322313 labels :
323314 severity : critical
324315 annotations :
325- summary : " [ `{{`{{$labels.id }}`}}`] Instance in MIGRATING state"
316+ summary : " `{{`{{$labels.name }}`}}` Instance in `{{`{{$labels.status}}`}}` state"
326317 description : >
327- The instance `{{`{{$labels.id}}`}}` has been in MIGRATING state for over 30 minutes.
318+ The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 30 minutes.
328319
329320 - alert : NovaFailureRisk
330321 for : 6h
@@ -337,7 +328,7 @@ promethuesRules:
337328 description : >
338329 The cloud capacity will be at `{{`{{$value}}`}}` in the event of the failure of a single
339330 hypervisor which puts the cloud at risk of not being able to recover should any hypervisor
340- failures occur. Please ensure that adequate amount of infrastructure is assigned to this
331+ failures occur. Please ensure that adequate amount of infrastructure is assigned to this
341332 deployment to prevent this.
342333
343334 - alert : NovaCapacityNearFull
@@ -379,15 +370,15 @@ promethuesRules:
379370 summary : " [nova] Capacity risk"
380371 description : >
381372 The cloud capacity is currently at `{{`{{$value}}`}}` which means there is a risk of running
382- out of capacity due to the timeline required to add new nodes. Please ensure that adequate
373+ out of capacity due to the timeline required to add new nodes. Please ensure that adequate
383374 amount of infrastructure is assigned to this deployment to prevent this.
384375
385376 octavia :
386377 enabled : true
387378 rules :
388379 - alert : LoadbalancerDown
389380 for : 5m
390- expr : ' openstack_loadbalancer_up != 1'
381+ expr : openstack_loadbalancer_up != 1
391382 labels :
392383 severity : critical
393384 annotations :
@@ -401,7 +392,7 @@ promethuesRules:
401392 severity : critical
402393 annotations :
403394 summary : " OpenStack loadbalancer `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE"
404- description : " OpenStack loadbalancer `{{`{{$labels.name }}`}}` provisioning status is not ACTIVE "
395+ description : " OpenStack loadbalancer `{{`{{$labels.id }}`}}` provisioning status is `{{`{{$labels.provisioning_status}}`}}` "
405396
406397 - alert : LoadbalancerPoolNotActive
407398 for : 5m
@@ -410,4 +401,25 @@ promethuesRules:
410401 severity : critical
411402 annotations :
412403 summary : " OpenStack loadbalancer pool `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE"
413- description : " OpenStack loadbalancer pool `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE"
404+ description : " OpenStack loadbalancer pool `{{`{{$labels.id}}`}}` provisioning status is `{{`{{$labels.provisioning_status}}`}}`"
405+
406+ manila :
407+ enabled : true
408+ rules :
409+ - alert : ManilaDown
410+ for : 5m
411+ expr : openstack_sharev2_up != 1
412+ labels :
413+ severity : critical
414+ annotations :
415+ summary : " OpenStack shared file system (Manila) service down"
416+ description : " OpenStack shared file system (Manila) service down"
417+
418+ - alert : ManilaStatusNotActive
419+ for : 5m
420+ expr : openstack_sharev2_share_status{status!="available"}
421+ labels :
422+ severity : critical
423+ annotations :
424+ summary : " OpenStack Share `{{`{{$labels.name}}`}}` status is not available"
425+ description : " OpenStack Share `{{`{{$labels.id}}`}}` status is `{{`{{$labels.status}}`}}`"
0 commit comments