# Copyright (c) 2022 VEXXHOST, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

# Ensure the Keystone account named "openstack-exporter-<region>" exists in
# the "service" domain, with "service" as its default project.
- name: Create keystone user
  openstack.cloud.identity_user:
    cloud: atmosphere
    name: "openstack-exporter-{{ openstack_helm_endpoints_region_name }}"
    password: "{{ openstack_helm_endpoints_openstack_exporter_keystone_password }}"
    domain: service
    default_project: service
    state: present

# Grant the "openstack-exporter-<region>" account the "admin" role on the
# "service" project (user and project are looked up in the "service" domain).
- name: Assign admin role to service user
  openstack.cloud.role_assignment:
    cloud: atmosphere
    role: admin
    user: "openstack-exporter-{{ openstack_helm_endpoints_region_name }}"
    project: service
    domain: service

# Apply every Kubernetes object the exporter needs in a single k8s call:
#   * Secret "openstack-config"        - clouds.yaml with the exporter credentials
#   * Deployment "openstack-exporter"  - the exporter pod itself
#   * Service "openstack-exporter"     - headless service exposing the metrics port
#   * ServiceMonitor                   - scrape configuration for that service
#   * PrometheusRule                   - cinder / neutron / nova alerting rules
# All objects live in the "monitoring" namespace.
- name: Deploy service
  kubernetes.core.k8s:
    state: present
    definition:
      # NOTE(review): the password and region name below are rendered unquoted
      # into clouds.yaml; a value containing YAML special characters would
      # presumably be mis-parsed by the consumer - consider `| to_json`.
      - apiVersion: v1
        kind: Secret
        metadata:
          name: openstack-config
          namespace: monitoring
        type: Opaque
        stringData:
          clouds.yaml: |
            clouds:
              openstack:
                auth:
                  auth_url: http://keystone-api.openstack.svc.cluster.local:5000
                  project_domain_name: service
                  project_name: service
                  user_domain_name: service
                  username: openstack-exporter-{{ openstack_helm_endpoints_region_name }}
                  password: {{ openstack_helm_endpoints_openstack_exporter_keystone_password }}
                region_name: {{ openstack_helm_endpoints_region_name }}
                interface: internal
                identity_api_version: 3
                identity_interface: internal

      - apiVersion: apps/v1
        kind: Deployment
        metadata:
          name: openstack-exporter
          namespace: monitoring
          labels:
            application: openstack-exporter
        spec:
          replicas: 1
          selector:
            matchLabels:
              application: openstack-exporter
          template:
            metadata:
              labels:
                application: openstack-exporter
            spec:
              nodeSelector:
                openstack-control-plane: enabled
              containers:
                - name: openstack-exporter
                  image: "{{ openstack_exporter_image_repository }}/openstack-exporter:{{ openstack_exporter_image_tag }}"
                  args:
                    - --endpoint-type
                    - internal
                    - default
                    - --collect-metric-time
                    - --disable-service.identity
                    - --disable-service.image
                    - --disable-metric=cinder-limits_volume_max_gb
                    - --disable-metric=cinder-limits_volume_used_gb
                    - --disable-metric=cinder-volumes
                    - --disable-metric=cinder-volume_status
                    - --disable-metric=neutron-floating_ips
                    - --disable-metric=neutron-networks
                    - --disable-metric=neutron-security_groups
                    - --disable-metric=neutron-subnets
                    - --disable-metric=neutron-routers
                    - --disable-metric=nova-flavors
                    - --disable-metric=nova-availability_zones
                    - --disable-metric=nova-security_groups
                    - --disable-metric=nova-limits_vcpus_max
                    - --disable-metric=nova-limits_vcpus_used
                    - --disable-metric=nova-limits_memory_max
                    - --disable-metric=nova-limits_memory_used
                  # The container field is `ports` (a list of ContainerPort
                  # entries). The Service below targets this port by its
                  # name, "metrics", so it has to be declared here.
                  ports:
                    - name: metrics
                      containerPort: 9180
                  volumeMounts:
                    - name: openstack-config
                      mountPath: "/etc/openstack"
              volumes:
                - name: openstack-config
                  secret:
                    secretName: openstack-config

      # Headless service (clusterIP: None) forwarding 9180 to the container
      # port named "metrics".
      - apiVersion: v1
        kind: Service
        metadata:
          name: openstack-exporter
          namespace: monitoring
          labels:
            application: openstack-exporter
        spec:
          clusterIP: None
          ports:
            - name: metrics
              port: 9180
              targetPort: metrics
          selector:
            application: openstack-exporter

      # Scrape the "metrics" port once a minute; every target's `instance`
      # label is rewritten to the constant "default".
      - apiVersion: monitoring.coreos.com/v1
        kind: ServiceMonitor
        metadata:
          name: openstack-exporter
          namespace: monitoring
          labels:
            application: openstack-exporter
        spec:
          endpoints:
            - interval: 1m
              scrapeTimeout: 30s
              port: metrics
              relabelings:
                - action: replace
                  regex: (.*)
                  replacement: default
                  targetLabel: instance
          jobLabel: jobLabel
          namespaceSelector:
            any: true
          selector:
            matchLabels:
              application: openstack-exporter

      # Alerting rules. The "{% raw %}" guards keep Ansible from templating
      # the Prometheus "{{ ... }}" placeholders. Each *AgentDown alert is
      # defined twice: a warning with no `for` clause and a critical one
      # that must hold for 5 minutes.
      - apiVersion: monitoring.coreos.com/v1
        kind: PrometheusRule
        metadata:
          name: openstack-exporter
          namespace: monitoring
          labels:
            application: openstack-exporter
        spec:
          groups:
            - name: cinder
              rules:
                - alert: CinderAgentDown
                  annotations:
                    description: |
                      '{% raw %}The service {{ $labels.exported_service }} running
                      on {{ $labels.hostname }} is being reported as down.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
                      down{% endraw %}'
                  expr: |
                    openstack_cinder_agent_state != 1
                  labels:
                    severity: warning
                - alert: CinderAgentDown
                  annotations:
                    description: |
                      '{% raw %}The service {{ $labels.exported_service }} running on
                      {{ $labels.hostname }} is being reported as down for 5 minutes.
                      This can affect volume operations so it must be resolved as
                      quickly as possible.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
                      down{% endraw %}'
                  expr: |
                    openstack_cinder_agent_state != 1
                  for: 5m
                  labels:
                    severity: critical
                - alert: CinderAgentDisabled
                  annotations:
                    description: |
                      '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
                      has been disabled for 60 minutes. This can affect volume operations so it must be
                      resolved as quickly as possible.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
                      disabled{% endraw %}'
                  expr: |
                    openstack_cinder_agent_state{adminState!="enabled"}
                  for: 1h
                  labels:
                    severity: warning
                - alert: CinderVolumeInError
                  annotations:
                    description: |
                      '{% raw %}The volume {{ $labels.id }} has been in ERROR state for over 24 hours.
                      It must be cleaned up or removed in order to provide a consistent customer
                      experience.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.id }}] Volume in ERROR state{% endraw %}'
                  expr: |
                    openstack_cinder_volume_status{status=~"error.*"}
                  for: 24h
                  labels:
                    severity: warning
            - name: neutron
              rules:
                - alert: NeutronAgentDown
                  annotations:
                    description: |
                      '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
                      is being reported as down.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
                      down{% endraw %}'
                  expr: |
                    openstack_neutron_agent_state != 1
                  labels:
                    severity: warning
                - alert: NeutronAgentDown
                  annotations:
                    description: |
                      '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
                      is being reported as down for 5 minutes. This can affect network operations so it
                      must be resolved as quickly as possible.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
                      down{% endraw %}'
                  expr: |
                    openstack_neutron_agent_state != 1
                  for: 5m
                  labels:
                    severity: critical
                - alert: NeutronAgentDisabled
                  annotations:
                    description: |
                      '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
                      has been disabled for 60 minutes. This can affect network operations so it must be
                      resolved as quickly as possible.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
                      disabled{% endraw %}'
                  expr: |
                    openstack_neutron_agent_state{adminState!="up"}
                  for: 1h
                  labels:
                    severity: warning
                - alert: NeutronBindingFailedPorts
                  annotations:
                    description: |
                      '{% raw %}The NIC {{ $labels.mac_address }} of {{ $labels.device_owner }}
                      has binding failed port now.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.device_owner }}] {{ $labels.mac_address }}
                      binding failed{% endraw %}'
                  expr: |
                    openstack_neutron_port{binding_vif_type="binding_failed"} != 0
                  labels:
                    severity: warning
                # NOTE(review): the expression aggregates by network_id only,
                # so the subnet_name / network_name labels used in the
                # annotations are presumably absent from the result - confirm.
                - alert: NeutronNetworkOutOfIPs
                  annotations:
                    description: |
                      '{% raw %}The subnet {{ $labels.subnet_name }} within {{ $labels.network_name }}
                      is currently at {{ $value }}% utilization. If the IP addresses run out, it will
                      impact the provisioning of new ports.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.network_name }}] {{ $labels.subnet_name }}
                      running out of IPs{% endraw %}'
                  expr: |
                    sum by (network_id) (openstack_neutron_network_ip_availabilities_used{project_id!=""}) / sum by (network_id)
                    (openstack_neutron_network_ip_availabilities_total{project_id!=""}) * 100 > 80
                  labels:
                    severity: warning
            - name: nova
              rules:
                - alert: NovaAgentDown
                  annotations:
                    description: |
                      '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
                      is being reported as down.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
                      down{% endraw %}'
                  expr: |
                    openstack_nova_agent_state != 1
                  labels:
                    severity: warning
                - alert: NovaAgentDown
                  annotations:
                    description: |
                      '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} is
                      being reported as down. This can affect compute operations so it must be resolved as
                      quickly as possible.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
                      down{% endraw %}'
                  expr: |
                    openstack_nova_agent_state != 1
                  for: 5m
                  labels:
                    severity: critical
                - alert: NovaAgentDisabled
                  annotations:
                    description: |
                      '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} has been
                      disabled for 60 minutes. This can affect compute operations so it must be resolved as quickly
                      as possible.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
                      disabled{% endraw %}'
                  expr: |
                    openstack_nova_agent_state{adminState!="enabled"}
                  for: 1h
                  labels:
                    severity: warning
                - alert: NovaInstanceInError
                  annotations:
                    description: |
                      '{% raw %}The instance {{ $labels.id }} has been in ERROR state for over 24 hours.
                      It must be cleaned up or removed in order to provide a consistent customer
                      experience.{% endraw %}'
                    summary: '{% raw %}[{{ $labels.id }}] Instance in ERROR state{% endraw %}'
                  expr: |
                    openstack_nova_server_status{status="ERROR"}
                  for: 24h
                  labels:
                    severity: warning
                # NOTE(review): the ratio is scaled to a percentage (* 100)
                # but compared against 0.25 - confirm whether 25 was intended.
                - alert: NovaFailureRisk
                  annotations:
                    description: |
                      '{% raw %}The cloud capacity will be at {{ $value }} in the event of the failure of
                      a single hypervisor which puts the cloud at risk of not being able to recover should
                      any hypervisor failures occur. Please ensure that adequate amount of infrastructure
                      is assigned to this deployment to prevent this.{% endraw %}'
                    summary: '{% raw %}[nova] Failure risk{% endraw %}'
                  expr: |
                    (sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) - max(openstack_nova_memory_used_bytes))
                    / sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) * 100 < 0.25
                  for: 6h
                  labels:
                    severity: warning
                # Adding `0 * agent_state{...adminState="enabled"}` joined on
                # hostname restricts both sums to hosts that have an enabled
                # nova-compute agent series, without changing the values.
                - alert: NovaCapacity
                  annotations:
                    description: |
                      '{% raw %}The cloud capacity is currently at `{{ $value }}` which means there is a
                      risk of running out of capacity due to the timeline required to add new nodes.
                      Please ensure that adequate amount of infrastructure is assigned to this deployment
                      to prevent this.{% endraw %}'
                    summary: '{% raw %}[nova] Capacity risk{% endraw %}'
                  expr: |
                    sum (
                        openstack_nova_memory_used_bytes
                      + on(hostname) group_left(adminState)
                        (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"})
                    ) / sum (
                        openstack_nova_memory_available_bytes
                      + on(hostname) group_left(adminState)
                        (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"})
                    ) * 100 > 75
                  for: 6h
                  labels:
                    severity: warning
  # NOTE(mnaser): Since we haven't moved to the operator pattern yet, we need to
  #               keep retrying a few times as the CRDs might not be installed
  #               yet.
  retries: 60
  delay: 5
  register: _result
  until: _result is not failed
382 until: _result is not failed