guilhermesteinmuller | 86a88b6 | 2022-05-27 16:45:49 +0000 | [diff] [blame] | 1 | # Copyright (c) 2022 VEXXHOST, Inc. |
| 2 | # |
| 3 | # Licensed under the Apache License, Version 2.0 (the "License"); you may |
| 4 | # not use this file except in compliance with the License. You may obtain |
| 5 | # a copy of the License at |
| 6 | # |
| 7 | # http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | # |
| 9 | # Unless required by applicable law or agreed to in writing, software |
| 10 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
| 11 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
| 12 | # License for the specific language governing permissions and limitations |
| 13 | # under the License. |
| 14 | |
# Ensure the per-region Keystone account used by the exporter exists in the
# "service" domain, with "service" as its default project.
- name: Create keystone user
  openstack.cloud.identity_user:
    cloud: atmosphere
    name: "openstack-exporter-{{ openstack_helm_endpoints_region_name }}"
    password: "{{ openstack_helm_endpoints_openstack_exporter_keystone_password }}"
    domain: service
    default_project: service
    state: present
| 23 | |
# Give the exporter's Keystone user the "admin" role, scoped to the "service"
# project in the "service" domain.
- name: Assign admin role to service user
  openstack.cloud.role_assignment:
    cloud: atmosphere
    role: admin
    user: "openstack-exporter-{{ openstack_helm_endpoints_region_name }}"
    domain: service
    project: service
| 31 | |
# Apply every Kubernetes object the exporter needs in one kubernetes.core.k8s
# call: the clouds.yaml Secret, the Deployment, a headless Service, and the
# Prometheus Operator ServiceMonitor / PrometheusRule.
- name: Deploy service
  kubernetes.core.k8s:
    state: present
    definition:
      - apiVersion: v1
        kind: Secret
        metadata:
          name: openstack-config
          namespace: monitoring
        type: Opaque
        stringData:
          # The cloud entry is named "default" because the exporter container
          # below is started with "default" as its cloud argument. The password
          # goes through to_json so that it is emitted as a quoted scalar and
          # special characters cannot corrupt the rendered clouds.yaml.
          clouds.yaml: |
            clouds:
              default:
                auth:
                  auth_url: http://keystone-api.openstack.svc.cluster.local:5000
                  project_domain_name: service
                  project_name: service
                  user_domain_name: service
                  username: openstack-exporter-{{ openstack_helm_endpoints_region_name }}
                  password: {{ openstack_helm_endpoints_openstack_exporter_keystone_password | to_json }}
                region_name: {{ openstack_helm_endpoints_region_name }}
                interface: internal
                identity_api_version: 3
                identity_interface: internal

      - apiVersion: apps/v1
        kind: Deployment
        metadata:
          name: openstack-exporter
          namespace: monitoring
          labels:
            application: openstack-exporter
        spec:
          replicas: 1
          selector:
            matchLabels:
              application: openstack-exporter
          template:
            metadata:
              labels:
                application: openstack-exporter
            spec:
              nodeSelector:
                openstack-control-plane: enabled
              containers:
                - name: openstack-exporter
                  image: "{{ openstack_exporter_image_repository }}/openstack-exporter:{{ openstack_exporter_image_tag }}"
                  args:
                    - --endpoint-type
                    - internal
                    # Cloud name; must match an entry in the mounted clouds.yaml.
                    - default
                    - --collect-metric-time
                    - --disable-service.identity
                    - --disable-service.image
                    - --disable-metric=cinder-limits_volume_max_gb
                    - --disable-metric=cinder-limits_volume_used_gb
                    - --disable-metric=cinder-volumes
                    - --disable-metric=cinder-volume_status
                    - --disable-metric=neutron-floating_ips
                    - --disable-metric=neutron-networks
                    - --disable-metric=neutron-security_groups
                    - --disable-metric=neutron-subnets
                    - --disable-metric=neutron-routers
                    - --disable-metric=nova-flavors
                    - --disable-metric=nova-availability_zones
                    - --disable-metric=nova-security_groups
                    - --disable-metric=nova-limits_vcpus_max
                    - --disable-metric=nova-limits_vcpus_used
                    - --disable-metric=nova-limits_memory_max
                    - --disable-metric=nova-limits_memory_used
                  # "ports" is a list in the container spec; the named port is
                  # what the Service's targetPort and the ServiceMonitor refer to.
                  ports:
                    - name: metrics
                      containerPort: 9180
                  volumeMounts:
                    - name: openstack-config
                      mountPath: "/etc/openstack"
              volumes:
                - name: openstack-config
                  secret:
                    secretName: openstack-config

      # Headless Service (clusterIP: None) selecting the exporter pods.
      - apiVersion: v1
        kind: Service
        metadata:
          name: openstack-exporter
          namespace: monitoring
          labels:
            application: openstack-exporter
        spec:
          clusterIP: None
          ports:
            - name: metrics
              port: 9180
              targetPort: metrics
          selector:
            application: openstack-exporter

      - apiVersion: monitoring.coreos.com/v1
        kind: ServiceMonitor
        metadata:
          name: openstack-exporter
          namespace: monitoring
          labels:
            application: openstack-exporter
        spec:
          endpoints:
            - interval: 1m
              scrapeTimeout: 30s
              port: metrics
              # Overwrite the "instance" label with the constant "default".
              relabelings:
                - action: replace
                  regex: (.*)
                  replacement: default
                  targetLabel: instance
          jobLabel: jobLabel
          namespaceSelector:
            any: true
          selector:
            matchLabels:
              application: openstack-exporter

      # Alerting rules. The annotations are wrapped in {% raw %} so that Ansible
      # leaves the Prometheus "{{ $labels... }}" templates untouched.
      - apiVersion: monitoring.coreos.com/v1
        kind: PrometheusRule
        metadata:
          name: openstack-exporter
          namespace: monitoring
          labels:
            application: openstack-exporter
        spec:
          groups:
            - name: cinder
              rules:
                - alert: CinderAgentDown
                  annotations:
                    description: |
                      '{% raw %}The service {{ $labels.exported_service }} running
                      on {{ $labels.hostname }} is being reported as down.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
                      down{% endraw %}'
                  expr: |
                    openstack_cinder_agent_state != 1
                  labels:
                    severity: warning
                - alert: CinderAgentDown
                  annotations:
                    description: |
                      '{% raw %}The service {{ $labels.exported_service }} running on
                      {{ $labels.hostname }} is being reported as down for 5 minutes.
                      This can affect volume operations so it must be resolved as
                      quickly as possible.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
                      down{% endraw %}'
                  expr: |
                    openstack_cinder_agent_state != 1
                  for: 5m
                  labels:
                    severity: critical
                - alert: CinderAgentDisabled
                  annotations:
                    description: |
                      '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
                      has been disabled for 60 minutes. This can affect volume operations so it must be
                      resolved as quickly as possible.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
                      disabled{% endraw %}'
                  expr: |
                    openstack_cinder_agent_state{adminState!="enabled"}
                  for: 1h
                  labels:
                    severity: warning
                # NOTE(review): the Deployment above passes
                # --disable-metric=cinder-volume_status; confirm the series
                # used here is still exported, otherwise this never fires.
                - alert: CinderVolumeInError
                  annotations:
                    description: |
                      '{% raw %}The volume {{ $labels.id }} has been in ERROR state for over 24 hours.
                      It must be cleaned up or removed in order to provide a consistent customer
                      experience.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.id }}] Volume in ERROR state{% endraw %}'
                  expr: |
                    openstack_cinder_volume_status{status=~"error.*"}
                  for: 24h
                  labels:
                    severity: warning
            - name: neutron
              rules:
                - alert: NeutronAgentDown
                  annotations:
                    description: |
                      '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
                      is being reported as down.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
                      down{% endraw %}'
                  expr: |
                    openstack_neutron_agent_state != 1
                  labels:
                    severity: warning
                - alert: NeutronAgentDown
                  annotations:
                    description: |
                      '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
                      is being reported as down for 5 minutes. This can affect network operations so it
                      must be resolved as quickly as possible.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
                      down{% endraw %}'
                  expr: |
                    openstack_neutron_agent_state != 1
                  for: 5m
                  labels:
                    severity: critical
                - alert: NeutronAgentDisabled
                  annotations:
                    description: |
                      '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
                      has been disabled for 60 minutes. This can affect network operations so it must be
                      resolved as quickly as possible.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
                      disabled{% endraw %}'
                  expr: |
                    openstack_neutron_agent_state{adminState!="up"}
                  for: 1h
                  labels:
                    severity: warning
                - alert: NeutronBindingFailedPorts
                  annotations:
                    description: |
                      '{% raw %}The NIC {{ $labels.mac_address }} of {{ $labels.device_owner }}
                      has binding failed port now.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.device_owner }}] {{ $labels.mac_address }}
                      binding failed{% endraw %}'
                  expr: |
                    openstack_neutron_port{binding_vif_type="binding_failed"} != 0
                  labels:
                    severity: warning
                # NOTE(review): "sum by (network_id)" keeps only network_id on
                # the result, so the subnet_name / network_name labels used in
                # the annotations are presumably empty -- confirm.
                - alert: NeutronNetworkOutOfIPs
                  annotations:
                    description: |
                      '{% raw %}The subnet {{ $labels.subnet_name }} within {{ $labels.network_name }}
                      is currently at {{ $value }}% utilization. If the IP addresses run out, it will
                      impact the provisioning of new ports.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.network_name }}] {{ $labels.subnet_name }}
                      running out of IPs{% endraw %}'
                  expr: |
                    sum by (network_id) (openstack_neutron_network_ip_availabilities_used{project_id!=""}) / sum by (network_id)
                    (openstack_neutron_network_ip_availabilities_total{project_id!=""}) * 100 > 80
                  labels:
                    severity: warning
            - name: nova
              rules:
                - alert: NovaAgentDown
                  annotations:
                    description: |
                      '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
                      is being reported as down.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
                      down{% endraw %}'
                  expr: |
                    openstack_nova_agent_state != 1
                  labels:
                    severity: warning
                - alert: NovaAgentDown
                  annotations:
                    description: |
                      '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} is
                      being reported as down. This can affect compute operations so it must be resolved as
                      quickly as possible.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
                      down{% endraw %}'
                  expr: |
                    openstack_nova_agent_state != 1
                  for: 5m
                  labels:
                    severity: critical
                - alert: NovaAgentDisabled
                  annotations:
                    description: |
                      '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} has been
                      disabled for 60 minutes. This can affect compute operations so it must be resolved as quickly
                      as possible.{% endraw %}'
                    summary: |
                      '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
                      disabled{% endraw %}'
                  expr: |
                    openstack_nova_agent_state{adminState!="enabled"}
                  for: 1h
                  labels:
                    severity: warning
                - alert: NovaInstanceInError
                  annotations:
                    description: |
                      '{% raw %}The instance {{ $labels.id }} has been in ERROR state for over 24 hours.
                      It must be cleaned up or removed in order to provide a consistent customer
                      experience.{% endraw %}'
                    summary: '{% raw %}[{{ $labels.id }}] Instance in ERROR state{% endraw %}'
                  expr: |
                    openstack_nova_server_status{status="ERROR"}
                  for: 24h
                  labels:
                    severity: warning
                - alert: NovaFailureRisk
                  annotations:
                    description: |
                      '{% raw %}The cloud capacity will be at {{ $value }} in the event of the failure of
                      a single hypervisor which puts the cloud at risk of not being able to recover should
                      any hypervisor failures occur. Please ensure that adequate amount of infrastructure
                      is assigned to this deployment to prevent this.{% endraw %}'
                    summary: '{% raw %}[nova] Failure risk{% endraw %}'
                  expr: |
                    (sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) - max(openstack_nova_memory_used_bytes))
                    / sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) * 100 < 0.25
                  for: 6h
                  labels:
                    severity: warning
                - alert: NovaCapacity
                  annotations:
                    description: |
                      '{% raw %}The cloud capacity is currently at `{{ $value }}` which means there is a
                      risk of running out of capacity due to the timeline required to add new nodes.
                      Please ensure that adequate amount of infrastructure is assigned to this deployment
                      to prevent this.{% endraw %}'
                    summary: '{% raw %}[nova] Capacity risk{% endraw %}'
                  # Adding "0 * agent_state{...adminState="enabled"}" joined on
                  # hostname leaves the memory values unchanged while dropping
                  # hosts that have no enabled nova-compute series.
                  expr: |
                    sum (
                        openstack_nova_memory_used_bytes
                      + on(hostname) group_left(adminState)
                        (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"})
                    ) / sum (
                        openstack_nova_memory_available_bytes
                      + on(hostname) group_left(adminState)
                        (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"})
                    ) * 100 > 75
                  for: 6h
                  labels:
                    severity: warning
  # NOTE(mnaser): Since we haven't moved to the operator pattern yet, we need to
  #               keep retrying a few times as the CRDs might not be installed
  #               yet.
  retries: 60
  delay: 5
  register: _result
  until: _result is not failed