Monitoring: add openstack exporter
Sem-Ver: feature
Depends-On: Id868ea6c1d14de8425944244d1243384acd143be
Change-Id: I20a3f92ba3fe264ebe200f510110278628f3add9
diff --git a/doc/source/roles/openstack_exporter/index.rst b/doc/source/roles/openstack_exporter/index.rst
new file mode 100644
index 0000000..9cedd84
--- /dev/null
+++ b/doc/source/roles/openstack_exporter/index.rst
@@ -0,0 +1,10 @@
+.. Copyright (C) 2022 VEXXHOST, Inc.
+.. SPDX-License-Identifier: Apache-2.0
+
+``openstack_exporter``
+======================
+
+.. toctree::
+ :maxdepth: 2
+
+ defaults/main
\ No newline at end of file
diff --git a/playbooks/openstack.yml b/playbooks/openstack.yml
index aff12a8..2f00a3c 100644
--- a/playbooks/openstack.yml
+++ b/playbooks/openstack.yml
@@ -120,6 +120,10 @@
tags:
- openstack-helm-horizon
+ - role: openstack_exporter
+ tags:
+ - openstack-exporter
+
- hosts: controllers
gather_facts: false
roles:
diff --git a/releasenotes/notes/add-openstack-exporter-role-f87a6a6f90a0f236.yaml b/releasenotes/notes/add-openstack-exporter-role-f87a6a6f90a0f236.yaml
new file mode 100644
index 0000000..78c5ee9
--- /dev/null
+++ b/releasenotes/notes/add-openstack-exporter-role-f87a6a6f90a0f236.yaml
@@ -0,0 +1,3 @@
+---
+features:
+ - Added the ``openstack_exporter`` role, which deploys openstack-exporter together with Prometheus alerting rules.
\ No newline at end of file
diff --git a/roles/openstack_exporter/defaults/main.yml b/roles/openstack_exporter/defaults/main.yml
new file mode 100644
index 0000000..26435fc
--- /dev/null
+++ b/roles/openstack_exporter/defaults/main.yml
@@ -0,0 +1,24 @@
+---
+# .. vim: foldmarker=[[[,]]]:foldmethod=marker
+
+# .. Copyright (C) 2022 VEXXHOST, Inc.
+# .. SPDX-License-Identifier: Apache-2.0
+
+# Default variables
+# =================
+
+# .. contents:: Sections
+# :local:
+
+# .. envvar:: openstack_exporter_image_repository [[[
+#
+# Container image repository (registry and namespace) that hosts the openstack-exporter image
+openstack_exporter_image_repository: "quay.io/niedbalski"
+
+ # ]]]
+# .. envvar:: openstack_exporter_image_tag [[[
+#
+# openstack-exporter container image tag
+openstack_exporter_image_tag: v1.6.0
+
+ # ]]]
diff --git a/roles/openstack_exporter/meta/main.yml b/roles/openstack_exporter/meta/main.yml
new file mode 100644
index 0000000..4062e84
--- /dev/null
+++ b/roles/openstack_exporter/meta/main.yml
@@ -0,0 +1,23 @@
+# Copyright (c) 2022 VEXXHOST, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# Ansible Galaxy metadata describing the openstack_exporter role.
+galaxy_info:
+  description: "Ansible role for OpenStack exporter"
+  author: "VEXXHOST, Inc."
+  license: "Apache-2.0"
+  min_ansible_version: "5.5.0"
+  platforms:
+    - name: "Ubuntu"
+      versions: ["focal"]
diff --git a/roles/openstack_exporter/tasks/main.yml b/roles/openstack_exporter/tasks/main.yml
new file mode 100644
index 0000000..18cc16e
--- /dev/null
+++ b/roles/openstack_exporter/tasks/main.yml
@@ -0,0 +1,391 @@
+# Copyright (c) 2022 VEXXHOST, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# Ensure the Keystone account used by the exporter exists in the "service"
+# domain. The user name is suffixed with the region name.
+- name: Create keystone user
+  openstack.cloud.identity_user:
+    cloud: atmosphere
+    name: "openstack-exporter-{{ openstack_helm_endpoints_region_name }}"
+    password: "{{ openstack_helm_endpoints_openstack_exporter_keystone_password }}"
+    domain: service
+    default_project: service
+    state: present
+
+# Grant the exporter account the "admin" role on the "service" project.
+- name: Assign admin role to service user
+  openstack.cloud.role_assignment:
+    cloud: atmosphere
+    role: admin
+    user: "openstack-exporter-{{ openstack_helm_endpoints_region_name }}"
+    domain: service
+    project: service
+
+# Store the clouds.yaml used by the exporter as a Secret in the "monitoring"
+# namespace. It defines a single cloud entry named "openstack" that
+# authenticates as the exporter's Keystone user against the internal endpoint.
+- name: Create Secret
+  kubernetes.core.k8s:
+    state: present
+    definition:
+      apiVersion: v1
+      kind: Secret
+      metadata:
+        name: openstack-config
+        namespace: monitoring
+      type: Opaque
+      stringData:
+        # The password goes through `to_json` so it is always rendered as a
+        # double-quoted string. Unquoted, a password containing ": ", " #", a
+        # leading YAML indicator, or only digits would produce a broken or
+        # mistyped clouds.yaml.
+        clouds.yaml: |
+          clouds:
+            openstack:
+              auth:
+                auth_url: http://keystone-api.openstack.svc.cluster.local:5000
+                project_domain_name: service
+                project_name: service
+                user_domain_name: service
+                username: openstack-exporter-{{ openstack_helm_endpoints_region_name }}
+                password: {{ openstack_helm_endpoints_openstack_exporter_keystone_password | to_json }}
+              region_name: {{ openstack_helm_endpoints_region_name }}
+              interface: internal
+              identity_api_version: 3
+              identity_interface: internal
+
+# Run a single openstack-exporter replica on a control-plane node. The
+# clouds.yaml from the "openstack-config" Secret is mounted at /etc/openstack.
+- name: Create deploy
+  kubernetes.core.k8s:
+    state: present
+    definition:
+      apiVersion: apps/v1
+      kind: Deployment
+      metadata:
+        name: openstack-exporter
+        namespace: monitoring
+        labels:
+          application: openstack-exporter
+      spec:
+        replicas: 1
+        selector:
+          matchLabels:
+            application: openstack-exporter
+        template:
+          metadata:
+            labels:
+              application: openstack-exporter
+          spec:
+            nodeSelector:
+              openstack-control-plane: enabled
+            containers:
+              - name: openstack-exporter
+                image: "{{ openstack_exporter_image_repository }}/openstack-exporter-linux-amd64:{{ openstack_exporter_image_tag }}"
+                args:
+                  - --endpoint-type
+                  - internal
+                  # Positional argument: the cloud entry to load from the
+                  # mounted clouds.yaml. It must match the key under
+                  # `clouds:` in the openstack-config Secret ("openstack").
+                  - openstack
+                  - --collect-metric-time
+                  - --disable-service.identity
+                  - --disable-service.image
+                  - --disable-metric=cinder-limits_volume_max_gb
+                  - --disable-metric=cinder-limits_volume_used_gb
+                  - --disable-metric=cinder-volumes
+                  - --disable-metric=cinder-volume_status
+                  - --disable-metric=neutron-floating_ips
+                  - --disable-metric=neutron-networks
+                  - --disable-metric=neutron-security_groups
+                  - --disable-metric=neutron-subnets
+                  - --disable-metric=neutron-routers
+                  - --disable-metric=nova-flavors
+                  - --disable-metric=nova-availability_zones
+                  - --disable-metric=nova-security_groups
+                  - --disable-metric=nova-limits_vcpus_max
+                  - --disable-metric=nova-limits_vcpus_used
+                  - --disable-metric=nova-limits_memory_max
+                  - --disable-metric=nova-limits_memory_used
+                # `ports` is a list in the container spec. The port is named
+                # "metrics" so the Service can target it by name.
+                ports:
+                  - name: metrics
+                    containerPort: 9180
+                volumeMounts:
+                  - name: openstack-config
+                    mountPath: "/etc/openstack"
+            volumes:
+              - name: openstack-config
+                secret:
+                  secretName: openstack-config
+
+# Headless Service (clusterIP: None) exposing the exporter's "metrics" port
+# for pods labelled application=openstack-exporter.
+- name: Create service
+  kubernetes.core.k8s:
+    state: present
+    definition:
+      kind: Service
+      apiVersion: v1
+      metadata:
+        namespace: monitoring
+        name: openstack-exporter
+        labels:
+          application: openstack-exporter
+      spec:
+        selector:
+          application: openstack-exporter
+        clusterIP: None
+        ports:
+          - name: metrics
+            targetPort: metrics
+            port: 9180
+
+# ServiceMonitor that scrapes the "metrics" port of Services labelled
+# application=openstack-exporter once a minute, rewriting the `instance`
+# label to the constant "default".
+- name: Create service monitor
+  kubernetes.core.k8s:
+    state: present
+    definition:
+      kind: ServiceMonitor
+      apiVersion: monitoring.coreos.com/v1
+      metadata:
+        namespace: monitoring
+        name: openstack-exporter
+        labels:
+          application: openstack-exporter
+      spec:
+        # NOTE(review): this names a Service label called "jobLabel"; the
+        # Service created by this role does not set one - confirm intent.
+        jobLabel: jobLabel
+        selector:
+          matchLabels:
+            application: openstack-exporter
+        namespaceSelector:
+          any: true
+        endpoints:
+          - port: metrics
+            interval: 1m
+            scrapeTimeout: 30s
+            relabelings:
+              - action: replace
+                regex: (.*)
+                targetLabel: instance
+                replacement: default
+
+# Alerting rules evaluated against the exporter's metrics. Annotation text is
+# wrapped in {% raw %} so that Ansible leaves the Prometheus `{{ ... }}`
+# templates untouched. Folded scalars (`>-`) keep each annotation on a single
+# line without surrounding quote characters or a trailing newline.
+- name: Create Prometheus Rule
+  kubernetes.core.k8s:
+    state: present
+    definition:
+      apiVersion: monitoring.coreos.com/v1
+      kind: PrometheusRule
+      metadata:
+        name: openstack-exporter
+        namespace: monitoring
+        labels:
+          application: openstack-exporter
+      spec:
+        groups:
+          - name: cinder
+            rules:
+              # Fires immediately (no `for`) as a warning.
+              - alert: CinderAgentDown
+                annotations:
+                  description: >-
+                    {% raw %}The service {{ $labels.exported_service }} running
+                    on {{ $labels.hostname }} is being reported as down.{% endraw %}
+                  summary: >-
+                    {% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+                    down{% endraw %}
+                expr: |
+                  openstack_cinder_agent_state != 1
+                labels:
+                  severity: warning
+              # Same condition, escalated to critical once it holds for 5m.
+              - alert: CinderAgentDown
+                annotations:
+                  description: >-
+                    {% raw %}The service {{ $labels.exported_service }} running on
+                    {{ $labels.hostname }} is being reported as down for 5 minutes.
+                    This can affect volume operations so it must be resolved as
+                    quickly as possible.{% endraw %}
+                  summary: >-
+                    {% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+                    down{% endraw %}
+                expr: |
+                  openstack_cinder_agent_state != 1
+                for: 5m
+                labels:
+                  severity: critical
+              - alert: CinderAgentDisabled
+                annotations:
+                  description: >-
+                    {% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
+                    has been disabled for 60 minutes. This can affect volume operations so it must be
+                    resolved as quickly as possible.{% endraw %}
+                  summary: >-
+                    {% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+                    disabled{% endraw %}
+                expr: |
+                  openstack_cinder_agent_state{adminState!="enabled"}
+                for: 1h
+                labels:
+                  severity: warning
+              - alert: CinderVolumeInError
+                annotations:
+                  description: >-
+                    {% raw %}The volume {{ $labels.id }} has been in ERROR state for over 24 hours.
+                    It must be cleaned up or removed in order to provide a consistent customer
+                    experience.{% endraw %}
+                  summary: >-
+                    {% raw %}[{{ $labels.id }}] Volume in ERROR state{% endraw %}
+                expr: |
+                  openstack_cinder_volume_status{status=~"error.*"}
+                for: 24h
+                labels:
+                  severity: warning
+          # Neutron alerts. Annotations use folded scalars (`>-`) so the
+          # rendered text is a single line without literal quote characters.
+          - name: neutron
+            rules:
+              # Fires immediately (no `for`) as a warning.
+              - alert: NeutronAgentDown
+                annotations:
+                  description: >-
+                    {% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
+                    is being reported as down.{% endraw %}
+                  summary: >-
+                    {% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+                    down{% endraw %}
+                expr: |
+                  openstack_neutron_agent_state != 1
+                labels:
+                  severity: warning
+              # Same condition, escalated to critical once it holds for 5m.
+              - alert: NeutronAgentDown
+                annotations:
+                  description: >-
+                    {% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
+                    is being reported as down for 5 minutes. This can affect network operations so it
+                    must be resolved as quickly as possible.{% endraw %}
+                  summary: >-
+                    {% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+                    down{% endraw %}
+                expr: |
+                  openstack_neutron_agent_state != 1
+                for: 5m
+                labels:
+                  severity: critical
+              - alert: NeutronAgentDisabled
+                annotations:
+                  description: >-
+                    {% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
+                    has been disabled for 60 minutes. This can affect network operations so it must be
+                    resolved as quickly as possible.{% endraw %}
+                  summary: >-
+                    {% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+                    disabled{% endraw %}
+                expr: |
+                  openstack_neutron_agent_state{adminState!="up"}
+                for: 1h
+                labels:
+                  severity: warning
+              - alert: NeutronBindingFailedPorts
+                annotations:
+                  description: >-
+                    {% raw %}The NIC {{ $labels.mac_address }} of {{ $labels.device_owner }}
+                    has binding failed port now.{% endraw %}
+                  summary: >-
+                    {% raw %}[{{ $labels.device_owner }}] {{ $labels.mac_address }}
+                    binding failed{% endraw %}
+                expr: |
+                  openstack_neutron_port{binding_vif_type="binding_failed"} != 0
+                labels:
+                  severity: warning
+              # The aggregation keeps network_name and subnet_name because the
+              # annotations below read them; labels not listed in `by (...)`
+              # are dropped from the result and would render empty.
+              - alert: NeutronNetworkOutOfIPs
+                annotations:
+                  description: >-
+                    {% raw %}The subnet {{ $labels.subnet_name }} within {{ $labels.network_name }}
+                    is currently at {{ $value }}% utilization. If the IP addresses run out, it will
+                    impact the provisioning of new ports.{% endraw %}
+                  summary: >-
+                    {% raw %}[{{ $labels.network_name }}] {{ $labels.subnet_name }}
+                    running out of IPs{% endraw %}
+                expr: |
+                  sum by (network_id, network_name, subnet_name)
+                    (openstack_neutron_network_ip_availabilities_used{project_id!=""})
+                  / sum by (network_id, network_name, subnet_name)
+                    (openstack_neutron_network_ip_availabilities_total{project_id!=""})
+                  * 100 > 80
+                labels:
+                  severity: warning
+          # Nova alerts. Annotations use folded scalars (`>-`) so the rendered
+          # text is a single line without literal quote characters.
+          - name: nova
+            rules:
+              # Fires immediately (no `for`) as a warning.
+              - alert: NovaAgentDown
+                annotations:
+                  description: >-
+                    {% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
+                    is being reported as down.{% endraw %}
+                  summary: >-
+                    {% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+                    down{% endraw %}
+                expr: |
+                  openstack_nova_agent_state != 1
+                labels:
+                  severity: warning
+              # Same condition, escalated to critical once it holds for 5m.
+              - alert: NovaAgentDown
+                annotations:
+                  description: >-
+                    {% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} is
+                    being reported as down for 5 minutes. This can affect compute operations so it must be
+                    resolved as quickly as possible.{% endraw %}
+                  summary: >-
+                    {% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+                    down{% endraw %}
+                expr: |
+                  openstack_nova_agent_state != 1
+                for: 5m
+                labels:
+                  severity: critical
+              - alert: NovaAgentDisabled
+                annotations:
+                  description: >-
+                    {% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} has been
+                    disabled for 60 minutes. This can affect compute operations so it must be resolved as quickly
+                    as possible.{% endraw %}
+                  summary: >-
+                    {% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+                    disabled{% endraw %}
+                expr: |
+                  openstack_nova_agent_state{adminState!="enabled"}
+                for: 1h
+                labels:
+                  severity: warning
+              - alert: NovaInstanceInError
+                annotations:
+                  description: >-
+                    {% raw %}The instance {{ $labels.id }} has been in ERROR state for over 24 hours.
+                    It must be cleaned up or removed in order to provide a consistent customer
+                    experience.{% endraw %}
+                  summary: '{% raw %}[{{ $labels.id }}] Instance in ERROR state{% endraw %}'
+                expr: |
+                  openstack_nova_server_status{status="ERROR"}
+                for: 24h
+                labels:
+                  severity: warning
+              - alert: NovaFailureRisk
+                annotations:
+                  description: >-
+                    {% raw %}The cloud capacity will be at {{ $value }} in the event of the failure of
+                    a single hypervisor which puts the cloud at risk of not being able to recover should
+                    any hypervisor failures occur. Please ensure that adequate amount of infrastructure
+                    is assigned to this deployment to prevent this.{% endraw %}
+                  summary: '{% raw %}[nova] Failure risk{% endraw %}'
+                # NOTE(review): the ratio is multiplied by 100 and then
+                # compared with 0.25, i.e. the alert fires below 0.25 percent.
+                # Confirm whether 25 percent (`< 25`) was intended.
+                expr: |
+                  (sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) - max(openstack_nova_memory_used_bytes))
+                  / sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) * 100 < 0.25
+                for: 6h
+                labels:
+                  severity: warning
+              # Memory utilisation in percent. Each operand is joined on
+              # hostname with the nova-compute agent_state series whose
+              # adminState is "enabled"; the `0 *` factor makes that join
+              # contribute nothing to the sum.
+              - alert: NovaCapacity
+                annotations:
+                  description: >-
+                    {% raw %}The cloud capacity is currently at `{{ $value }}` which means there is a
+                    risk of running out of capacity due to the timeline required to add new nodes.
+                    Please ensure that adequate amount of infrastructure is assigned to this deployment
+                    to prevent this.{% endraw %}
+                  summary: '{% raw %}[nova] Capacity risk{% endraw %}'
+                expr: |
+                  sum (
+                      openstack_nova_memory_used_bytes
+                    + on(hostname) group_left(adminState)
+                      (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"})
+                  ) / sum (
+                      openstack_nova_memory_available_bytes
+                    + on(hostname) group_left(adminState)
+                      (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"})
+                  ) * 100 > 75
+                for: 6h
+                labels:
+                  severity: warning
diff --git a/roles/openstack_helm_endpoints/defaults/main.yml b/roles/openstack_helm_endpoints/defaults/main.yml
index 7e5987c..b25fe5d 100644
--- a/roles/openstack_helm_endpoints/defaults/main.yml
+++ b/roles/openstack_helm_endpoints/defaults/main.yml
@@ -443,3 +443,9 @@
openstack_helm_endpoints_tempest_keystone_password: "{{ undef(hint='You must specify a Tempest Keystone password') }}"
# ]]]
+# .. envvar:: openstack_helm_endpoints_openstack_exporter_keystone_password [[[
+#
+# Keystone password for the OpenStack exporter service user. There is no
+# default: the value is ``undef`` with a hint, so it must be set explicitly.
+openstack_helm_endpoints_openstack_exporter_keystone_password: "{{ undef(hint='You must specify an OpenStack Exporter Keystone password') }}"
+
+ # ]]]