Monitoring: add openstack exporter

Sem-Ver: feature
Depends-On: Id868ea6c1d14de8425944244d1243384acd143be
Change-Id: I20a3f92ba3fe264ebe200f510110278628f3add9
diff --git a/doc/source/roles/openstack_exporter/index.rst b/doc/source/roles/openstack_exporter/index.rst
new file mode 100644
index 0000000..9cedd84
--- /dev/null
+++ b/doc/source/roles/openstack_exporter/index.rst
@@ -0,0 +1,10 @@
+.. Copyright (C) 2022 VEXXHOST, Inc.
+.. SPDX-License-Identifier: Apache-2.0
+
+``openstack_exporter``
+======================
+
+.. toctree::
+   :maxdepth: 2
+
+   defaults/main
\ No newline at end of file
diff --git a/playbooks/openstack.yml b/playbooks/openstack.yml
index aff12a8..2f00a3c 100644
--- a/playbooks/openstack.yml
+++ b/playbooks/openstack.yml
@@ -120,6 +120,10 @@
       tags:
         - openstack-helm-horizon
 
+    - role: openstack_exporter
+      tags:
+        - openstack-exporter
+
 - hosts: controllers
   gather_facts: false
   roles:
diff --git a/releasenotes/notes/add-openstack-exporter-role-f87a6a6f90a0f236.yaml b/releasenotes/notes/add-openstack-exporter-role-f87a6a6f90a0f236.yaml
new file mode 100644
index 0000000..78c5ee9
--- /dev/null
+++ b/releasenotes/notes/add-openstack-exporter-role-f87a6a6f90a0f236.yaml
@@ -0,0 +1,3 @@
+---
+features:
+  - Added ``openstack-exporter`` with alertings.
\ No newline at end of file
diff --git a/roles/openstack_exporter/defaults/main.yml b/roles/openstack_exporter/defaults/main.yml
new file mode 100644
index 0000000..26435fc
--- /dev/null
+++ b/roles/openstack_exporter/defaults/main.yml
@@ -0,0 +1,24 @@
+---
+# .. vim: foldmarker=[[[,]]]:foldmethod=marker
+
+# .. Copyright (C) 2022 VEXXHOST, Inc.
+# .. SPDX-License-Identifier: Apache-2.0
+
+# Default variables
+# =================
+
+# .. contents:: Sections
+#    :local:
+
+# .. envvar:: openstack_exporter_image_repository [[[
+#
+# OpenStack-exporter container image repository location
+openstack_exporter_image_repository: "quay.io/niedbalski"
+
+                                                                   # ]]]
+# .. envvar:: openstack_exporter_image_tag [[[
+#
+# openstack-exporter container image tag
+openstack_exporter_image_tag: v1.6.0
+
+                                                                   # ]]]
diff --git a/roles/openstack_exporter/meta/main.yml b/roles/openstack_exporter/meta/main.yml
new file mode 100644
index 0000000..4062e84
--- /dev/null
+++ b/roles/openstack_exporter/meta/main.yml
@@ -0,0 +1,23 @@
+# Copyright (c) 2022 VEXXHOST, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+galaxy_info:
+  author: VEXXHOST, Inc.
+  description: Ansible role for OpenStack exporter
+  license: Apache-2.0
+  min_ansible_version: 5.5.0
+  platforms:
+    - name: Ubuntu
+      versions:
+        - focal
diff --git a/roles/openstack_exporter/tasks/main.yml b/roles/openstack_exporter/tasks/main.yml
new file mode 100644
index 0000000..18cc16e
--- /dev/null
+++ b/roles/openstack_exporter/tasks/main.yml
@@ -0,0 +1,391 @@
+# Copyright (c) 2022 VEXXHOST, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+- name: Create keystone user
+  openstack.cloud.identity_user:
+    cloud: atmosphere
+    state: present
+    name: openstack-exporter-{{ openstack_helm_endpoints_region_name }}
+    password: "{{ openstack_helm_endpoints_openstack_exporter_keystone_password }}"
+    domain: service
+    default_project: service
+
+- name: Assign admin role to service user
+  openstack.cloud.role_assignment:
+    cloud: atmosphere
+    user: openstack-exporter-{{ openstack_helm_endpoints_region_name }}
+    role: admin
+    project: service
+    domain: service
+
+- name: Create Secret
+  kubernetes.core.k8s:
+    state: present
+    definition:
+      apiVersion: v1
+      kind: Secret
+      metadata:
+        name: openstack-config
+        namespace: monitoring
+      type: Opaque
+      stringData:
+        clouds.yaml: |
+          clouds:
+            openstack:
+              auth:
+                auth_url: http://keystone-api.openstack.svc.cluster.local:5000
+                project_domain_name: service
+                project_name: service
+                user_domain_name: service
+                username: openstack-exporter-{{ openstack_helm_endpoints_region_name }}
+                password: {{ openstack_helm_endpoints_openstack_exporter_keystone_password }}
+              region_name: {{ openstack_helm_endpoints_region_name }}
+              interface: internal
+              identity_api_version: 3
+              identity_interface: internal
+
+- name: Create deploy
+  kubernetes.core.k8s:
+    state: present
+    definition:
+      apiVersion: apps/v1
+      kind: Deployment
+      metadata:
+        name: openstack-exporter
+        namespace: monitoring
+        labels:
+          application: openstack-exporter
+      spec:
+        replicas: 1
+        selector:
+          matchLabels:
+            application: openstack-exporter
+        template:
+          metadata:
+            labels:
+              application: openstack-exporter
+          spec:
+            nodeSelector:
+              openstack-control-plane: enabled
+            containers:
+              - name: openstack-exporter
+                image: "{{ openstack_exporter_image_repository }}/openstack-exporter-linux-amd64:{{ openstack_exporter_image_tag }}"
+                args:
+                  - --endpoint-type
+                  - internal
+                  - default
+                  - --collect-metric-time
+                  - --disable-service.identity
+                  - --disable-service.image
+                  - --disable-metric=cinder-limits_volume_max_gb
+                  - --disable-metric=cinder-limits_volume_used_gb
+                  - --disable-metric=cinder-volumes
+                  - --disable-metric=cinder-volume_status
+                  - --disable-metric=neutron-floating_ips
+                  - --disable-metric=neutron-networks
+                  - --disable-metric=neutron-security_groups
+                  - --disable-metric=neutron-subnets
+                  - --disable-metric=neutron-routers
+                  - --disable-metric=nova-flavors
+                  - --disable-metric=nova-availability_zones
+                  - --disable-metric=nova-security_groups
+                  - --disable-metric=nova-limits_vcpus_max
+                  - --disable-metric=nova-limits_vcpus_used
+                  - --disable-metric=nova-limits_memory_max
+                  - --disable-metric=nova-limits_memory_used
+                port:
+                  name: metrics
+                  containerPort: 9180
+                volumeMounts:
+                  - name: openstack-config
+                    mountPath: "/etc/openstack"
+            volumes:
+              - name: openstack-config
+                secret:
+                  secretName: openstack-config
+
+- name: Create service
+  kubernetes.core.k8s:
+    state: present
+    definition:
+      apiVersion: v1
+      kind: Service
+      metadata:
+        name: openstack-exporter
+        namespace: monitoring
+        labels:
+          application: openstack-exporter
+      spec:
+        clusterIP: None
+        ports:
+          - name: metrics
+            port: 9180
+            targetPort: metrics
+        selector:
+          application: openstack-exporter
+
+- name: Create service monitor
+  kubernetes.core.k8s:
+    state: present
+    definition:
+      apiVersion: monitoring.coreos.com/v1
+      kind: ServiceMonitor
+      metadata:
+        name: openstack-exporter
+        namespace: monitoring
+        labels:
+          application: openstack-exporter
+      spec:
+        endpoints:
+          - interval: 1m
+            scrapeTimeout: 30s
+            port: metrics
+            relabelings:
+              - action: replace
+                regex: (.*)
+                replacement: default
+                targetLabel: instance
+        jobLabel: jobLabel
+        namespaceSelector:
+          any: true
+        selector:
+          matchLabels:
+            application: openstack-exporter
+
+- name: Create Prometheus Rule
+  kubernetes.core.k8s:
+    state: present
+    definition:
+      apiVersion: monitoring.coreos.com/v1
+      kind: PrometheusRule
+      metadata:
+        name: openstack-exporter
+        namespace: monitoring
+        labels:
+          application: openstack-exporter
+      spec:
+        groups:
+          - name: cinder
+            rules:
+              - alert: CinderAgentDown
+                annotations:
+                  description: |
+                    '{% raw %}The service {{ $labels.exported_service }} running
+                    on {{ $labels.hostname }} is being reported as down.{% endraw %}'
+                  summary: |
+                    '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+                    down{% endraw %}'
+                expr: |
+                  openstack_cinder_agent_state != 1
+                labels:
+                  severity: warning
+              - alert: CinderAgentDown
+                annotations:
+                  description: |
+                    '{% raw %}The service {{ $labels.exported_service }} running on
+                    {{ $labels.hostname }} is being reported as down for 5 minutes.
+                    This can affect volume operations so it must be resolved as
+                    quickly as possible.{% endraw %}'
+                  summary: |
+                    '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+                    down{% endraw %}'
+                expr: |
+                  openstack_cinder_agent_state != 1
+                for: 5m
+                labels:
+                  severity: critical
+              - alert: CinderAgentDisabled
+                annotations:
+                  description: |
+                    '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
+                    has been disabled for 60 minutes.  This can affect volume operations so it must be
+                    resolved as quickly as possible.{% endraw %}'
+                  summary: |
+                    '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+                    disabled{% endraw %}'
+                expr: |
+                  openstack_cinder_agent_state{adminState!="enabled"}
+                for: 1h
+                labels:
+                  severity: warning
+              - alert: CinderVolumeInError
+                annotations:
+                  description: |
+                    '{% raw %}The volume {{ $labels.id }} has been in ERROR state for over 24 hours.
+                    It must be cleaned up or removed in order to provide a consistent customer
+                    experience.{% endraw %}'
+                  summary: |
+                    '{% raw %}[{{ $labels.id }}] Volume in ERROR state{% endraw %}'
+                expr: |
+                  openstack_cinder_volume_status{status=~"error.*"}
+                for: 24h
+                labels:
+                  severity: warning
+          - name: neutron
+            rules:
+              - alert: NeutronAgentDown
+                annotations:
+                  description: |
+                    '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
+                    is being reported as down.{% endraw %}'
+                  summary: |
+                    '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+                    down{% endraw %}'
+                expr: |
+                  openstack_neutron_agent_state != 1
+                labels:
+                  severity: warning
+              - alert: NeutronAgentDown
+                annotations:
+                  description: |
+                    '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
+                    is being reported as down for 5 minutes. This can affect network operations so it
+                    must be resolved as quickly as possible.{% endraw %}'
+                  summary: |
+                    '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+                    down{% endraw %}'
+                expr: |
+                  openstack_neutron_agent_state != 1
+                for: 5m
+                labels:
+                  severity: critical
+              - alert: NeutronAgentDisabled
+                annotations:
+                  description: |
+                    '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
+                    has been disabled for 60 minutes.  This can affect network operations so it must be
+                    resolved as quickly as possible.{% endraw %}'
+                  summary: |
+                    '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+                    disabled{% endraw %}'
+                expr: |
+                  openstack_neutron_agent_state{adminState!="up"}
+                for: 1h
+                labels:
+                  severity: warning
+              - alert: NeutronBindingFailedPorts
+                annotations:
+                  description: |
+                    '{% raw %}The NIC {{ $labels.mac_address }} of {{ $labels.device_owner }}
+                    has binding failed port now.{% endraw %}'
+                  summary: |
+                    '{% raw %}[{{ $labels.device_owner }}] {{ $labels.mac_address }}
+                    binding failed{% endraw %}'
+                expr: |
+                  openstack_neutron_port{binding_vif_type="binding_failed"} != 0
+                labels:
+                  severity: warning
+              - alert: NeutronNetworkOutOfIPs
+                annotations:
+                  description: |
+                    '{% raw %}The subnet {{ $labels.subnet_name }} within {{ $labels.network_name }}
+                    is currently at {{ $value }}% utilization.  If the IP addresses run out, it will
+                    impact the provisioning of new ports.{% endraw %}'
+                  summary: |
+                    '{% raw %}[{{ $labels.network_name }}] {{ $labels.subnet_name }}
+                    running out of IPs{% endraw %}'
+                expr: |
+                  sum by (network_id) (openstack_neutron_network_ip_availabilities_used{project_id!=""}) / sum by (network_id)
+                  (openstack_neutron_network_ip_availabilities_total{project_id!=""}) * 100 > 80
+                labels:
+                  severity: warning
+          - name: nova
+            rules:
+              - alert: NovaAgentDown
+                annotations:
+                  description: |
+                    '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
+                    is being reported as down.{% endraw %}'
+                  summary: |
+                    '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+                    down{% endraw %}'
+                expr: |
+                  openstack_nova_agent_state != 1
+                labels:
+                  severity: warning
+              - alert: NovaAgentDown
+                annotations:
+                  description: |
+                    '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} is
+                    being reported as down.  This can affect compute operations so it must be resolved as
+                    quickly as possible.{% endraw %}'
+                  summary: |
+                    '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+                    down{% endraw %}'
+                expr: |
+                  openstack_nova_agent_state != 1
+                for: 5m
+                labels:
+                  severity: critical
+              - alert: NovaAgentDisabled
+                annotations:
+                  description: |
+                    '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} has been
+                    disabled for 60 minutes.  This can affect compute operations so it must be resolved as quickly
+                    as possible.{% endraw %}'
+                  summary: |
+                    '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+                    disabled{% endraw %}'
+                expr: |
+                  openstack_nova_agent_state{adminState!="enabled"}
+                for: 1h
+                labels:
+                  severity: warning
+              - alert: NovaInstanceInError
+                annotations:
+                  description: |
+                    '{% raw %}The instance {{ $labels.id }} has been in ERROR state for over 24 hours.
+                    It must be cleaned up or removed in order to provide a consistent customer
+                    experience.{% endraw %}'
+                  summary: '{% raw %}[{{ $labels.id }}] Instance in ERROR state{% endraw %}'
+                expr: |
+                  openstack_nova_server_status{status="ERROR"}
+                for: 24h
+                labels:
+                  severity: warning
+              - alert: NovaFailureRisk
+                annotations:
+                  description: |
+                    '{% raw %}The cloud capacity will be at {{ $value }} in the event of the failure of
+                    a single hypervisor which puts the cloud at risk of not being able to recover should
+                    any hypervisor failures occur.  Please ensure that adequate amount of infrastructure
+                    is assigned to this deployment to prevent this.{% endraw %}'
+                  summary: '{% raw %}[nova] Failure risk{% endraw %}'
+                expr: |
+                  (sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) - max(openstack_nova_memory_used_bytes))
+                  / sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) * 100 < 0.25
+                for: 6h
+                labels:
+                  severity: warning
+              - alert: NovaCapacity
+                annotations:
+                  description: |
+                    '{% raw %}The cloud capacity is currently at `{{ $value }}` which means there is a
+                    risk of running out of capacity due to the timeline required to add new nodes.
+                    Please ensure that adequate amount of infrastructure is assigned to this deployment
+                    to prevent this.{% endraw %}'
+                  summary: '{% raw %}[nova] Capacity risk{% endraw %}'
+                expr: |
+                  sum (
+                      openstack_nova_memory_used_bytes
+                    + on(hostname) group_left(adminState)
+                      (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"})
+                  ) / sum (
+                      openstack_nova_memory_available_bytes
+                    + on(hostname) group_left(adminState)
+                      (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"})
+                  ) * 100 > 75
+                for: 6h
+                labels:
+                  severity: warning
diff --git a/roles/openstack_helm_endpoints/defaults/main.yml b/roles/openstack_helm_endpoints/defaults/main.yml
index 7e5987c..b25fe5d 100644
--- a/roles/openstack_helm_endpoints/defaults/main.yml
+++ b/roles/openstack_helm_endpoints/defaults/main.yml
@@ -443,3 +443,9 @@
 openstack_helm_endpoints_tempest_keystone_password: "{{ undef(hint='You must specify a Tempest Keystone password') }}"
 
                                                                    # ]]]
+# .. envvar:: openstack_helm_endpoints_openstack_exporter_keystone_password [[[
+#
+# Keystone password for service
+openstack_helm_endpoints_openstack_exporter_keystone_password: "{{ undef(hint='You must specify a OpenStack Exporter Keystone password') }}"
+
+                                                                   # ]]]