Add monitoring for stuck VMs (#1129)
If RabbitMQ becomes unresponsive, some messages may get stuck,
which can leave VMs stuck in transitional states such as
deleting or building.
This adds monitoring (the NovaServerTaskStateStuck alert) to detect when
any of those operations has been stuck in the cloud for more than one hour.
Depends-On: #1130
diff --git a/roles/defaults/vars/main.yml b/roles/defaults/vars/main.yml
index dec4745..bf6672b 100644
--- a/roles/defaults/vars/main.yml
+++ b/roles/defaults/vars/main.yml
@@ -174,7 +174,7 @@
prometheus_memcached_exporter: quay.io/prometheus/memcached-exporter:v0.10.0
prometheus_mysqld_exporter: quay.io/prometheus/mysqld-exporter:v0.14.0
prometheus_node_exporter: quay.io/prometheus/node-exporter:v1.7.0
- prometheus_openstack_database_exporter: ghcr.io/vexxhost/openstack-database-exporter:v0.2.0
+ prometheus_openstack_database_exporter: ghcr.io/vexxhost/openstack-database-exporter:v0.3.0
prometheus_openstack_exporter: ghcr.io/openstack-exporter/openstack-exporter:1.7.0
prometheus_operator_kube_webhook_certgen: registry.k8s.io/ingress-nginx/kube-webhook-certgen:v20221220-controller-v1.5.1-58-g787ea74b6
prometheus_operator: quay.io/prometheus-operator/prometheus-operator:v0.73.0
diff --git a/roles/kube_prometheus_stack/files/jsonnet/openstack.libsonnet b/roles/kube_prometheus_stack/files/jsonnet/openstack.libsonnet
index b8e6ac1..e77077e 100644
--- a/roles/kube_prometheus_stack/files/jsonnet/openstack.libsonnet
+++ b/roles/kube_prometheus_stack/files/jsonnet/openstack.libsonnet
@@ -204,6 +204,18 @@
},
},
{
+ alert: 'NovaServerTaskStateStuck',
+ annotations: {
+ summary: 'Nova server stuck in task state',
+ description: 'Nova server with ID {{ $labels.id }} stuck in {{ $labels.task_state }} state for more than 1 hour',
+ },
+ expr: 'openstack_nova_server_task_state > 0',
+ 'for': '1h',
+ labels: {
+ severity: 'P3',
+ },
+ },
+ {
alert: 'NovaInstanceError',
expr: 'openstack_nova_server_status{status="ERROR"} > 0',
'for': '24h',
diff --git a/roles/openstack_exporter/tasks/main.yml b/roles/openstack_exporter/tasks/main.yml
index 21e1915..115445d 100644
--- a/roles/openstack_exporter/tasks/main.yml
+++ b/roles/openstack_exporter/tasks/main.yml
@@ -117,15 +117,6 @@
selector:
application: openstack-exporter
-- name: Fetch Octavia DB secret
- run_once: true
- no_log: true
- kubernetes.core.k8s_info:
- kind: Secret
- namespace: openstack
- name: octavia-db-user
- register: _octavia_db_user
-
- name: Fetch Neutron DB secret
run_once: true
no_log: true
@@ -135,6 +126,24 @@
name: neutron-db-user
register: _neutron_db_user
+- name: Fetch Nova DB secret
+ run_once: true
+ no_log: true
+ kubernetes.core.k8s_info:
+ kind: Secret
+ namespace: openstack
+ name: nova-db-user
+ register: _nova_db_user
+
+- name: Fetch Octavia DB secret
+ run_once: true
+ no_log: true
+ kubernetes.core.k8s_info:
+ kind: Secret
+ namespace: openstack
+ name: octavia-db-user
+ register: _octavia_db_user
+
- name: Create "openstack-database-exporter-dsn" secret
run_once: true
no_log: true
@@ -151,6 +160,7 @@
application: openstack-database-exporter
stringData:
NEUTRON_DSN: "{{ _neutron_db_user.resources.0.data.DB_CONNECTION | b64decode | regex_replace('^.*//', '') | regex_replace('@(.*)/', '@tcp(\\1)/') }}"
+ NOVA_DSN: "{{ _nova_db_user.resources.0.data.DB_CONNECTION | b64decode | regex_replace('^.*//', '') | regex_replace('@(.*)/', '@tcp(\\1)/') }}"
OCTAVIA_DSN: "{{ _octavia_db_user.resources.0.data.DB_CONNECTION | b64decode | regex_replace('^.*//', '') | regex_replace('@(.*)/', '@tcp(\\1)/') }}"
- name: Deploy service