[stable/zed] Add monitoring for stuck VMs (#1133)
This is an automated cherry-pick of #1129
/assign mnaser
diff --git a/roles/defaults/vars/main.yml b/roles/defaults/vars/main.yml
index 4b82047..6d36998 100644
--- a/roles/defaults/vars/main.yml
+++ b/roles/defaults/vars/main.yml
@@ -174,7 +174,7 @@
prometheus_memcached_exporter: quay.io/prometheus/memcached-exporter:v0.10.0
prometheus_mysqld_exporter: quay.io/prometheus/mysqld-exporter:v0.14.0
prometheus_node_exporter: quay.io/prometheus/node-exporter:v1.7.0
- prometheus_openstack_database_exporter: ghcr.io/vexxhost/openstack-database-exporter:v0.2.0
+ prometheus_openstack_database_exporter: ghcr.io/vexxhost/openstack-database-exporter:v0.3.0
prometheus_openstack_exporter: ghcr.io/openstack-exporter/openstack-exporter:1.7.0
prometheus_operator_kube_webhook_certgen: registry.k8s.io/ingress-nginx/kube-webhook-certgen:v20221220-controller-v1.5.1-58-g787ea74b6
prometheus_operator: quay.io/prometheus-operator/prometheus-operator:v0.73.0
diff --git a/roles/kube_prometheus_stack/files/jsonnet/openstack.libsonnet b/roles/kube_prometheus_stack/files/jsonnet/openstack.libsonnet
index b8e6ac1..e77077e 100644
--- a/roles/kube_prometheus_stack/files/jsonnet/openstack.libsonnet
+++ b/roles/kube_prometheus_stack/files/jsonnet/openstack.libsonnet
@@ -204,6 +204,18 @@
},
},
{
+ alert: 'NovaServerTaskStateStuck',
+ annotations: {
+ summary: 'Nova server stuck in task state',
+ description: 'Nova server with ID {{ $labels.id }} stuck in {{ $labels.task_state }} state for more than 1 hour',
+ },
+ expr: 'openstack_nova_server_task_state > 0',
+ 'for': '1h',
+ labels: {
+ severity: 'P3',
+ },
+ },
+ {
alert: 'NovaInstanceError',
expr: 'openstack_nova_server_status{status="ERROR"} > 0',
'for': '24h',
diff --git a/roles/openstack_exporter/tasks/main.yml b/roles/openstack_exporter/tasks/main.yml
index 21e1915..115445d 100644
--- a/roles/openstack_exporter/tasks/main.yml
+++ b/roles/openstack_exporter/tasks/main.yml
@@ -117,15 +117,6 @@
selector:
application: openstack-exporter
-- name: Fetch Octavia DB secret
- run_once: true
- no_log: true
- kubernetes.core.k8s_info:
- kind: Secret
- namespace: openstack
- name: octavia-db-user
- register: _octavia_db_user
-
- name: Fetch Neutron DB secret
run_once: true
no_log: true
@@ -135,6 +126,24 @@
name: neutron-db-user
register: _neutron_db_user
+- name: Fetch Nova DB secret
+ run_once: true
+ no_log: true
+ kubernetes.core.k8s_info:
+ kind: Secret
+ namespace: openstack
+ name: nova-db-user
+ register: _nova_db_user
+
+- name: Fetch Octavia DB secret
+ run_once: true
+ no_log: true
+ kubernetes.core.k8s_info:
+ kind: Secret
+ namespace: openstack
+ name: octavia-db-user
+ register: _octavia_db_user
+
- name: Create "openstack-database-exporter-dsn" secret
run_once: true
no_log: true
@@ -151,6 +160,7 @@
application: openstack-database-exporter
stringData:
NEUTRON_DSN: "{{ _neutron_db_user.resources.0.data.DB_CONNECTION | b64decode | regex_replace('^.*//', '') | regex_replace('@(.*)/', '@tcp(\\1)/') }}"
+ NOVA_DSN: "{{ _nova_db_user.resources.0.data.DB_CONNECTION | b64decode | regex_replace('^.*//', '') | regex_replace('@(.*)/', '@tcp(\\1)/') }}"
OCTAVIA_DSN: "{{ _octavia_db_user.resources.0.data.DB_CONNECTION | b64decode | regex_replace('^.*//', '') | regex_replace('@(.*)/', '@tcp(\\1)/') }}"
- name: Deploy service