Add build request failure monitoring [ATMOSPHERE-249] (#1414)
diff --git a/roles/defaults/vars/main.yml b/roles/defaults/vars/main.yml
index ec0a590..b0dcb1c 100644
--- a/roles/defaults/vars/main.yml
+++ b/roles/defaults/vars/main.yml
@@ -187,7 +187,7 @@
prometheus_memcached_exporter: quay.io/prometheus/memcached-exporter:v0.14.3
prometheus_mysqld_exporter: quay.io/prometheus/mysqld-exporter:v0.15.1
prometheus_node_exporter: quay.io/prometheus/node-exporter:v1.8.1
- prometheus_openstack_database_exporter: ghcr.io/vexxhost/openstack-database-exporter:v0.3.0
+ prometheus_openstack_database_exporter: ghcr.io/vexxhost/openstack-database-exporter:v0.4.2
prometheus_openstack_exporter: ghcr.io/openstack-exporter/openstack-exporter:1.7.0
prometheus_operator_kube_webhook_certgen: registry.k8s.io/ingress-nginx/kube-webhook-certgen:v20221220-controller-v1.5.1-58-g787ea74b6
prometheus_operator: quay.io/prometheus-operator/prometheus-operator:v0.74.0
diff --git a/roles/kube_prometheus_stack/files/jsonnet/openstack.libsonnet b/roles/kube_prometheus_stack/files/jsonnet/openstack.libsonnet
index e77077e..0824e07 100644
--- a/roles/kube_prometheus_stack/files/jsonnet/openstack.libsonnet
+++ b/roles/kube_prometheus_stack/files/jsonnet/openstack.libsonnet
@@ -1,4 +1,18 @@
{
+ prometheusRules+:: {
+ groups+: [  // append, as prometheusAlerts does below, rather than replacing inherited groups
+ {
+ name: 'recording',
+ rules:
+ [
+ {
+ record: 'nova:build_requests:sum',  // consumed by the NovaStuckBuildRequestIncreasing alert
+ expr: 'sum(openstack_nova_api_build_request)',  // folds every build-request series into one total
+ },
+ ],
+ },
+ ],
+ },
prometheusAlerts+: {
groups+: [
{
@@ -254,6 +268,35 @@
],
},
{
+ name: 'nova-build-requests',  // alerts on Nova build requests that linger or pile up
+ rules: [
+ {
+ alert: 'NovaStuckBuildRequest',
+ annotations: {
+ summary: 'Nova build request stuck in queue for more than 1 hour',
+ description: 'Instance ID {{ $labels.instance_uuid }} (project: {{ $labels.project_id }}) has been stuck in build request state for more than 1 hour.',
+ },
+ expr: 'openstack_nova_api_build_request > 0',  // evaluated per series; must hold for the whole 'for' window
+ 'for': '1h',
+ labels: {
+ severity: 'P4',
+ },
+ },
+ {
+ alert: 'NovaStuckBuildRequestIncreasing',
+ annotations: {
+ summary: 'Nova build request is increasing',
+ description: 'Build request count rate is increasing across the cluster.',
+ },
+ expr: 'deriv(nova:build_requests:sum[5m]) > 0',  // gauge input: deriv(), not rate(), which misreads drops as counter resets
+ 'for': '15m',
+ labels: {
+ severity: 'P3',
+ },
+ },
+ ]
+ },
+ {
name: 'octavia',
rules:
[
diff --git a/roles/openstack_exporter/tasks/main.yml b/roles/openstack_exporter/tasks/main.yml
index 115445d..d6e067e 100644
--- a/roles/openstack_exporter/tasks/main.yml
+++ b/roles/openstack_exporter/tasks/main.yml
@@ -159,9 +159,10 @@
labels:
application: openstack-database-exporter
stringData:
- NEUTRON_DSN: "{{ _neutron_db_user.resources.0.data.DB_CONNECTION | b64decode | regex_replace('^.*//', '') | regex_replace('@(.*)/', '@tcp(\\1)/') }}"
- NOVA_DSN: "{{ _nova_db_user.resources.0.data.DB_CONNECTION | b64decode | regex_replace('^.*//', '') | regex_replace('@(.*)/', '@tcp(\\1)/') }}"
- OCTAVIA_DSN: "{{ _octavia_db_user.resources.0.data.DB_CONNECTION | b64decode | regex_replace('^.*//', '') | regex_replace('@(.*)/', '@tcp(\\1)/') }}"
+ NEUTRON_DSN: "{{ _neutron_db_user.resources.0.data.DB_CONNECTION | b64decode | regex_replace('^.*//', '') | regex_replace('@(.*)/', '@tcp(\\1)/') }}"
+ NOVA_DSN: "{{ _nova_db_user.resources.0.data.DB_CONNECTION | b64decode | regex_replace('^.*//', '') | regex_replace('@(.*)/', '@tcp(\\1)/') }}"
+ NOVA_API_DSN: "{{ _nova_db_user.resources.0.data.DB_CONNECTION | b64decode | regex_replace('^.*//', '') | regex_replace('@(.*)/', '@tcp(\\1)/') | regex_replace('^([^?]*)', '\\1_api') }}"  # Nova credentials, "_api" database; suffix goes before any "?query" part
+ OCTAVIA_DSN: "{{ _octavia_db_user.resources.0.data.DB_CONNECTION | b64decode | regex_replace('^.*//', '') | regex_replace('@(.*)/', '@tcp(\\1)/') }}"
- name: Deploy service
run_once: true