chore: refactor prometheusrules into kube-prometheus-stack
diff --git a/roles/ipmi_exporter/tasks/main.yml b/roles/ipmi_exporter/tasks/main.yml
index c52045d..bcc25d8 100644
--- a/roles/ipmi_exporter/tasks/main.yml
+++ b/roles/ipmi_exporter/tasks/main.yml
@@ -74,73 +74,6 @@
- key: feature.node.kubernetes.io/cpu-cpuid.HYPERVISOR
operator: NotIn
values: ["true"]
-
- - apiVersion: monitoring.coreos.com/v1
- kind: PrometheusRule
- metadata:
- name: ipmi-exporter
- namespace: monitoring
- labels:
- application: ipmi-exporter
- release: kube-prometheus-stack
- spec:
- groups:
- - name: rules
- rules:
- - alert: IpmiCollectorDown
- expr: ipmi_up == 0
- - name: collectors-state-warning
- rules:
- - alert: IpmiCurrent
- expr: ipmi_current_state == 1
- labels:
- severity: warning
- - alert: IpmiFanSpeed
- expr: ipmi_fan_speed_state == 1
- labels:
- severity: warning
- - alert: IpmiPower
- expr: ipmi_power_state == 1
- labels:
- severity: warning
- - alert: IpmiSensor
- expr: ipmi_sensor_state == 1
- labels:
- severity: warning
- - alert: IpmiTemperature
- expr: ipmi_temperature_state == 1
- labels:
- severity: warning
- - alert: IpmiVoltage
- expr: ipmi_voltage_state == 1
- labels:
- severity: warning
- - name: collectors-state-critical
- rules:
- - alert: IpmiCurrent
- expr: ipmi_current_state == 2
- labels:
- severity: critical
- - alert: IpmiFanSpeed
- expr: ipmi_fan_speed_state == 2
- labels:
- severity: critical
- - alert: IpmiPower
- expr: ipmi_power_state == 2
- labels:
- severity: critical
- - alert: IpmiSensor
- expr: ipmi_sensor_state == 2
- labels:
- severity: critical
- - alert: IpmiTemperature
- expr: ipmi_temperature_state == 2
- labels:
- severity: critical
- - alert: IpmiVoltage
- expr: ipmi_voltage_state == 2
- labels:
- severity: critical
# NOTE(mnaser): Since we haven't moved to the operator pattern yet, we need to
# keep retrying a few times as the CRDs might not be installed
# yet.
diff --git a/roles/kube_prometheus_stack/vars/main.yml b/roles/kube_prometheus_stack/vars/main.yml
index a25a5d1..ba047b7 100644
--- a/roles/kube_prometheus_stack/vars/main.yml
+++ b/roles/kube_prometheus_stack/vars/main.yml
@@ -368,6 +368,91 @@
for: 2m
labels:
severity: critical
+ ethtool-exporter:
+ groups:
+ - name: rules
+ rules:
+ - alert: EthernetReceiveDiscards
+ expr: rate(node_net_ethtool{type="rx_discards"}[1m]) > 0
+ labels:
+ severity: warning
+ ipmi-exporter:
+ groups:
+ - name: rules
+ rules:
+ - alert: IpmiCollectorDown
+ expr: ipmi_up == 0
+ - name: collectors-state-warning
+ rules:
+ - alert: IpmiCurrent
+ expr: ipmi_current_state == 1
+ labels:
+ severity: warning
+ - alert: IpmiFanSpeed
+ expr: ipmi_fan_speed_state == 1
+ labels:
+ severity: warning
+ - alert: IpmiPower
+ expr: ipmi_power_state == 1
+ labels:
+ severity: warning
+ - alert: IpmiSensor
+ expr: ipmi_sensor_state == 1
+ labels:
+ severity: warning
+ - alert: IpmiTemperature
+ expr: ipmi_temperature_state == 1
+ labels:
+ severity: warning
+ - alert: IpmiVoltage
+ expr: ipmi_voltage_state == 1
+ labels:
+ severity: warning
+ - name: collectors-state-critical
+ rules:
+ - alert: IpmiCurrent
+ expr: ipmi_current_state == 2
+ labels:
+ severity: critical
+ - alert: IpmiFanSpeed
+ expr: ipmi_fan_speed_state == 2
+ labels:
+ severity: critical
+ - alert: IpmiPower
+ expr: ipmi_power_state == 2
+ labels:
+ severity: critical
+ - alert: IpmiSensor
+ expr: ipmi_sensor_state == 2
+ labels:
+ severity: critical
+ - alert: IpmiTemperature
+ expr: ipmi_temperature_state == 2
+ labels:
+ severity: critical
+ - alert: IpmiVoltage
+ expr: ipmi_voltage_state == 2
+ labels:
+ severity: critical
+ memcached:
+ groups:
+ - name: memcached
+ rules:
+ - alert: MemcachedDown
+ expr: memcached_up == 0
+ for: 5m
+ labels:
+ severity: critical
+ - alert: MemcachedConnectionLimitApproaching
+ expr: (memcached_current_connections / memcached_max_connections * 100) > 80
+ for: 5m
+ labels:
+ severity: warning
+ - alert: MemcachedConnectionLimitApproaching
+ expr: (memcached_current_connections / memcached_max_connections * 100) > 95
+ for: 5m
+ labels:
+ severity: critical
node-exporter-local:
groups:
- name: node
@@ -404,3 +489,359 @@
for: 1m
labels:
severity: critical
+ openstack-exporter:
+ groups:
+ - name: cinder
+ rules:
+ - alert: CinderAgentDown
+ annotations:
+          description: >-
+            {% raw %}The service {{ $labels.exported_service }} running
+            on {{ $labels.hostname }} is being reported as down.{% endraw %}
+          summary: >-
+            {% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+            down{% endraw %}
+ expr: |
+ openstack_cinder_agent_state != 1
+ labels:
+ severity: warning
+ - alert: CinderAgentDown
+ annotations:
+          description: >-
+            {% raw %}The service {{ $labels.exported_service }} running on
+            {{ $labels.hostname }} is being reported as down for 5 minutes.
+            This can affect volume operations so it must be resolved as
+            quickly as possible.{% endraw %}
+          summary: >-
+            {% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+            down{% endraw %}
+ expr: |
+ openstack_cinder_agent_state != 1
+ for: 5m
+ labels:
+ severity: critical
+ - alert: CinderAgentDisabled
+ annotations:
+          description: >-
+            {% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
+            has been disabled for 60 minutes. This can affect volume operations so it must be
+            resolved as quickly as possible.{% endraw %}
+          summary: >-
+            {% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+            disabled{% endraw %}
+ expr: |
+ openstack_cinder_agent_state{adminState!="enabled"}
+ for: 1h
+ labels:
+ severity: warning
+ - alert: CinderVolumeInError
+ annotations:
+          description: >-
+            {% raw %}The volume {{ $labels.id }} has been in ERROR state for over 24 hours.
+            It must be cleaned up or removed in order to provide a consistent customer
+            experience.{% endraw %}
+          summary: >-
+            {% raw %}[{{ $labels.id }}] Volume in ERROR state{% endraw %}
+ expr: |
+ openstack_cinder_volume_status{status=~"error.*"}
+ for: 24h
+ labels:
+ severity: warning
+ - name: neutron
+ rules:
+ - alert: NeutronAgentDown
+ annotations:
+          description: >-
+            {% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
+            is being reported as down.{% endraw %}
+          summary: >-
+            {% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+            down{% endraw %}
+ expr: |
+ openstack_neutron_agent_state != 1
+ labels:
+ severity: warning
+ - alert: NeutronAgentDown
+ annotations:
+          description: >-
+            {% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
+            is being reported as down for 5 minutes. This can affect network operations so it
+            must be resolved as quickly as possible.{% endraw %}
+          summary: >-
+            {% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+            down{% endraw %}
+ expr: |
+ openstack_neutron_agent_state != 1
+ for: 5m
+ labels:
+ severity: critical
+ - alert: NeutronAgentDisabled
+ annotations:
+          description: >-
+            {% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
+            has been disabled for 60 minutes. This can affect network operations so it must be
+            resolved as quickly as possible.{% endraw %}
+          summary: >-
+            {% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+            disabled{% endraw %}
+ expr: |
+ openstack_neutron_agent_state{adminState!="up"}
+ for: 1h
+ labels:
+ severity: warning
+ - alert: NeutronBindingFailedPorts
+ annotations:
+          description: >-
+            {% raw %}The NIC {{ $labels.mac_address }} of {{ $labels.device_owner }}
+            has binding failed port now.{% endraw %}
+          summary: >-
+            {% raw %}[{{ $labels.device_owner }}] {{ $labels.mac_address }}
+            binding failed{% endraw %}
+ expr: |
+ openstack_neutron_port{binding_vif_type="binding_failed"} != 0
+ labels:
+ severity: warning
+ - alert: NeutronNetworkOutOfIPs
+ annotations:
+          description: >-
+            {% raw %}The subnet {{ $labels.subnet_name }} within {{ $labels.network_name }}
+            is currently at {{ $value }}% utilization. If the IP addresses run out, it will
+            impact the provisioning of new ports.{% endraw %}
+          summary: >-
+            {% raw %}[{{ $labels.network_name }}] {{ $labels.subnet_name }}
+            running out of IPs{% endraw %}
+ expr: |
+ sum by (network_id) (openstack_neutron_network_ip_availabilities_used{project_id!=""}) / sum by (network_id)
+ (openstack_neutron_network_ip_availabilities_total{project_id!=""}) * 100 > 80
+ labels:
+ severity: warning
+ - name: nova
+ rules:
+ - alert: NovaAgentDown
+ annotations:
+          description: >-
+            {% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
+            is being reported as down.{% endraw %}
+          summary: >-
+            {% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+            down{% endraw %}
+ expr: |
+ openstack_nova_agent_state != 1
+ labels:
+ severity: warning
+ - alert: NovaAgentDown
+ annotations:
+          description: >-
+            {% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} is
+            being reported as down. This can affect compute operations so it must be resolved as
+            quickly as possible.{% endraw %}
+          summary: >-
+            {% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+            down{% endraw %}
+ expr: |
+ openstack_nova_agent_state != 1
+ for: 5m
+ labels:
+ severity: critical
+ - alert: NovaAgentDisabled
+ annotations:
+          description: >-
+            {% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} has been
+            disabled for 60 minutes. This can affect compute operations so it must be resolved as quickly
+            as possible.{% endraw %}
+          summary: >-
+            {% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
+            disabled{% endraw %}
+ expr: |
+ openstack_nova_agent_state{adminState!="enabled"}
+ for: 1h
+ labels:
+ severity: warning
+ - alert: NovaInstanceInError
+ annotations:
+          description: >-
+            {% raw %}The instance {{ $labels.id }} has been in ERROR state for over 24 hours.
+            It must be cleaned up or removed in order to provide a consistent customer
+            experience.{% endraw %}
+ summary: "{% raw %}[{{ $labels.id }}] Instance in ERROR state{% endraw %}"
+ expr: |
+ openstack_nova_server_status{status="ERROR"}
+ for: 24h
+ labels:
+ severity: warning
+ - alert: NovaFailureRisk
+ annotations:
+          description: >-
+            {% raw %}The cloud capacity will be at {{ $value }} in the event of the failure of
+            a single hypervisor which puts the cloud at risk of not being able to recover should
+            any hypervisor failures occur. Please ensure that adequate amount of infrastructure
+            is assigned to this deployment to prevent this.{% endraw %}
+ summary: "{% raw %}[nova] Failure risk{% endraw %}"
+ expr: |
+ (sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) - max(openstack_nova_memory_used_bytes))
+ / sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) * 100 < 0.25
+ for: 6h
+ labels:
+ severity: warning
+ - alert: NovaCapacity
+ annotations:
+          description: >-
+            {% raw %}The cloud capacity is currently at `{{ $value }}` which means there is a
+            risk of running out of capacity due to the timeline required to add new nodes.
+            Please ensure that adequate amount of infrastructure is assigned to this deployment
+            to prevent this.{% endraw %}
+ summary: "{% raw %}[nova] Capacity risk{% endraw %}"
+ expr: |
+ sum (
+ openstack_nova_memory_used_bytes
+ + on(hostname) group_left(adminState)
+ (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"})
+ ) / sum (
+ openstack_nova_memory_available_bytes
+ + on(hostname) group_left(adminState)
+ (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"})
+ ) * 100 > 75
+ for: 6h
+ labels:
+ severity: warning
+ percona-xtradb-pxc:
+ groups:
+ # TODO: basic rules
+ - name: general
+ rules:
+ - alert: MySQLDown
+ expr: mysql_up != 1
+ for: 5m
+ labels:
+ severity: critical
+ - alert: MysqlTooManyConnections
+ expr: max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80
+ for: 2m
+ labels:
+ severity: warning
+ - alert: MysqlHighThreadsRunning
+ expr: max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60
+ for: 2m
+ labels:
+ severity: warning
+ - alert: MysqlSlowQueries
+ expr: increase(mysql_global_status_slow_queries[1m]) > 0
+ for: 2m
+ labels:
+ severity: warning
+ - name: galera
+ rules:
+ - alert: MySQLGaleraNotReady
+ expr: mysql_global_status_wsrep_ready != 1
+ for: 5m
+ labels:
+ severity: critical
+ - alert: MySQLGaleraOutOfSync
+ expr: mysql_global_status_wsrep_local_state != 4 and mysql_global_variables_wsrep_desync == 0
+ for: 5m
+ labels:
+ severity: critical
+ - alert: MySQLGaleraDonorFallingBehind
+ expr: mysql_global_status_wsrep_local_state == 2 and mysql_global_status_wsrep_local_recv_queue > 100
+ for: 5m
+ labels:
+ severity: warning
+ - alert: MySQLReplicationNotRunning
+ expr: mysql_slave_status_slave_io_running == 0 or mysql_slave_status_slave_sql_running == 0
+ for: 2m
+ labels:
+ severity: critical
+ - alert: MySQLReplicationLag
+ expr: (instance:mysql_slave_lag_seconds > 30) and on(instance) (predict_linear(instance:mysql_slave_lag_seconds[5m], 60 * 2) > 0)
+ for: 1m
+ labels:
+ severity: critical
+ - alert: MySQLHeartbeatLag
+ expr: (instance:mysql_heartbeat_lag_seconds > 30) and on(instance) (predict_linear(instance:mysql_heartbeat_lag_seconds[5m], 60 * 2) > 0)
+ for: 1m
+ labels:
+ severity: critical
+ - alert: MySQLInnoDBLogWaits
+ expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
+ labels:
+ severity: warning
+ rabbitmq:
+ groups:
+ - name: recording
+ rules:
+ - record: rabbitmq:usage:memory
+ labels:
+ job: rabbitmq
+ expr: |
+ sum without (job) (
+ rabbitmq_process_resident_memory_bytes
+ ) / sum without (
+ container,
+ pod,
+ job,
+ namespace,
+ node,
+ resource,
+ uid,
+ unit
+ ) (
+ label_replace(
+ cluster:namespace:pod_memory:active:kube_pod_container_resource_limits,
+ "instance",
+ "$1",
+ "pod",
+ "(.*)"
+ )
+ )
+ - name: alarms
+ rules:
+ - alert: RabbitmqAlarmFreeDiskSpace
+ expr: rabbitmq_alarms_free_disk_space_watermark == 1
+ labels:
+ severity: critical
+ - alert: RabbitmqAlarmMemoryUsedWatermark
+ expr: rabbitmq_alarms_memory_used_watermark == 1
+ labels:
+ severity: critical
+ - alert: RabbitmqAlarmFileDescriptorLimit
+ expr: rabbitmq_alarms_file_descriptor_limit == 1
+ labels:
+ severity: critical
+ - name: limits
+ rules:
+ - alert: RabbitmqMemoryHigh
+ expr: rabbitmq:usage:memory > 0.80
+ labels:
+ severity: warning
+ - alert: RabbitmqMemoryHigh
+ expr: rabbitmq:usage:memory > 0.95
+ labels:
+ severity: critical
+ - alert: RabbitmqFileDescriptorsUsage
+ expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds > 0.80
+ labels:
+ severity: warning
+ - alert: RabbitmqFileDescriptorsUsage
+ expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds > 0.95
+ labels:
+ severity: critical
+ - alert: RabbitmqTcpSocketsUsage
+ expr: rabbitmq_process_open_tcp_sockets / rabbitmq_process_max_tcp_sockets > 0.80
+ labels:
+ severity: warning
+ - alert: RabbitmqTcpSocketsUsage
+ expr: rabbitmq_process_open_tcp_sockets / rabbitmq_process_max_tcp_sockets > 0.95
+ labels:
+ severity: critical
+ - name: msgs
+ rules:
+ - alert: RabbitmqUnackedMessages
+ expr: sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000
+ for: 5m
+ labels:
+ severity: warning
+ - alert: RabbitmqUnackedMessages
+ expr: sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000
+ for: 1h
+ labels:
+ severity: critical
diff --git a/roles/openstack_exporter/tasks/main.yml b/roles/openstack_exporter/tasks/main.yml
index 0a364c5..0b38ef4 100644
--- a/roles/openstack_exporter/tasks/main.yml
+++ b/roles/openstack_exporter/tasks/main.yml
@@ -126,229 +126,6 @@
targetPort: metrics
selector:
application: openstack-exporter
-
- - apiVersion: monitoring.coreos.com/v1
- kind: PrometheusRule
- metadata:
- name: openstack-exporter
- namespace: monitoring
- labels:
- application: openstack-exporter
- spec:
- groups:
- - name: cinder
- rules:
- - alert: CinderAgentDown
- annotations:
- description: |
- '{% raw %}The service {{ $labels.exported_service }} running
- on {{ $labels.hostname }} is being reported as down.{% endraw %}'
- summary: |
- '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
- down{% endraw %}'
- expr: |
- openstack_cinder_agent_state != 1
- labels:
- severity: warning
- - alert: CinderAgentDown
- annotations:
- description: |
- '{% raw %}The service {{ $labels.exported_service }} running on
- {{ $labels.hostname }} is being reported as down for 5 minutes.
- This can affect volume operations so it must be resolved as
- quickly as possible.{% endraw %}'
- summary: |
- '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
- down{% endraw %}'
- expr: |
- openstack_cinder_agent_state != 1
- for: 5m
- labels:
- severity: critical
- - alert: CinderAgentDisabled
- annotations:
- description: |
- '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
- has been disabled for 60 minutes. This can affect volume operations so it must be
- resolved as quickly as possible.{% endraw %}'
- summary: |
- '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
- disabled{% endraw %}'
- expr: |
- openstack_cinder_agent_state{adminState!="enabled"}
- for: 1h
- labels:
- severity: warning
- - alert: CinderVolumeInError
- annotations:
- description: |
- '{% raw %}The volume {{ $labels.id }} has been in ERROR state for over 24 hours.
- It must be cleaned up or removed in order to provide a consistent customer
- experience.{% endraw %}'
- summary: |
- '{% raw %}[{{ $labels.id }}] Volume in ERROR state{% endraw %}'
- expr: |
- openstack_cinder_volume_status{status=~"error.*"}
- for: 24h
- labels:
- severity: warning
- - name: neutron
- rules:
- - alert: NeutronAgentDown
- annotations:
- description: |
- '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
- is being reported as down.{% endraw %}'
- summary: |
- '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
- down{% endraw %}'
- expr: |
- openstack_neutron_agent_state != 1
- labels:
- severity: warning
- - alert: NeutronAgentDown
- annotations:
- description: |
- '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
- is being reported as down for 5 minutes. This can affect network operations so it
- must be resolved as quickly as possible.{% endraw %}'
- summary: |
- '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
- down{% endraw %}'
- expr: |
- openstack_neutron_agent_state != 1
- for: 5m
- labels:
- severity: critical
- - alert: NeutronAgentDisabled
- annotations:
- description: |
- '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
- has been disabled for 60 minutes. This can affect network operations so it must be
- resolved as quickly as possible.{% endraw %}'
- summary: |
- '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
- disabled{% endraw %}'
- expr: |
- openstack_neutron_agent_state{adminState!="up"}
- for: 1h
- labels:
- severity: warning
- - alert: NeutronBindingFailedPorts
- annotations:
- description: |
- '{% raw %}The NIC {{ $labels.mac_address }} of {{ $labels.device_owner }}
- has binding failed port now.{% endraw %}'
- summary: |
- '{% raw %}[{{ $labels.device_owner }}] {{ $labels.mac_address }}
- binding failed{% endraw %}'
- expr: |
- openstack_neutron_port{binding_vif_type="binding_failed"} != 0
- labels:
- severity: warning
- - alert: NeutronNetworkOutOfIPs
- annotations:
- description: |
- '{% raw %}The subnet {{ $labels.subnet_name }} within {{ $labels.network_name }}
- is currently at {{ $value }}% utilization. If the IP addresses run out, it will
- impact the provisioning of new ports.{% endraw %}'
- summary: |
- '{% raw %}[{{ $labels.network_name }}] {{ $labels.subnet_name }}
- running out of IPs{% endraw %}'
- expr: |
- sum by (network_id) (openstack_neutron_network_ip_availabilities_used{project_id!=""}) / sum by (network_id)
- (openstack_neutron_network_ip_availabilities_total{project_id!=""}) * 100 > 80
- labels:
- severity: warning
- - name: nova
- rules:
- - alert: NovaAgentDown
- annotations:
- description: |
- '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
- is being reported as down.{% endraw %}'
- summary: |
- '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
- down{% endraw %}'
- expr: |
- openstack_nova_agent_state != 1
- labels:
- severity: warning
- - alert: NovaAgentDown
- annotations:
- description: |
- '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} is
- being reported as down. This can affect compute operations so it must be resolved as
- quickly as possible.{% endraw %}'
- summary: |
- '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
- down{% endraw %}'
- expr: |
- openstack_nova_agent_state != 1
- for: 5m
- labels:
- severity: critical
- - alert: NovaAgentDisabled
- annotations:
- description: |
- '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} has been
- disabled for 60 minutes. This can affect compute operations so it must be resolved as quickly
- as possible.{% endraw %}'
- summary: |
- '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
- disabled{% endraw %}'
- expr: |
- openstack_nova_agent_state{adminState!="enabled"}
- for: 1h
- labels:
- severity: warning
- - alert: NovaInstanceInError
- annotations:
- description: |
- '{% raw %}The instance {{ $labels.id }} has been in ERROR state for over 24 hours.
- It must be cleaned up or removed in order to provide a consistent customer
- experience.{% endraw %}'
- summary: '{% raw %}[{{ $labels.id }}] Instance in ERROR state{% endraw %}'
- expr: |
- openstack_nova_server_status{status="ERROR"}
- for: 24h
- labels:
- severity: warning
- - alert: NovaFailureRisk
- annotations:
- description: |
- '{% raw %}The cloud capacity will be at {{ $value }} in the event of the failure of
- a single hypervisor which puts the cloud at risk of not being able to recover should
- any hypervisor failures occur. Please ensure that adequate amount of infrastructure
- is assigned to this deployment to prevent this.{% endraw %}'
- summary: '{% raw %}[nova] Failure risk{% endraw %}'
- expr: |
- (sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) - max(openstack_nova_memory_used_bytes))
- / sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) * 100 < 0.25
- for: 6h
- labels:
- severity: warning
- - alert: NovaCapacity
- annotations:
- description: |
- '{% raw %}The cloud capacity is currently at `{{ $value }}` which means there is a
- risk of running out of capacity due to the timeline required to add new nodes.
- Please ensure that adequate amount of infrastructure is assigned to this deployment
- to prevent this.{% endraw %}'
- summary: '{% raw %}[nova] Capacity risk{% endraw %}'
- expr: |
- sum (
- openstack_nova_memory_used_bytes
- + on(hostname) group_left(adminState)
- (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"})
- ) / sum (
- openstack_nova_memory_available_bytes
- + on(hostname) group_left(adminState)
- (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"})
- ) * 100 > 75
- for: 6h
- labels:
- severity: warning
# NOTE(mnaser): Since we haven't moved to the operator pattern yet, we need to
# keep retrying a few times as the CRDs might not be installed
# yet.
diff --git a/roles/openstack_helm_infra_memcached/tasks/main.yml b/roles/openstack_helm_infra_memcached/tasks/main.yml
index 78397e1..9a21e7e 100644
--- a/roles/openstack_helm_infra_memcached/tasks/main.yml
+++ b/roles/openstack_helm_infra_memcached/tasks/main.yml
@@ -54,30 +54,3 @@
- name: metrics
port: 9150
targetPort: 9150
-
- - apiVersion: monitoring.coreos.com/v1
- kind: PrometheusRule
- metadata:
- name: memcached
- namespace: monitoring
- labels:
- release: kube-prometheus-stack
- spec:
- groups:
- - name: memcached
- rules:
- - alert: MemcachedDown
- expr: memcached_up == 0
- for: 5m
- labels:
- severity: critical
- - alert: MemcachedConnectionLimitApproaching
- expr: (memcached_current_connections / memcached_max_connections * 100) > 80
- for: 5m
- labels:
- severity: warning
- - alert: MemcachedConnectionLimitApproaching
- expr: (memcached_current_connections / memcached_max_connections * 100) > 95
- for: 5m
- labels:
- severity: critical
diff --git a/roles/percona_xtradb_cluster/tasks/main.yml b/roles/percona_xtradb_cluster/tasks/main.yml
index 7c45364..71258e6 100644
--- a/roles/percona_xtradb_cluster/tasks/main.yml
+++ b/roles/percona_xtradb_cluster/tasks/main.yml
@@ -89,75 +89,6 @@
image: percona/percona-xtradb-cluster-operator:1.10.0-haproxy
nodeSelector:
openstack-control-plane: enabled
-
- - apiVersion: monitoring.coreos.com/v1
- kind: PrometheusRule
- metadata:
- name: percona-xtradb-pxc
- namespace: monitoring
- labels:
- release: kube-prometheus-stack
- spec:
- groups:
- # TODO: basic rules
- - name: general
- rules:
- - alert: MySQLDown
- expr: mysql_up != 1
- for: 5m
- labels:
- severity: critical
- - alert: MysqlTooManyConnections
- expr: max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80
- for: 2m
- labels:
- severity: warning
- - alert: MysqlHighThreadsRunning
- expr: max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60
- for: 2m
- labels:
- severity: warning
- - alert: MysqlSlowQueries
- expr: increase(mysql_global_status_slow_queries[1m]) > 0
- for: 2m
- labels:
- severity: warning
- - name: galera
- rules:
- - alert: MySQLGaleraNotReady
- expr: mysql_global_status_wsrep_ready != 1
- for: 5m
- labels:
- severity: critical
- - alert: MySQLGaleraOutOfSync
- expr: mysql_global_status_wsrep_local_state != 4 and mysql_global_variables_wsrep_desync == 0
- for: 5m
- labels:
- severity: critical
- - alert: MySQLGaleraDonorFallingBehind
- expr: mysql_global_status_wsrep_local_state == 2 and mysql_global_status_wsrep_local_recv_queue > 100
- for: 5m
- labels:
- severity: warning
- - alert: MySQLReplicationNotRunning
- expr: mysql_slave_status_slave_io_running == 0 or mysql_slave_status_slave_sql_running == 0
- for: 2m
- labels:
- severity: critical
- - alert: MySQLReplicationLag
- expr: (instance:mysql_slave_lag_seconds > 30) and on(instance) (predict_linear(instance:mysql_slave_lag_seconds[5m], 60 * 2) > 0)
- for: 1m
- labels:
- severity: critical
- - alert: MySQLHeartbeatLag
- expr: (instance:mysql_heartbeat_lag_seconds > 30) and on(instance) (predict_linear(instance:mysql_heartbeat_lag_seconds[5m], 60 * 2) > 0)
- for: 1m
- labels:
- severity: critical
- - alert: MySQLInnoDBLogWaits
- expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
- labels:
- severity: warning
# NOTE(mnaser): Since we haven't moved to the operator pattern yet, we need to
# keep retrying a few times as the CRDs might not be installed
# yet.
diff --git a/roles/prometheus_ethtool_exporter/tasks/main.yml b/roles/prometheus_ethtool_exporter/tasks/main.yml
index 4f024a8..24d01a8 100644
--- a/roles/prometheus_ethtool_exporter/tasks/main.yml
+++ b/roles/prometheus_ethtool_exporter/tasks/main.yml
@@ -42,20 +42,3 @@
ports:
- name: metrics
containerPort: 9417
-
- - apiVersion: monitoring.coreos.com/v1
- kind: PrometheusRule
- metadata:
- name: ethtool-exporter
- namespace: monitoring
- labels:
- application: ethtool-exporter
- release: kube-prometheus-stack
- spec:
- groups:
- - name: rules
- rules:
- - alert: EthernetReceiveDiscards
- expr: rate(node_net_ethtool{type="rx_discards"}[1m]) > 0
- labels:
- severity: warning
diff --git a/roles/rabbitmq_operator/tasks/main.yml b/roles/rabbitmq_operator/tasks/main.yml
index 670a91a..2ba1c90 100644
--- a/roles/rabbitmq_operator/tasks/main.yml
+++ b/roles/rabbitmq_operator/tasks/main.yml
@@ -56,102 +56,3 @@
nodeSelector:
openstack-control-plane: enabled
useCertManager: true
-
-- name: Deploy monitoring for RabbitMQ
- kubernetes.core.k8s:
- state: present
- definition:
- - apiVersion: monitoring.coreos.com/v1
- kind: PrometheusRule
- metadata:
- name: rabbitmq
- namespace: monitoring
- labels:
- release: kube-prometheus-stack
- spec:
- groups:
- - name: recording
- rules:
- - record: rabbitmq:usage:memory
- labels:
- job: rabbitmq
- expr: |
- sum without (job) (
- rabbitmq_process_resident_memory_bytes
- ) / sum without (
- container,
- pod,
- job,
- namespace,
- node,
- resource,
- uid,
- unit
- ) (
- label_replace(
- cluster:namespace:pod_memory:active:kube_pod_container_resource_limits,
- "instance",
- "$1",
- "pod",
- "(.*)"
- )
- )
- - name: alarms
- rules:
- - alert: RabbitmqAlarmFreeDiskSpace
- expr: rabbitmq_alarms_free_disk_space_watermark == 1
- labels:
- severity: critical
- - alert: RabbitmqAlarmMemoryUsedWatermark
- expr: rabbitmq_alarms_memory_used_watermark == 1
- labels:
- severity: critical
- - alert: RabbitmqAlarmFileDescriptorLimit
- expr: rabbitmq_alarms_file_descriptor_limit == 1
- labels:
- severity: critical
- - name: limits
- rules:
- - alert: RabbitmqMemoryHigh
- expr: rabbitmq:usage:memory > 0.80
- labels:
- severity: warning
- - alert: RabbitmqMemoryHigh
- expr: rabbitmq:usage:memory > 0.95
- labels:
- severity: critical
- - alert: RabbitmqFileDescriptorsUsage
- expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds > 0.80
- labels:
- severity: warning
- - alert: RabbitmqFileDescriptorsUsage
- expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds > 0.95
- labels:
- severity: critical
- - alert: RabbitmqTcpSocketsUsage
- expr: rabbitmq_process_open_tcp_sockets / rabbitmq_process_max_tcp_sockets > 0.80
- labels:
- severity: warning
- - alert: RabbitmqTcpSocketsUsage
- expr: rabbitmq_process_open_tcp_sockets / rabbitmq_process_max_tcp_sockets > 0.95
- labels:
- severity: critical
- - name: msgs
- rules:
- - alert: RabbitmqUnackedMessages
- expr: sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000
- for: 5m
- labels:
- severity: warning
- - alert: RabbitmqUnackedMessages
- expr: sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000
- for: 1h
- labels:
- severity: critical
- # NOTE(mnaser): Since we haven't moved to the operator pattern yet, we need to
- # keep retrying a few times as the CRDs might not be installed
- # yet.
- retries: 60
- delay: 5
- register: _result
- until: _result is not failed