# Copyright (c) 2022 VEXXHOST, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
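#
# NOTE: Default Helm values for the kube-prometheus-stack chart, applied by
# this role at deploy time. Alert annotations are wrapped in
# {% raw %}...{% endraw %} so Ansible's Jinja2 pass leaves the Prometheus
# {{ $labels.* }} and {{ $value }} template placeholders intact.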
_kube_prometheus_stack_values:
defaultRules:
disabled:
# NOTE(mnaser): https://github.com/prometheus-community/helm-charts/issues/144
# https://github.com/openshift/cluster-monitoring-operator/issues/248
etcdHighNumberOfFailedGRPCRequests: true
alertmanager:
serviceMonitor:
relabelings:
- sourceLabels: ["__meta_kubernetes_pod_name"]
targetLabel: "instance"
- action: "labeldrop"
regex: "^(container|endpoint|namespace|pod|service)$"
alertmanagerSpec:
storage:
volumeClaimTemplate:
spec:
storageClassName: general
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 40Gi
nodeSelector:
openstack-control-plane: enabled
grafana:
serviceMonitor:
relabelings:
- sourceLabels: ["__meta_kubernetes_pod_name"]
targetLabel: "instance"
- action: "labeldrop"
regex: "^(container|endpoint|namespace|pod|service)$"
nodeSelector:
openstack-control-plane: enabled
kubeApiServer:
serviceMonitor:
relabelings:
- sourceLabels: ["__meta_kubernetes_pod_node_name"]
targetLabel: "instance"
- action: "labeldrop"
regex: "^(container|endpoint|namespace|pod|service)$"
kubelet:
serviceMonitor:
cAdvisorRelabelings:
- sourceLabels: [__metrics_path__]
targetLabel: metrics_path
- sourceLabels: ["node"]
targetLabel: "instance"
- action: "labeldrop"
regex: "^(container|endpoint|namespace|node|service)$"
probesRelabelings:
- sourceLabels: [__metrics_path__]
targetLabel: metrics_path
- sourceLabels: ["node"]
targetLabel: "instance"
- action: "labeldrop"
regex: "^(container|endpoint|namespace|node|service)$"
relabelings:
- sourceLabels: [__metrics_path__]
targetLabel: metrics_path
- sourceLabels: ["node"]
targetLabel: "instance"
- action: "labeldrop"
regex: "^(container|endpoint|namespace|node|service)$"
kubeControllerManager:
serviceMonitor:
relabelings:
- sourceLabels: ["__meta_kubernetes_pod_node_name"]
targetLabel: "instance"
- action: "labeldrop"
regex: "^(container|endpoint|namespace|pod|service)$"
coreDns:
serviceMonitor:
relabelings:
- sourceLabels: ["__meta_kubernetes_pod_name"]
targetLabel: "instance"
- regex: "^(container|endpoint|namespace|pod|service)$"
action: "labeldrop"
kubeEtcd:
serviceMonitor:
scheme: https
serverName: localhost
insecureSkipVerify: false
caFile: /etc/prometheus/secrets/kube-prometheus-stack-etcd-client-cert/ca.crt
certFile: /etc/prometheus/secrets/kube-prometheus-stack-etcd-client-cert/healthcheck-client.crt
keyFile: /etc/prometheus/secrets/kube-prometheus-stack-etcd-client-cert/healthcheck-client.key
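# NOTE: These paths resolve because the Prometheus Operator mounts every
# secret listed under prometheusSpec.secrets (see below) into the Prometheus
# pods at /etc/prometheus/secrets/<secret-name>/.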
relabelings:
- sourceLabels: ["__meta_kubernetes_pod_node_name"]
targetLabel: "instance"
- action: "labeldrop"
regex: "^(container|endpoint|namespace|pod|service)$"
kubeScheduler:
service:
port: 10259
targetPort: 10259
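# NOTE: 10259 is kube-scheduler's secure (HTTPS-only) metrics port, hence
# https: true below; certificate verification is skipped because the
# scheduler serves a self-signed certificate by default.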
serviceMonitor:
https: true
insecureSkipVerify: true
relabelings:
- sourceLabels: ["__meta_kubernetes_pod_node_name"]
targetLabel: "instance"
- action: "labeldrop"
regex: "^(container|endpoint|namespace|pod|service)$"
kubeProxy:
serviceMonitor:
relabelings:
- sourceLabels: ["__meta_kubernetes_pod_node_name"]
targetLabel: "instance"
- action: "labeldrop"
regex: "^(container|endpoint|namespace|pod|service)$"
kube-state-metrics:
prometheus:
monitor:
relabelings:
- sourceLabels: ["__meta_kubernetes_pod_name"]
targetLabel: "instance"
- action: "labeldrop"
regex: "^(container|endpoint|namespace|pod|service)$"
nodeSelector:
openstack-control-plane: enabled
prometheus:
serviceMonitor:
relabelings:
- sourceLabels: ["__meta_kubernetes_pod_name"]
targetLabel: "instance"
- action: "labeldrop"
regex: "^(container|endpoint|namespace|pod|service)$"
prometheusSpec:
nodeSelector:
openstack-control-plane: enabled
secrets:
- kube-prometheus-stack-etcd-client-cert
additionalServiceMonitors:
- name: ceph
jobLabel: application
selector:
matchLabels:
application: ceph
namespaceSelector:
matchNames:
- openstack
endpoints:
- port: metrics
honorLabels: true
relabelings:
- action: replace
regex: (.*)
replacement: ceph
targetLabel: cluster
- action: "labeldrop"
regex: "^(container|endpoint|namespace|pod|service)$"
- name: coredns
jobLabel: app.kubernetes.io/name
namespaceSelector:
matchNames:
- openstack
selector:
matchLabels:
app.kubernetes.io/name: coredns
app.kubernetes.io/component: metrics
endpoints:
- port: "metrics"
path: "/metrics"
relabelings:
- sourceLabels: ["__meta_kubernetes_pod_label_application"]
targetLabel: "application"
- sourceLabels: ["__meta_kubernetes_pod_name"]
targetLabel: "instance"
- action: "labeldrop"
regex: "^(container|endpoint|namespace|pod|service)$"
- name: memcached
jobLabel: application
namespaceSelector:
matchNames:
- openstack
selector:
matchLabels:
application: memcached
component: server
endpoints:
- port: "metrics"
path: "/metrics"
relabelings:
- sourceLabels: ["__meta_kubernetes_pod_name"]
targetLabel: "instance"
- action: "labeldrop"
regex: "^(container|endpoint|namespace|pod|service)$"
- name: openstack-exporter
jobLabel: jobLabel
namespaceSelector:
matchNames:
- openstack
selector:
matchLabels:
application: openstack-exporter
endpoints:
- interval: 1m
scrapeTimeout: 30s
port: metrics
relabelings:
- action: replace
regex: (.*)
replacement: default
targetLabel: instance
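# NOTE: With regex (.*) the replace action above always matches, so it
# unconditionally writes "default" into the instance label. This gives the
# exporter a stable series identity that survives pod restarts and
# rescheduling.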
additionalPodMonitors:
- name: ethtool-exporter
jobLabel: job
selector:
matchLabels:
application: ethtool-exporter
podMetricsEndpoints:
- port: metrics
path: /metrics
relabelings:
- sourceLabels: ["__meta_kubernetes_pod_node_name"]
targetLabel: instance
- action: labeldrop
regex: ^(container|endpoint|namespace|pod)$
- name: ipmi-exporter
jobLabel: job
selector:
matchLabels:
application: ipmi-exporter
podMetricsEndpoints:
- port: metrics
path: /metrics
interval: 60s
relabelings:
- sourceLabels: ["__meta_kubernetes_pod_node_name"]
targetLabel: instance
- action: labeldrop
regex: ^(container|endpoint|namespace|pod)$
- name: percona-xtradb-pxc
jobLabel: app.kubernetes.io/component
namespaceSelector:
matchNames:
- openstack
selector:
matchLabels:
app.kubernetes.io/component: pxc
app.kubernetes.io/instance: percona-xtradb
podMetricsEndpoints:
- port: metrics
path: /metrics
relabelings:
- sourceLabels: ["__meta_kubernetes_pod_name"]
targetLabel: "instance"
- action: "labeldrop"
regex: "^(container|endpoint|namespace|pod|service)$"
- name: rabbitmq
jobLabel: app.kubernetes.io/component
namespaceSelector:
matchNames:
- openstack
selector:
matchLabels:
app.kubernetes.io/component: rabbitmq
podMetricsEndpoints:
- port: prometheus
path: /metrics
relabelings:
- sourceLabels: ["__meta_kubernetes_pod_name"]
targetLabel: "instance"
- action: "labeldrop"
regex: "^(container|endpoint|namespace|pod|service)$"
prometheusOperator:
admissionWebhooks:
patch:
nodeSelector:
openstack-control-plane: enabled
serviceMonitor:
relabelings:
- sourceLabels: ["__meta_kubernetes_pod_name"]
targetLabel: "instance"
- action: "labeldrop"
regex: "^(container|endpoint|namespace|pod|service)$"
nodeSelector:
openstack-control-plane: enabled
prometheus-node-exporter:
extraArgs:
- --collector.diskstats.ignored-devices=^(ram|loop|nbd|fd|(h|s|v|xv)d[a-z]|nvme\\d+n\\d+p)\\d+$
- --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|fuse.squashfuse_ll|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$
- --collector.filesystem.mount-points-exclude=^/(dev|proc|run/credentials/.+|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+|var/lib/kubelet/plugins/kubernetes.io/csi/.+|run/containerd/.+)($|/)
- --collector.netclass.ignored-devices=^(lxc|cilium_|qbr|qvb|qvo|tap|ovs-system|br|tbr|gre_sys).*$
- --collector.netdev.device-exclude=^(lxc|cilium_|qbr|qvb|qvo|tap|ovs-system|br|tbr|gre_sys).*$
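# NOTE: The netclass/netdev exclusions above filter out virtual interfaces
# created per-VM by Neutron/OVS (qbr*, qvb*, qvo*, tap*, ovs-system) and by
# CNI plugins (lxc*, cilium_*), which would otherwise generate a large number
# of short-lived interface series on hypervisors.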
prometheus:
monitor:
relabelings:
- sourceLabels: ["__meta_kubernetes_pod_node_name"]
targetLabel: "instance"
- action: "labeldrop"
regex: "^(container|endpoint|namespace|pod|service)$"
additionalPrometheusRulesMap:
ceph: "{{ lookup('ansible.builtin.file', 'prometheus_alerts.yml') | from_yaml }}"
coredns:
groups:
- name: coredns
rules:
- alert: CoreDNSDown
expr: absent(up{job="coredns"} == 1)
for: 15m
labels:
severity: critical
- alert: CoreDNSLatencyHigh
expr: histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="coredns"}[5m])) by(server, zone, le)) > 4
for: 10m
labels:
severity: critical
- alert: CoreDNSErrorsHigh
expr:
sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[5m]))
/
sum(rate(coredns_dns_responses_total{job="coredns"}[5m])) > 0.01
for: 10m
labels:
severity: warning
- alert: CoreDNSErrorsHigh
expr:
sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[5m]))
/
sum(rate(coredns_dns_responses_total{job="coredns"}[5m])) > 0.03
for: 10m
labels:
severity: critical
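# NOTE: The paired CoreDNSErrorsHigh alerts implement tiered severities from
# the same SERVFAIL ratio: above 1% fires as a warning, above 3% as critical.
# The forward-error alerts below follow the same pattern against upstream
# resolvers.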
- name: coredns_forward
rules:
- alert: CoreDNSForwardLatencyHigh
expr: histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="coredns"}[5m])) by(to, le)) > 4
for: 10m
labels:
severity: critical
- alert: CoreDNSForwardErrorsHigh
expr:
sum(rate(coredns_forward_responses_total{job="coredns",rcode="SERVFAIL"}[5m]))
/
sum(rate(coredns_forward_responses_total{job="coredns"}[5m])) > 0.01
for: 10m
labels:
severity: warning
- alert: CoreDNSForwardErrorsHigh
expr:
sum(rate(coredns_forward_responses_total{job="coredns",rcode="SERVFAIL"}[5m]))
/
sum(rate(coredns_forward_responses_total{job="coredns"}[5m])) > 0.03
for: 10m
labels:
severity: critical
- alert: CoreDNSForwardHealthcheckFailureCount
expr: sum(rate(coredns_forward_healthcheck_failures_total{job="coredns"}[5m])) by (to) > 0
for: 2m
labels:
severity: warning
- alert: CoreDNSForwardHealthcheckBrokenCount
expr: sum(rate(coredns_forward_healthcheck_broken_total{job="coredns"}[5m])) > 0
for: 2m
labels:
severity: critical
ethtool-exporter:
groups:
- name: rules
rules:
- alert: EthernetReceiveDiscards
expr: rate(node_net_ethtool{type="rx_discards"}[1m]) > 0
labels:
severity: warning
ipmi-exporter:
groups:
- name: rules
rules:
- alert: IpmiCollectorDown
expr: ipmi_up == 0
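# NOTE: The groups below map ipmi_exporter's state encoding directly onto
# alert severities: a state value of 1 means the sensor reports a warning
# condition and 2 means critical (0 is nominal).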
- name: collectors-state-warning
rules:
- alert: IpmiCurrent
expr: ipmi_current_state == 1
labels:
severity: warning
- alert: IpmiFanSpeed
expr: ipmi_fan_speed_state == 1
labels:
severity: warning
- alert: IpmiPower
expr: ipmi_power_state == 1
labels:
severity: warning
- alert: IpmiSensor
expr: ipmi_sensor_state == 1
labels:
severity: warning
- alert: IpmiTemperature
expr: ipmi_temperature_state == 1
labels:
severity: warning
- alert: IpmiVoltage
expr: ipmi_voltage_state == 1
labels:
severity: warning
- name: collectors-state-critical
rules:
- alert: IpmiCurrent
expr: ipmi_current_state == 2
labels:
severity: critical
- alert: IpmiFanSpeed
expr: ipmi_fan_speed_state == 2
labels:
severity: critical
- alert: IpmiPower
expr: ipmi_power_state == 2
labels:
severity: critical
- alert: IpmiSensor
expr: ipmi_sensor_state == 2
labels:
severity: critical
- alert: IpmiTemperature
expr: ipmi_temperature_state == 2
labels:
severity: critical
- alert: IpmiVoltage
expr: ipmi_voltage_state == 2
labels:
severity: critical
memcached:
groups:
- name: memcached
rules:
- alert: MemcachedDown
expr: memcached_up == 0
for: 5m
labels:
severity: critical
- alert: MemcachedConnectionLimitApproaching
expr: (memcached_current_connections / memcached_max_connections * 100) > 80
for: 5m
labels:
severity: warning
- alert: MemcachedConnectionLimitApproaching
expr: (memcached_current_connections / memcached_max_connections * 100) > 95
for: 5m
labels:
severity: critical
node-exporter-local:
groups:
- name: node
rules:
- alert: NodeHighLoadAverage
expr: node_load5 / count(node_cpu_seconds_total{mode="system"}) without (cpu, mode) > 1.5
for: 30m
labels:
severity: warning
- alert: NodeHighMemoryUsage
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 2.5
for: 2m
labels:
severity: critical
- alert: NodeHighCpuUsage
expr: sum by(instance)(irate(node_cpu_seconds_total{mode='idle'}[5m])) < 1
for: 2m
labels:
severity: warning
- alert: NodeLowEntropy
expr: node_entropy_available_bits < 1000
for: 5m
labels:
severity: warning
- name: softnet
rules:
- alert: NodeSoftNetTimesSqueezed
expr: sum(rate(node_softnet_times_squeezed_total[1m])) by (instance) > 10
for: 10m
labels:
severity: warning
- alert: NodeSoftNetDrops
expr: sum(rate(node_softnet_dropped_total[1m])) by (instance) != 0
for: 1m
labels:
severity: critical
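# NOTE: node_softnet_times_squeezed_total counts how often a CPU exhausted
# its netdev_budget while draining the network backlog; sustained squeezes
# suggest raising net.core.netdev_budget. node_softnet_dropped_total counts
# packets dropped because the per-CPU backlog queue was full, which is
# host-level packet loss and therefore pages quickly.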
openstack-exporter:
groups:
- name: cinder
rules:
- alert: CinderAgentDown
annotations:
description: |
'{% raw %}The service {{ $labels.exported_service }} running
on {{ $labels.hostname }} is being reported as down.{% endraw %}'
summary: |
'{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
down{% endraw %}'
expr: |
openstack_cinder_agent_state != 1
labels:
severity: warning
- alert: CinderAgentDown
annotations:
description: |
'{% raw %}The service {{ $labels.exported_service }} running on
{{ $labels.hostname }} is being reported as down for 5 minutes.
This can affect volume operations, so it must be resolved as
quickly as possible.{% endraw %}'
summary: |
'{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
down{% endraw %}'
expr: |
openstack_cinder_agent_state != 1
for: 5m
labels:
severity: critical
- alert: CinderAgentDisabled
annotations:
description: |
'{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
has been disabled for 60 minutes. This can affect volume operations, so it must be
resolved as quickly as possible.{% endraw %}'
summary: |
'{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
disabled{% endraw %}'
expr: |
openstack_cinder_agent_state{adminState!="enabled"}
for: 1h
labels:
severity: warning
- alert: CinderVolumeInError
annotations:
description: |
'{% raw %}The volume {{ $labels.id }} has been in ERROR state for over 24 hours.
It must be cleaned up or removed in order to provide a consistent customer
experience.{% endraw %}'
summary: |
'{% raw %}[{{ $labels.id }}] Volume in ERROR state{% endraw %}'
expr: |
openstack_cinder_volume_status{status=~"error.*"}
for: 24h
labels:
severity: warning
- name: neutron
rules:
- alert: NeutronAgentDown
annotations:
description: |
'{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
is being reported as down.{% endraw %}'
summary: |
'{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
down{% endraw %}'
expr: |
openstack_neutron_agent_state != 1
labels:
severity: warning
- alert: NeutronAgentDown
annotations:
description: |
'{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
is being reported as down for 5 minutes. This can affect network operations, so it
must be resolved as quickly as possible.{% endraw %}'
summary: |
'{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
down{% endraw %}'
expr: |
openstack_neutron_agent_state != 1
for: 5m
labels:
severity: critical
- alert: NeutronAgentDisabled
annotations:
description: |
'{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
has been disabled for 60 minutes. This can affect network operations, so it must be
resolved as quickly as possible.{% endraw %}'
summary: |
'{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
disabled{% endraw %}'
expr: |
openstack_neutron_agent_state{adminState!="up"}
for: 1h
labels:
severity: warning
- alert: NeutronBindingFailedPorts
annotations:
description: |
'{% raw %}The NIC {{ $labels.mac_address }} of {{ $labels.device_owner }}
has a port with a failed binding.{% endraw %}'
summary: |
'{% raw %}[{{ $labels.device_owner }}] {{ $labels.mac_address }}
binding failed{% endraw %}'
expr: |
openstack_neutron_port{binding_vif_type="binding_failed"} != 0
labels:
severity: warning
- alert: NeutronNetworkOutOfIPs
annotations:
description: |
'{% raw %}The subnet {{ $labels.subnet_name }} within {{ $labels.network_name }}
is currently at {{ $value }}% utilization. If the IP addresses run out, it will
impact the provisioning of new ports.{% endraw %}'
summary: |
'{% raw %}[{{ $labels.network_name }}] {{ $labels.subnet_name }}
running out of IPs{% endraw %}'
expr: |
sum by (network_id) (openstack_neutron_network_ip_availabilities_used{project_id!=""}) / sum by (network_id)
(openstack_neutron_network_ip_availabilities_total{project_id!=""}) * 100 > 80
labels:
severity: warning
- name: nova
rules:
- alert: NovaAgentDown
annotations:
description: |
'{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }}
is being reported as down.{% endraw %}'
summary: |
'{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
down{% endraw %}'
expr: |
openstack_nova_agent_state != 1
labels:
severity: warning
- alert: NovaAgentDown
annotations:
description: |
'{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} is
being reported as down for 5 minutes. This can affect compute operations, so it must be
resolved as quickly as possible.{% endraw %}'
summary: |
'{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
down{% endraw %}'
expr: |
openstack_nova_agent_state != 1
for: 5m
labels:
severity: critical
- alert: NovaAgentDisabled
annotations:
description: |
'{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} has been
disabled for 60 minutes. This can affect compute operations, so it must be resolved as quickly
as possible.{% endraw %}'
summary: |
'{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }}
disabled{% endraw %}'
expr: |
openstack_nova_agent_state{adminState!="enabled"}
for: 1h
labels:
severity: warning
- alert: NovaInstanceInError
annotations:
description: |
'{% raw %}The instance {{ $labels.id }} has been in ERROR state for over 24 hours.
It must be cleaned up or removed in order to provide a consistent customer
experience.{% endraw %}'
summary: "{% raw %}[{{ $labels.id }}] Instance in ERROR state{% endraw %}"
expr: |
openstack_nova_server_status{status="ERROR"}
for: 24h
labels:
severity: warning
- alert: NovaFailureRisk
annotations:
description: |
'{% raw %}The cloud capacity will be at {{ $value }}% in the event of the failure of
a single hypervisor, which puts the cloud at risk of not being able to recover should
any hypervisor failures occur. Please ensure that an adequate amount of infrastructure
is assigned to this deployment to prevent this.{% endraw %}'
summary: "{% raw %}[nova] Failure risk{% endraw %}"
expr: |
(sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) - max(openstack_nova_memory_used_bytes))
/ sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) * 100 < 0.25
for: 6h
labels:
severity: warning
- alert: NovaCapacity
annotations:
description: |
'{% raw %}The cloud capacity is currently at {{ $value }}%, which means there is a
risk of running out of capacity due to the lead time required to add new nodes.
Please ensure that an adequate amount of infrastructure is assigned to this deployment
to prevent this.{% endraw %}'
summary: "{% raw %}[nova] Capacity risk{% endraw %}"
expr: |
sum (
openstack_nova_memory_used_bytes
+ on(hostname) group_left(adminState)
(0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"})
) / sum (
openstack_nova_memory_available_bytes
+ on(hostname) group_left(adminState)
(0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"})
) * 100 > 75
for: 6h
labels:
severity: warning
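# NOTE: In the NovaCapacity expression above, adding
# 0 * openstack_nova_agent_state{...,adminState="enabled"} with
# on(hostname) group_left is a join trick: multiplying by zero contributes
# nothing to the sums, but it restricts both memory aggregates to hosts
# whose nova-compute service is enabled.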
percona-xtradb-pxc:
groups:
# TODO: basic rules
- name: general
rules:
- alert: MySQLDown
expr: mysql_up != 1
for: 5m
labels:
severity: critical
- alert: MysqlTooManyConnections
expr: max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80
for: 2m
labels:
severity: warning
- alert: MysqlHighThreadsRunning
expr: max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60
for: 2m
labels:
severity: warning
- alert: MysqlSlowQueries
expr: increase(mysql_global_status_slow_queries[1m]) > 0
for: 2m
labels:
severity: warning
- name: galera
rules:
- alert: MySQLGaleraNotReady
expr: mysql_global_status_wsrep_ready != 1
for: 5m
labels:
severity: critical
- alert: MySQLGaleraOutOfSync
expr: mysql_global_status_wsrep_local_state != 4 and mysql_global_variables_wsrep_desync == 0
for: 5m
labels:
severity: critical
- alert: MySQLGaleraDonorFallingBehind
expr: mysql_global_status_wsrep_local_state == 2 and mysql_global_status_wsrep_local_recv_queue > 100
for: 5m
labels:
severity: warning
- alert: MySQLReplicationNotRunning
expr: mysql_slave_status_slave_io_running == 0 or mysql_slave_status_slave_sql_running == 0
for: 2m
labels:
severity: critical
- alert: MySQLReplicationLag
expr: (instance:mysql_slave_lag_seconds > 30) and on(instance) (predict_linear(instance:mysql_slave_lag_seconds[5m], 60 * 2) > 0)
for: 1m
labels:
severity: critical
- alert: MySQLHeartbeatLag
expr: (instance:mysql_heartbeat_lag_seconds > 30) and on(instance) (predict_linear(instance:mysql_heartbeat_lag_seconds[5m], 60 * 2) > 0)
for: 1m
labels:
severity: critical
- alert: MySQLInnoDBLogWaits
expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
labels:
severity: warning
rabbitmq:
groups:
- name: recording
rules:
- record: rabbitmq:usage:memory
labels:
job: rabbitmq
expr: |
sum without (job) (
rabbitmq_process_resident_memory_bytes
) / sum without (
container,
pod,
job,
namespace,
node,
resource,
uid,
unit
) (
label_replace(
cluster:namespace:pod_memory:active:kube_pod_container_resource_limits,
"instance",
"$1",
"pod",
"(.*)"
)
)
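# NOTE: The recording rule above expresses RabbitMQ resident memory as a
# fraction of the pods' memory limits. label_replace() copies the pod label
# into instance so the limit series joins against
# rabbitmq_process_resident_memory_bytes, whose instance label was set to
# the pod name by the rabbitmq podMonitor relabelings earlier in this file.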
- name: alarms
rules:
- alert: RabbitmqAlarmFreeDiskSpace
expr: rabbitmq_alarms_free_disk_space_watermark == 1
labels:
severity: critical
- alert: RabbitmqAlarmMemoryUsedWatermark
expr: rabbitmq_alarms_memory_used_watermark == 1
labels:
severity: critical
- alert: RabbitmqAlarmFileDescriptorLimit
expr: rabbitmq_alarms_file_descriptor_limit == 1
labels:
severity: critical
- name: limits
rules:
- alert: RabbitmqMemoryHigh
expr: rabbitmq:usage:memory > 0.80
labels:
severity: warning
- alert: RabbitmqMemoryHigh
expr: rabbitmq:usage:memory > 0.95
labels:
severity: critical
- alert: RabbitmqFileDescriptorsUsage
expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds > 0.80
labels:
severity: warning
- alert: RabbitmqFileDescriptorsUsage
expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds > 0.95
labels:
severity: critical
- alert: RabbitmqTcpSocketsUsage
expr: rabbitmq_process_open_tcp_sockets / rabbitmq_process_max_tcp_sockets > 0.80
labels:
severity: warning
- alert: RabbitmqTcpSocketsUsage
expr: rabbitmq_process_open_tcp_sockets / rabbitmq_process_max_tcp_sockets > 0.95
labels:
severity: critical
- name: msgs
rules:
- alert: RabbitmqUnackedMessages
expr: sum(rabbitmq_queue_messages_unacked) by (queue) > 1000
for: 5m
labels:
severity: warning
- alert: RabbitmqUnackedMessages
expr: sum(rabbitmq_queue_messages_unacked) by (queue) > 1000
for: 1h
labels:
severity: critical
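#
# NOTE: A sketch of how these defaults are typically consumed, assuming the
# role merges operator-supplied overrides on top of them (the variable name
# and override variable here are illustrative):
#
#   kube_prometheus_stack_helm_values: "{{ _kube_prometheus_stack_values | combine(kube_prometheus_stack_values | default({}), recursive=True) }}"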