| # Copyright (c) 2022 VEXXHOST, Inc. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); you may |
| # not use this file except in compliance with the License. You may obtain |
| # a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
| # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
| # License for the specific language governing permissions and limitations |
| # under the License. |
| |
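# Default Helm values for the kube-prometheus-stack chart. The leading
# underscore marks these as internal role defaults, intended to be combined
# with operator-supplied overrides rather than edited in place.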
| _kube_prometheus_stack_values: |
| defaultRules: |
| disabled: |
| # NOTE(mnaser): https://github.com/prometheus-community/helm-charts/issues/144 |
| # https://github.com/openshift/cluster-monitoring-operator/issues/248 |
| etcdHighNumberOfFailedGRPCRequests: true |
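  # A single relabeling pattern repeats throughout this file: copy the pod
  # name (or node name, for host-level targets) into "instance", then drop
  # the discovery-added labels (container/endpoint/namespace/pod/service)
  # that would otherwise duplicate that information on every series.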
| alertmanager: |
| serviceMonitor: |
| relabelings: |
| - sourceLabels: ["__meta_kubernetes_pod_name"] |
| targetLabel: "instance" |
| - action: "labeldrop" |
| regex: "^(container|endpoint|namespace|pod|service)$" |
| alertmanagerSpec: |
| storage: |
| volumeClaimTemplate: |
| spec: |
| storageClassName: general |
| accessModes: ["ReadWriteOnce"] |
| resources: |
| requests: |
| storage: 40Gi |
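      # Pin Alertmanager (and the other stack components below) to nodes
      # labelled as OpenStack control plane hosts.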
| nodeSelector: |
| openstack-control-plane: enabled |
| grafana: |
| serviceMonitor: |
| relabelings: |
| - sourceLabels: ["__meta_kubernetes_pod_name"] |
| targetLabel: "instance" |
| - action: "labeldrop" |
| regex: "^(container|endpoint|namespace|pod|service)$" |
| nodeSelector: |
| openstack-control-plane: enabled |
| kubeApiServer: |
| serviceMonitor: |
| relabelings: |
| - sourceLabels: ["__meta_kubernetes_pod_node_name"] |
| targetLabel: "instance" |
| - action: "labeldrop" |
| regex: "^(container|endpoint|namespace|pod|service)$" |
| kubelet: |
| serviceMonitor: |
| cAdvisorRelabelings: |
| - sourceLabels: [__metrics_path__] |
| targetLabel: metrics_path |
| - sourceLabels: ["node"] |
| targetLabel: "instance" |
| - action: "labeldrop" |
| regex: "^(container|endpoint|namespace|node|service)$" |
| probesRelabelings: |
| - sourceLabels: [__metrics_path__] |
| targetLabel: metrics_path |
| - sourceLabels: ["node"] |
| targetLabel: "instance" |
| - action: "labeldrop" |
| regex: "^(container|endpoint|namespace|node|service)$" |
| relabelings: |
| - sourceLabels: [__metrics_path__] |
| targetLabel: metrics_path |
| - sourceLabels: ["node"] |
| targetLabel: "instance" |
| - action: "labeldrop" |
| regex: "^(container|endpoint|namespace|node|service)$" |
| kubeControllerManager: |
| serviceMonitor: |
| relabelings: |
| - sourceLabels: ["__meta_kubernetes_pod_node_name"] |
| targetLabel: "instance" |
| - action: "labeldrop" |
| regex: "^(container|endpoint|namespace|pod|service)$" |
| coreDns: |
| serviceMonitor: |
| relabelings: |
| - sourceLabels: ["__meta_kubernetes_pod_name"] |
| targetLabel: "instance" |
| - regex: "^(container|endpoint|namespace|pod|service)$" |
| action: "labeldrop" |
| kubeEtcd: |
| serviceMonitor: |
| scheme: https |
| serverName: localhost |
| insecureSkipVerify: false |
| caFile: /etc/prometheus/secrets/kube-prometheus-stack-etcd-client-cert/ca.crt |
| certFile: /etc/prometheus/secrets/kube-prometheus-stack-etcd-client-cert/healthcheck-client.crt |
| keyFile: /etc/prometheus/secrets/kube-prometheus-stack-etcd-client-cert/healthcheck-client.key |
| relabelings: |
| - sourceLabels: ["__meta_kubernetes_pod_node_name"] |
| targetLabel: "instance" |
| - action: "labeldrop" |
| regex: "^(container|endpoint|namespace|pod|service)$" |
| kubeScheduler: |
| service: |
| port: 10259 |
| targetPort: 10259 |
| serviceMonitor: |
| https: true |
| insecureSkipVerify: true |
| relabelings: |
| - sourceLabels: ["__meta_kubernetes_pod_node_name"] |
| targetLabel: "instance" |
| - action: "labeldrop" |
| regex: "^(container|endpoint|namespace|pod|service)$" |
| kubeProxy: |
| serviceMonitor: |
| relabelings: |
| - sourceLabels: ["__meta_kubernetes_pod_node_name"] |
| targetLabel: "instance" |
| - action: "labeldrop" |
| regex: "^(container|endpoint|namespace|pod|service)$" |
| kube-state-metrics: |
| prometheus: |
| monitor: |
| relabelings: |
| - sourceLabels: ["__meta_kubernetes_pod_name"] |
| targetLabel: "instance" |
| - action: "labeldrop" |
| regex: "^(container|endpoint|namespace|pod|service)$" |
| nodeSelector: |
| openstack-control-plane: enabled |
| prometheus: |
| serviceMonitor: |
| relabelings: |
| - sourceLabels: ["__meta_kubernetes_pod_name"] |
| targetLabel: "instance" |
| - action: "labeldrop" |
| regex: "^(container|endpoint|namespace|pod|service)$" |
| prometheusSpec: |
| nodeSelector: |
| openstack-control-plane: enabled |
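      # Secrets listed here are mounted into the Prometheus pods at
      # /etc/prometheus/secrets/<secret-name>/, matching the kubeEtcd
      # certificate paths above.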
| secrets: |
| - kube-prometheus-stack-etcd-client-cert |
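    # ServiceMonitors for targets deployed outside this chart (Ceph, CoreDNS,
    # memcached and the OpenStack exporter in the openstack namespace).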
| additionalServiceMonitors: |
| - name: ceph |
| jobLabel: application |
| selector: |
| matchLabels: |
| application: ceph |
| namespaceSelector: |
| matchNames: |
| - openstack |
| endpoints: |
| - port: metrics |
| honorLabels: true |
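            # Pin a static cluster="ceph" label on every scraped series;
            # the (.*) regex always matches.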
| relabelings: |
| - action: replace |
| regex: (.*) |
| replacement: ceph |
| targetLabel: cluster |
| - action: "labeldrop" |
| regex: "^(container|endpoint|namespace|pod|service)$" |
| - name: coredns |
| jobLabel: app.kubernetes.io/name |
| namespaceSelector: |
| matchNames: |
| - openstack |
| selector: |
| matchLabels: |
| app.kubernetes.io/name: coredns |
| app.kubernetes.io/component: metrics |
| endpoints: |
| - port: "metrics" |
| path: "/metrics" |
| relabelings: |
| - sourceLabels: ["__meta_kubernetes_pod_label_application"] |
| targetLabel: "application" |
| - sourceLabels: ["__meta_kubernetes_pod_name"] |
| targetLabel: "instance" |
| - action: "labeldrop" |
| regex: "^(container|endpoint|namespace|pod|service)$" |
| - name: memcached |
| jobLabel: application |
| namespaceSelector: |
| matchNames: |
| - openstack |
| selector: |
| matchLabels: |
| application: memcached |
| component: server |
| endpoints: |
| - port: "metrics" |
| path: "/metrics" |
| relabelings: |
| - sourceLabels: ["__meta_kubernetes_pod_name"] |
| targetLabel: "instance" |
| - action: "labeldrop" |
| regex: "^(container|endpoint|namespace|pod|service)$" |
| - name: openstack-exporter |
| jobLabel: jobLabel |
| namespaceSelector: |
| matchNames: |
| - openstack |
| selector: |
| matchLabels: |
| application: openstack-exporter |
| endpoints: |
| - interval: 1m |
| scrapeTimeout: 30s |
| port: metrics |
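            # Collapse "instance" to a static value: the exporter reports on
            # cloud-level APIs, so the scraping pod's identity is noise.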
| relabelings: |
| - action: replace |
| regex: (.*) |
| replacement: default |
| targetLabel: instance |
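    # PodMonitors scrape pods directly, with no Service in front; they are
    # used here for exporters that have no Service of their own.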
| additionalPodMonitors: |
| - name: ethtool-exporter |
| jobLabel: job |
| selector: |
| matchLabels: |
| application: ethtool-exporter |
| podMetricsEndpoints: |
| - port: metrics |
| path: /metrics |
| relabelings: |
| - sourceLabels: ["__meta_kubernetes_pod_node_name"] |
| targetLabel: instance |
| - action: labeldrop |
| regex: ^(container|endpoint|namespace|pod)$ |
| - name: ipmi-exporter |
| jobLabel: job |
| selector: |
| matchLabels: |
| application: ipmi-exporter |
| podMetricsEndpoints: |
| - port: metrics |
| path: /metrics |
| interval: 60s |
| relabelings: |
| - sourceLabels: ["__meta_kubernetes_pod_node_name"] |
| targetLabel: instance |
| - action: labeldrop |
| regex: ^(container|endpoint|namespace|pod)$ |
| - name: percona-xtradb-pxc |
| jobLabel: app.kubernetes.io/component |
| namespaceSelector: |
| matchNames: |
| - openstack |
| selector: |
| matchLabels: |
| app.kubernetes.io/component: pxc |
| app.kubernetes.io/instance: percona-xtradb |
| podMetricsEndpoints: |
| - port: metrics |
| path: /metrics |
| relabelings: |
| - sourceLabels: ["__meta_kubernetes_pod_name"] |
| targetLabel: "instance" |
| - action: "labeldrop" |
| regex: "^(container|endpoint|namespace|pod|service)$" |
| - name: rabbitmq |
| jobLabel: app.kubernetes.io/component |
| namespaceSelector: |
| matchNames: |
| - openstack |
| selector: |
| matchLabels: |
| app.kubernetes.io/component: rabbitmq |
| podMetricsEndpoints: |
| - port: prometheus |
| path: /metrics |
| relabelings: |
| - sourceLabels: ["__meta_kubernetes_pod_name"] |
| targetLabel: "instance" |
| - action: "labeldrop" |
| regex: "^(container|endpoint|namespace|pod|service)$" |
| prometheusOperator: |
| admissionWebhooks: |
| patch: |
| nodeSelector: |
| openstack-control-plane: enabled |
| serviceMonitor: |
| relabelings: |
| - sourceLabels: ["__meta_kubernetes_pod_name"] |
| targetLabel: "instance" |
| - action: "labeldrop" |
| regex: "^(container|endpoint|namespace|pod|service)$" |
| nodeSelector: |
| openstack-control-plane: enabled |
| prometheus-node-exporter: |
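    # Exclude virtual and ephemeral devices from collection. The netclass /
    # netdev patterns cover CNI interfaces (such as lxc* and cilium_*) and
    # the Neutron-managed taps, bridges and veth pairs (tap*, qbr*, qvb*,
    # qvo*) that appear and disappear with workloads.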
| extraArgs: |
| - --collector.diskstats.ignored-devices=^(ram|loop|nbd|fd|(h|s|v|xv)d[a-z]|nvme\\d+n\\d+p)\\d+$ |
| - --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|fuse.squashfuse_ll|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$ |
| - --collector.filesystem.mount-points-exclude=^/(dev|proc|run/credentials/.+|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+|var/lib/kubelet/plugins/kubernetes.io/csi/.+|run/containerd/.+)($|/) |
| - --collector.netclass.ignored-devices=^(lxc|cilium_|qbr|qvb|qvo|tap|ovs-system|br|tbr|gre_sys).*$ |
| - --collector.netdev.device-exclude=^(lxc|cilium_|qbr|qvb|qvo|tap|ovs-system|br|tbr|gre_sys).*$ |
| prometheus: |
| monitor: |
| relabelings: |
| - sourceLabels: ["__meta_kubernetes_pod_node_name"] |
| targetLabel: "instance" |
| - action: "labeldrop" |
| regex: "^(container|endpoint|namespace|pod|service)$" |
| additionalPrometheusRulesMap: |
| ceph: "{{ lookup('ansible.builtin.file', 'prometheus_alerts.yml') | from_yaml }}" |
| coredns: |
| groups: |
| - name: coredns |
| rules: |
| - alert: CoreDNSDown |
| expr: absent(up{job="coredns"} == 1) |
| for: 15m |
| labels: |
| severity: critical |
| - alert: CoreDNSLatencyHigh |
| expr: histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="coredns"}[5m])) by(server, zone, le)) > 4 |
| for: 10m |
| labels: |
| severity: critical |
| - alert: CoreDNSErrorsHigh |
| expr: |
| sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[5m])) |
| / |
| sum(rate(coredns_dns_responses_total{job="coredns"}[5m])) > 0.01 |
| for: 10m |
| labels: |
| severity: warning |
| - alert: CoreDNSErrorsHigh |
| expr: |
| sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[5m])) |
| / |
| sum(rate(coredns_dns_responses_total{job="coredns"}[5m])) > 0.03 |
| for: 10m |
| labels: |
| severity: critical |
| - name: coredns_forward |
| rules: |
| - alert: CoreDNSForwardLatencyHigh |
| expr: histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="coredns"}[5m])) by(to, le)) > 4 |
| for: 10m |
| labels: |
| severity: critical |
| - alert: CoreDNSForwardErrorsHigh |
| expr: |
| sum(rate(coredns_forward_responses_total{job="coredns",rcode="SERVFAIL"}[5m])) |
| / |
| sum(rate(coredns_forward_responses_total{job="coredns"}[5m])) > 0.01 |
| for: 10m |
| labels: |
| severity: warning |
| - alert: CoreDNSForwardErrorsHigh |
| expr: |
| sum(rate(coredns_forward_responses_total{job="coredns",rcode="SERVFAIL"}[5m])) |
| / |
| sum(rate(coredns_forward_responses_total{job="coredns"}[5m])) > 0.03 |
| for: 10m |
| labels: |
| severity: critical |
| - alert: CoreDNSForwardHealthcheckFailureCount |
| expr: sum(rate(coredns_forward_healthcheck_failures_total{job="coredns"}[5m])) by (to) > 0 |
| for: 2m |
| labels: |
| severity: warning |
| - alert: CoreDNSForwardHealthcheckBrokenCount |
| expr: sum(rate(coredns_forward_healthcheck_broken_total{job="coredns"}[5m])) > 0 |
| for: 2m |
| labels: |
| severity: critical |
| ethtool-exporter: |
| groups: |
| - name: rules |
| rules: |
| - alert: EthernetReceiveDiscards |
| expr: rate(node_net_ethtool{type="rx_discards"}[1m]) > 0 |
| labels: |
| severity: warning |
| ipmi-exporter: |
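      # ipmi_exporter reports sensor state as 0 (nominal), 1 (warning) or
      # 2 (critical); the two rule groups below map those onto severities.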
| groups: |
| - name: rules |
| rules: |
| - alert: IpmiCollectorDown |
| expr: ipmi_up == 0 |
| - name: collectors-state-warning |
| rules: |
| - alert: IpmiCurrent |
| expr: ipmi_current_state == 1 |
| labels: |
| severity: warning |
| - alert: IpmiFanSpeed |
| expr: ipmi_fan_speed_state == 1 |
| labels: |
| severity: warning |
| - alert: IpmiPower |
| expr: ipmi_power_state == 1 |
| labels: |
| severity: warning |
| - alert: IpmiSensor |
| expr: ipmi_sensor_state == 1 |
| labels: |
| severity: warning |
| - alert: IpmiTemperature |
| expr: ipmi_temperature_state == 1 |
| labels: |
| severity: warning |
| - alert: IpmiVoltage |
| expr: ipmi_voltage_state == 1 |
| labels: |
| severity: warning |
| - name: collectors-state-critical |
| rules: |
| - alert: IpmiCurrent |
| expr: ipmi_current_state == 2 |
| labels: |
| severity: critical |
| - alert: IpmiFanSpeed |
| expr: ipmi_fan_speed_state == 2 |
| labels: |
| severity: critical |
| - alert: IpmiPower |
| expr: ipmi_power_state == 2 |
| labels: |
| severity: critical |
| - alert: IpmiSensor |
| expr: ipmi_sensor_state == 2 |
| labels: |
| severity: critical |
| - alert: IpmiTemperature |
| expr: ipmi_temperature_state == 2 |
| labels: |
| severity: critical |
| - alert: IpmiVoltage |
| expr: ipmi_voltage_state == 2 |
| labels: |
| severity: critical |
| memcached: |
| groups: |
| - name: memcached |
| rules: |
| - alert: MemcachedDown |
| expr: memcached_up == 0 |
| for: 5m |
| labels: |
| severity: critical |
| - alert: MemcachedConnectionLimitApproaching |
| expr: (memcached_current_connections / memcached_max_connections * 100) > 80 |
| for: 5m |
| labels: |
| severity: warning |
| - alert: MemcachedConnectionLimitApproaching |
| expr: (memcached_current_connections / memcached_max_connections * 100) > 95 |
| for: 5m |
| labels: |
| severity: critical |
| node-exporter-local: |
| groups: |
| - name: node |
| rules: |
| - alert: NodeHighLoadAverage |
| expr: node_load5 / count(node_cpu_seconds_total{mode="system"}) without (cpu, mode) > 1.5 |
| for: 30m |
| labels: |
| severity: warning |
| - alert: NodeHighMemoryUsage |
| expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 2.5 |
| for: 2m |
| labels: |
| severity: critical |
| - alert: NodeHighCpuUsage |
| expr: sum by(instance)(irate(node_cpu_seconds_total{mode='idle'}[5m])) < 1 |
| for: 2m |
| labels: |
| severity: warning |
| - alert: NodeLowEntropy |
| expr: node_entropy_available_bits < 1000 |
| for: 5m |
| labels: |
| severity: warning |
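        # "Squeezed" increments when a CPU exhausts its NAPI budget with
        # packets still queued; softnet drops mean the per-CPU backlog
        # overflowed and packets were lost.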
| - name: softnet |
| rules: |
| - alert: NodeSoftNetTimesSqueezed |
| expr: sum(rate(node_softnet_times_squeezed_total[1m])) by (instance) > 10 |
| for: 10m |
| labels: |
| severity: warning |
| - alert: NodeSoftNetDrops |
| expr: sum(rate(node_softnet_dropped_total[1m])) by (instance) != 0 |
| for: 1m |
| labels: |
| severity: critical |
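    # The {% raw %} guards below keep Ansible's Jinja2 templating from
    # evaluating Prometheus' own {{ $labels.* }} and {{ $value }} syntax;
    # they are stripped when the variable is rendered.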
| openstack-exporter: |
| groups: |
| - name: cinder |
| rules: |
| - alert: CinderAgentDown |
| annotations: |
| description: | |
| '{% raw %}The service {{ $labels.exported_service }} running |
| on {{ $labels.hostname }} is being reported as down.{% endraw %}' |
| summary: | |
| '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }} |
| down{% endraw %}' |
| expr: | |
| openstack_cinder_agent_state != 1 |
| labels: |
| severity: warning |
| - alert: CinderAgentDown |
| annotations: |
| description: | |
| '{% raw %}The service {{ $labels.exported_service }} running on |
| {{ $labels.hostname }} is being reported as down for 5 minutes. |
| This can affect volume operations so it must be resolved as |
| quickly as possible.{% endraw %}' |
| summary: | |
| '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }} |
| down{% endraw %}' |
| expr: | |
| openstack_cinder_agent_state != 1 |
| for: 5m |
| labels: |
| severity: critical |
| - alert: CinderAgentDisabled |
| annotations: |
| description: | |
| '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} |
| has been disabled for 60 minutes. This can affect volume operations so it must be |
| resolved as quickly as possible.{% endraw %}' |
| summary: | |
| '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }} |
| disabled{% endraw %}' |
| expr: | |
| openstack_cinder_agent_state{adminState!="enabled"} |
| for: 1h |
| labels: |
| severity: warning |
| - alert: CinderVolumeInError |
| annotations: |
| description: | |
| '{% raw %}The volume {{ $labels.id }} has been in ERROR state for over 24 hours. |
| It must be cleaned up or removed in order to provide a consistent customer |
| experience.{% endraw %}' |
| summary: | |
| '{% raw %}[{{ $labels.id }}] Volume in ERROR state{% endraw %}' |
| expr: | |
| openstack_cinder_volume_status{status=~"error.*"} |
| for: 24h |
| labels: |
| severity: warning |
| - name: neutron |
| rules: |
| - alert: NeutronAgentDown |
| annotations: |
| description: | |
| '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} |
| is being reported as down.{% endraw %}' |
| summary: | |
| '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }} |
| down{% endraw %}' |
| expr: | |
| openstack_neutron_agent_state != 1 |
| labels: |
| severity: warning |
| - alert: NeutronAgentDown |
| annotations: |
| description: | |
| '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} |
| is being reported as down for 5 minutes. This can affect network operations so it |
| must be resolved as quickly as possible.{% endraw %}' |
| summary: | |
| '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }} |
| down{% endraw %}' |
| expr: | |
| openstack_neutron_agent_state != 1 |
| for: 5m |
| labels: |
| severity: critical |
| - alert: NeutronAgentDisabled |
| annotations: |
| description: | |
| '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} |
| has been disabled for 60 minutes. This can affect network operations so it must be |
| resolved as quickly as possible.{% endraw %}' |
| summary: | |
| '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }} |
| disabled{% endraw %}' |
| expr: | |
| openstack_neutron_agent_state{adminState!="up"} |
| for: 1h |
| labels: |
| severity: warning |
| - alert: NeutronBindingFailedPorts |
| annotations: |
| description: | |
                  '{% raw %}The port with MAC address {{ $labels.mac_address }} owned by
                  {{ $labels.device_owner }} has failed to bind.{% endraw %}'
| summary: | |
| '{% raw %}[{{ $labels.device_owner }}] {{ $labels.mac_address }} |
| binding failed{% endraw %}' |
| expr: | |
| openstack_neutron_port{binding_vif_type="binding_failed"} != 0 |
| labels: |
| severity: warning |
| - alert: NeutronNetworkOutOfIPs |
| annotations: |
| description: | |
| '{% raw %}The subnet {{ $labels.subnet_name }} within {{ $labels.network_name }} |
| is currently at {{ $value }}% utilization. If the IP addresses run out, it will |
| impact the provisioning of new ports.{% endraw %}' |
| summary: | |
| '{% raw %}[{{ $labels.network_name }}] {{ $labels.subnet_name }} |
| running out of IPs{% endraw %}' |
| expr: | |
| sum by (network_id) (openstack_neutron_network_ip_availabilities_used{project_id!=""}) / sum by (network_id) |
| (openstack_neutron_network_ip_availabilities_total{project_id!=""}) * 100 > 80 |
| labels: |
| severity: warning |
| - name: nova |
| rules: |
| - alert: NovaAgentDown |
| annotations: |
| description: | |
| '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} |
| is being reported as down.{% endraw %}' |
| summary: | |
| '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }} |
| down{% endraw %}' |
| expr: | |
| openstack_nova_agent_state != 1 |
| labels: |
| severity: warning |
| - alert: NovaAgentDown |
| annotations: |
| description: | |
                  '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} is
                  being reported as down for 5 minutes. This can affect compute operations so it must be
                  resolved as quickly as possible.{% endraw %}'
| summary: | |
| '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }} |
| down{% endraw %}' |
| expr: | |
| openstack_nova_agent_state != 1 |
| for: 5m |
| labels: |
| severity: critical |
| - alert: NovaAgentDisabled |
| annotations: |
| description: | |
| '{% raw %}The service {{ $labels.exported_service }} running on {{ $labels.hostname }} has been |
| disabled for 60 minutes. This can affect compute operations so it must be resolved as quickly |
| as possible.{% endraw %}' |
| summary: | |
| '{% raw %}[{{ $labels.hostname }}] {{ $labels.exported_service }} |
| disabled{% endraw %}' |
| expr: | |
| openstack_nova_agent_state{adminState!="enabled"} |
| for: 1h |
| labels: |
| severity: warning |
| - alert: NovaInstanceInError |
| annotations: |
| description: | |
| '{% raw %}The instance {{ $labels.id }} has been in ERROR state for over 24 hours. |
| It must be cleaned up or removed in order to provide a consistent customer |
| experience.{% endraw %}' |
| summary: "{% raw %}[{{ $labels.id }}] Instance in ERROR state{% endraw %}" |
| expr: | |
| openstack_nova_server_status{status="ERROR"} |
| for: 24h |
| labels: |
| severity: warning |
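            # Free-memory headroom remaining if the busiest hypervisor's
            # workload had to be rehomed, as a percentage of current free
            # memory; below 0.25% the cloud cannot absorb one host failure.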
| - alert: NovaFailureRisk |
| annotations: |
| description: | |
                  '{% raw %}The cloud capacity will be at {{ $value }} in the event of the failure of
                  a single hypervisor, which puts the cloud at risk of not being able to recover should
                  any hypervisor failures occur. Please ensure that an adequate amount of infrastructure
                  is assigned to this deployment to prevent this.{% endraw %}'
| summary: "{% raw %}[nova] Failure risk{% endraw %}" |
| expr: | |
| (sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) - max(openstack_nova_memory_used_bytes)) |
| / sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) * 100 < 0.25 |
| for: 6h |
| labels: |
| severity: warning |
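            # The "+ on(hostname) group_left(adminState) (0 * ...)" join adds
            # zero to each series but restricts both sums to hypervisors whose
            # nova-compute service is enabled, so disabled hosts do not count
            # toward capacity.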
| - alert: NovaCapacity |
| annotations: |
| description: | |
                  '{% raw %}The cloud capacity is currently at `{{ $value }}`, which means there is a
                  risk of running out of capacity due to the lead time required to add new nodes.
                  Please ensure that an adequate amount of infrastructure is assigned to this deployment
                  to prevent this.{% endraw %}'
| summary: "{% raw %}[nova] Capacity risk{% endraw %}" |
| expr: | |
| sum ( |
| openstack_nova_memory_used_bytes |
| + on(hostname) group_left(adminState) |
| (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) |
| ) / sum ( |
| openstack_nova_memory_available_bytes |
| + on(hostname) group_left(adminState) |
| (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) |
| ) * 100 > 75 |
| for: 6h |
| labels: |
| severity: warning |
| percona-xtradb-pxc: |
| groups: |
| # TODO: basic rules |
| - name: general |
| rules: |
| - alert: MySQLDown |
| expr: mysql_up != 1 |
| for: 5m |
| labels: |
| severity: critical |
| - alert: MysqlTooManyConnections |
| expr: max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80 |
| for: 2m |
| labels: |
| severity: warning |
| - alert: MysqlHighThreadsRunning |
| expr: max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60 |
| for: 2m |
| labels: |
| severity: warning |
| - alert: MysqlSlowQueries |
| expr: increase(mysql_global_status_slow_queries[1m]) > 0 |
| for: 2m |
| labels: |
| severity: warning |
| - name: galera |
| rules: |
| - alert: MySQLGaleraNotReady |
| expr: mysql_global_status_wsrep_ready != 1 |
| for: 5m |
| labels: |
| severity: critical |
| - alert: MySQLGaleraOutOfSync |
| expr: mysql_global_status_wsrep_local_state != 4 and mysql_global_variables_wsrep_desync == 0 |
| for: 5m |
| labels: |
| severity: critical |
| - alert: MySQLGaleraDonorFallingBehind |
| expr: mysql_global_status_wsrep_local_state == 2 and mysql_global_status_wsrep_local_recv_queue > 100 |
| for: 5m |
| labels: |
| severity: warning |
| - alert: MySQLReplicationNotRunning |
| expr: mysql_slave_status_slave_io_running == 0 or mysql_slave_status_slave_sql_running == 0 |
| for: 2m |
| labels: |
| severity: critical |
| - alert: MySQLReplicationLag |
| expr: (instance:mysql_slave_lag_seconds > 30) and on(instance) (predict_linear(instance:mysql_slave_lag_seconds[5m], 60 * 2) > 0) |
| for: 1m |
| labels: |
| severity: critical |
| - alert: MySQLHeartbeatLag |
| expr: (instance:mysql_heartbeat_lag_seconds > 30) and on(instance) (predict_linear(instance:mysql_heartbeat_lag_seconds[5m], 60 * 2) > 0) |
| for: 1m |
| labels: |
| severity: critical |
| - alert: MySQLInnoDBLogWaits |
| expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10 |
| labels: |
| severity: warning |
| rabbitmq: |
| groups: |
| - name: recording |
| rules: |
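          # Records RabbitMQ resident memory as a fraction of the pod's
          # memory limit; label_replace rewrites "pod" to "instance" so the
          # two sides of the division share labels.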
| - record: rabbitmq:usage:memory |
| labels: |
| job: rabbitmq |
| expr: | |
| sum without (job) ( |
| rabbitmq_process_resident_memory_bytes |
| ) / sum without ( |
| container, |
| pod, |
| job, |
| namespace, |
| node, |
| resource, |
| uid, |
| unit |
| ) ( |
| label_replace( |
| cluster:namespace:pod_memory:active:kube_pod_container_resource_limits, |
| "instance", |
| "$1", |
| "pod", |
| "(.*)" |
| ) |
| ) |
| - name: alarms |
| rules: |
| - alert: RabbitmqAlarmFreeDiskSpace |
| expr: rabbitmq_alarms_free_disk_space_watermark == 1 |
| labels: |
| severity: critical |
| - alert: RabbitmqAlarmMemoryUsedWatermark |
| expr: rabbitmq_alarms_memory_used_watermark == 1 |
| labels: |
| severity: critical |
| - alert: RabbitmqAlarmFileDescriptorLimit |
| expr: rabbitmq_alarms_file_descriptor_limit == 1 |
| labels: |
| severity: critical |
| - name: limits |
| rules: |
| - alert: RabbitmqMemoryHigh |
| expr: rabbitmq:usage:memory > 0.80 |
| labels: |
| severity: warning |
| - alert: RabbitmqMemoryHigh |
| expr: rabbitmq:usage:memory > 0.95 |
| labels: |
| severity: critical |
| - alert: RabbitmqFileDescriptorsUsage |
| expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds > 0.80 |
| labels: |
| severity: warning |
| - alert: RabbitmqFileDescriptorsUsage |
| expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds > 0.95 |
| labels: |
| severity: critical |
| - alert: RabbitmqTcpSocketsUsage |
| expr: rabbitmq_process_open_tcp_sockets / rabbitmq_process_max_tcp_sockets > 0.80 |
| labels: |
| severity: warning |
| - alert: RabbitmqTcpSocketsUsage |
| expr: rabbitmq_process_open_tcp_sockets / rabbitmq_process_max_tcp_sockets > 0.95 |
| labels: |
| severity: critical |
| - name: msgs |
| rules: |
| - alert: RabbitmqUnackedMessages |
            expr: sum(rabbitmq_queue_messages_unacked) by (queue) > 1000
| for: 5m |
| labels: |
| severity: warning |
| - alert: RabbitmqUnackedMessages |
            expr: sum(rabbitmq_queue_messages_unacked) by (queue) > 1000
| for: 1h |
| labels: |
| severity: critical |