# blob: 02c22505675961aff7eecd858ccf1b48dfc74add
# Copyright (c) 2022 VEXXHOST, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# Values mapping for kube-prometheus-stack (per the variable name); presumably
# handed to the Helm release by the role that loads this file -- confirm
# against the consuming task.
#
# Most sections below carry the same two relabeling steps:
#   1. copy a discovery label (pod name or node name) into `instance`;
#   2. drop the target labels matched by the `labeldrop` regex.
_kube_prometheus_stack_values:
  defaultRules:
    disabled:
      # NOTE(mnaser): https://github.com/prometheus-community/helm-charts/issues/144
      #               https://github.com/openshift/cluster-monitoring-operator/issues/248
      etcdHighNumberOfFailedGRPCRequests: true

  alertmanager:
    serviceMonitor:
      relabelings:
        # `instance` becomes the pod name.
        - sourceLabels: ["__meta_kubernetes_pod_name"]
          targetLabel: "instance"
        - action: "labeldrop"
          regex: "^(container|endpoint|namespace|pod|service)$"
    alertmanagerSpec:
      storage:
        volumeClaimTemplate:
          spec:
            storageClassName: general
            accessModes: ["ReadWriteOnce"]
            resources:
              requests:
                storage: 40Gi
      nodeSelector:
        openstack-control-plane: enabled

  grafana:
    serviceMonitor:
      relabelings:
        # `instance` becomes the pod name.
        - sourceLabels: ["__meta_kubernetes_pod_name"]
          targetLabel: "instance"
        - action: "labeldrop"
          regex: "^(container|endpoint|namespace|pod|service)$"
    nodeSelector:
      openstack-control-plane: enabled

  kubeApiServer:
    serviceMonitor:
      relabelings:
        # `instance` becomes the name of the node running the pod.
        - sourceLabels: ["__meta_kubernetes_pod_node_name"]
          targetLabel: "instance"
        - action: "labeldrop"
          regex: "^(container|endpoint|namespace|pod|service)$"

  kubelet:
    serviceMonitor:
      # The three kubelet relabeling lists are intentionally identical: keep
      # the scrape path as `metrics_path`, use the `node` label as `instance`,
      # then drop the redundant labels (note: `node` is dropped here, not
      # `pod` as in the other sections).
      cAdvisorRelabelings:
        - sourceLabels: ["__metrics_path__"]
          targetLabel: "metrics_path"
        - sourceLabels: ["node"]
          targetLabel: "instance"
        - action: "labeldrop"
          regex: "^(container|endpoint|namespace|node|service)$"
      probesRelabelings:
        - sourceLabels: ["__metrics_path__"]
          targetLabel: "metrics_path"
        - sourceLabels: ["node"]
          targetLabel: "instance"
        - action: "labeldrop"
          regex: "^(container|endpoint|namespace|node|service)$"
      relabelings:
        - sourceLabels: ["__metrics_path__"]
          targetLabel: "metrics_path"
        - sourceLabels: ["node"]
          targetLabel: "instance"
        - action: "labeldrop"
          regex: "^(container|endpoint|namespace|node|service)$"

  kubeControllerManager:
    serviceMonitor:
      relabelings:
        # `instance` becomes the name of the node running the pod.
        - sourceLabels: ["__meta_kubernetes_pod_node_name"]
          targetLabel: "instance"
        - action: "labeldrop"
          regex: "^(container|endpoint|namespace|pod|service)$"

  coreDns:
    serviceMonitor:
      relabelings:
        # `instance` becomes the pod name.
        - sourceLabels: ["__meta_kubernetes_pod_name"]
          targetLabel: "instance"
        - action: "labeldrop"
          regex: "^(container|endpoint|namespace|pod|service)$"

  kubeEtcd:
    serviceMonitor:
      scheme: https
      serverName: localhost
      insecureSkipVerify: false
      # Certificate paths live under the directory named after the secret
      # listed in `prometheus.prometheusSpec.secrets` further down.
      caFile: /etc/prometheus/secrets/kube-prometheus-stack-etcd-client-cert/ca.crt
      certFile: /etc/prometheus/secrets/kube-prometheus-stack-etcd-client-cert/healthcheck-client.crt
      keyFile: /etc/prometheus/secrets/kube-prometheus-stack-etcd-client-cert/healthcheck-client.key
      relabelings:
        # `instance` becomes the name of the node running the pod.
        - sourceLabels: ["__meta_kubernetes_pod_node_name"]
          targetLabel: "instance"
        - action: "labeldrop"
          regex: "^(container|endpoint|namespace|pod|service)$"

  kubeScheduler:
    service:
      port: 10259
      targetPort: 10259
    serviceMonitor:
      https: true
      # NOTE(review): certificate verification is skipped for this endpoint;
      # presumably the scheduler serves a self-signed certificate -- confirm.
      insecureSkipVerify: true
      relabelings:
        # `instance` becomes the name of the node running the pod.
        - sourceLabels: ["__meta_kubernetes_pod_node_name"]
          targetLabel: "instance"
        - action: "labeldrop"
          regex: "^(container|endpoint|namespace|pod|service)$"

  kubeProxy:
    serviceMonitor:
      relabelings:
        # `instance` becomes the name of the node running the pod.
        - sourceLabels: ["__meta_kubernetes_pod_node_name"]
          targetLabel: "instance"
        - action: "labeldrop"
          regex: "^(container|endpoint|namespace|pod|service)$"

  kube-state-metrics:
    prometheus:
      monitor:
        relabelings:
          # `instance` becomes the pod name.
          - sourceLabels: ["__meta_kubernetes_pod_name"]
            targetLabel: "instance"
          - action: "labeldrop"
            regex: "^(container|endpoint|namespace|pod|service)$"
    nodeSelector:
      openstack-control-plane: enabled

  prometheus:
    serviceMonitor:
      relabelings:
        # `instance` becomes the pod name.
        - sourceLabels: ["__meta_kubernetes_pod_name"]
          targetLabel: "instance"
        - action: "labeldrop"
          regex: "^(container|endpoint|namespace|pod|service)$"
    prometheusSpec:
      nodeSelector:
        openstack-control-plane: enabled
      # Referenced by the `kubeEtcd.serviceMonitor` certificate paths above.
      secrets:
        - kube-prometheus-stack-etcd-client-cert
    additionalServiceMonitors:
      # Scrapes services labelled `application: ceph` in the `openstack`
      # namespace on their `metrics` port.
      - name: ceph
        selector:
          matchLabels:
            application: ceph
        jobLabel: application
        namespaceSelector:
          matchNames:
            - openstack
        endpoints:
          - port: metrics
            honorLabels: true
            relabelings:
              # Force a constant `cluster="ceph"` label on every target.
              - action: "replace"
                regex: "(.*)"
                replacement: "ceph"
                targetLabel: "cluster"
              - action: "labeldrop"
                regex: "^(container|endpoint|namespace|pod|service)$"

  prometheusOperator:
    admissionWebhooks:
      patch:
        nodeSelector:
          openstack-control-plane: enabled
    serviceMonitor:
      relabelings:
        # `instance` becomes the pod name.
        - sourceLabels: ["__meta_kubernetes_pod_name"]
          targetLabel: "instance"
        - action: "labeldrop"
          regex: "^(container|endpoint|namespace|pod|service)$"
    nodeSelector:
      openstack-control-plane: enabled

  prometheus-node-exporter:
    # Every argument is single-quoted so the regular expressions are passed
    # through byte-for-byte. YAML performs escape processing only inside
    # double-quoted scalars, so `\d` must be written with a single backslash;
    # a doubled backslash would reach node_exporter as two literal backslashes
    # and the pattern would require a backslash character in the device name.
    extraArgs:
      - '--collector.diskstats.ignored-devices=^(ram|loop|nbd|fd|(h|s|v|xv)d[a-z]|nvme\d+n\d+p)\d+$'
      - '--collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|fuse.squashfuse_ll|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$'
      - '--collector.filesystem.mount-points-exclude=^/(dev|proc|run/credentials/.+|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+|var/lib/kubelet/plugins/kubernetes.io/csi/.+|run/containerd/.+)($|/)'
      - '--collector.netclass.ignored-devices=^(lxc|cilium_|qbr|qvb|qvo|tap|ovs-system|br|tbr|gre_sys).*$'
      - '--collector.netdev.device-exclude=^(lxc|cilium_|qbr|qvb|qvo|tap|ovs-system|br|tbr|gre_sys).*$'
    prometheus:
      monitor:
        relabelings:
          # `instance` becomes the name of the node running the pod.
          - sourceLabels: ["__meta_kubernetes_pod_node_name"]
            targetLabel: "instance"
          - action: "labeldrop"
            regex: "^(container|endpoint|namespace|pod|service)$"

  additionalPrometheusRulesMap:
    # Rule groups for Ceph are read from `prometheus_alerts.yml` and parsed
    # from YAML at template time.
    ceph: "{{ lookup('ansible.builtin.file', 'prometheus_alerts.yml') | from_yaml }}"

    coredns:
      groups:
        - name: coredns
          rules:
            - alert: CoreDNSDown
              expr: absent(up{job="coredns"} == 1)
              for: 15m
              labels:
                severity: critical
            - alert: CoreDNSLatencyHigh
              expr: >-
                histogram_quantile(0.99,
                sum(rate(coredns_dns_request_duration_seconds_bucket{job="coredns"}[5m]))
                by(server, zone, le)) > 4
              for: 10m
              labels:
                severity: critical
            # CoreDNSErrorsHigh is defined twice on purpose: same expression,
            # different threshold and severity (0.01 warning, 0.03 critical).
            - alert: CoreDNSErrorsHigh
              expr: >-
                sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[5m]))
                /
                sum(rate(coredns_dns_responses_total{job="coredns"}[5m])) > 0.01
              for: 10m
              labels:
                severity: warning
            - alert: CoreDNSErrorsHigh
              expr: >-
                sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[5m]))
                /
                sum(rate(coredns_dns_responses_total{job="coredns"}[5m])) > 0.03
              for: 10m
              labels:
                severity: critical
        - name: coredns_forward
          rules:
            - alert: CoreDNSForwardLatencyHigh
              expr: >-
                histogram_quantile(0.99,
                sum(rate(coredns_forward_request_duration_seconds_bucket{job="coredns"}[5m]))
                by(to, le)) > 4
              for: 10m
              labels:
                severity: critical
            # CoreDNSForwardErrorsHigh follows the same warning/critical
            # pairing (0.01 warning, 0.03 critical).
            - alert: CoreDNSForwardErrorsHigh
              expr: >-
                sum(rate(coredns_forward_responses_total{job="coredns",rcode="SERVFAIL"}[5m]))
                /
                sum(rate(coredns_forward_responses_total{job="coredns"}[5m])) > 0.01
              for: 10m
              labels:
                severity: warning
            - alert: CoreDNSForwardErrorsHigh
              expr: >-
                sum(rate(coredns_forward_responses_total{job="coredns",rcode="SERVFAIL"}[5m]))
                /
                sum(rate(coredns_forward_responses_total{job="coredns"}[5m])) > 0.03
              for: 10m
              labels:
                severity: critical
            - alert: CoreDNSForwardHealthcheckFailureCount
              expr: sum(rate(coredns_forward_healthcheck_failures_total{job="coredns"}[5m])) by (to) > 0
              for: 2m
              labels:
                severity: warning
            - alert: CoreDNSForwardHealthcheckBrokenCount
              expr: sum(rate(coredns_forward_healthcheck_broken_total{job="coredns"}[5m])) > 0
              for: 2m
              labels:
                severity: critical

    node-exporter-local:
      groups:
        - name: node
          rules:
            - alert: NodeHighLoadAverage
              expr: node_load5 / count(node_cpu_seconds_total{mode="system"}) without (cpu, mode) > 1.5
              for: 30m
              labels:
                severity: warning
            - alert: NodeHighMemoryUsage
              # Fires when less than 2.5% of total memory is available.
              expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 2.5
              for: 2m
              labels:
                severity: critical
            - alert: NodeHighCpuUsage
              # Fires when the summed per-instance idle rate drops below 1.
              expr: sum by(instance)(irate(node_cpu_seconds_total{mode='idle'}[5m])) < 1
              for: 2m
              labels:
                severity: warning
            - alert: NodeLowEntropy
              expr: node_entropy_available_bits < 1000
              for: 5m
              labels:
                severity: warning
        - name: softnet
          rules:
            - alert: NodeSoftNetTimesSqueezed
              expr: sum(rate(node_softnet_times_squeezed_total[1m])) by (instance) > 10
              for: 10m
              labels:
                severity: warning
            - alert: NodeSoftNetDrops
              expr: sum(rate(node_softnet_dropped_total[1m])) by (instance) != 0
              for: 1m
              labels:
                severity: critical