# blob: 144e263f7061fb2d84de98bd97dd58f9baa19d28 [file] [log] [blame]
# Giovanni Tirloni 59219b6 2024-04-09 14:50:25 -0300
---
{{- /*
Prometheus alerting rule groups for Loki.
Every rule in this file is wrapped in a guard of the form
.Values.monitoring.rules.disabled.<AlertName>, so individual alerts can be
left out of the rendered output.
*/}}
groups:
  - name: "loki_alerts"
    rules:
{{- /*
LokiRequestErrors: critical alert raised when more than 10% of requests for a
(namespace, job, route) combination returned a 5xx status code, computed from
2m rate windows and sustained for 15m.
Omitted when .Values.monitoring.rules.disabled.LokiRequestErrors is truthy.
The doubled-brace escapes in the message keep the Prometheus placeholders
literal so that they survive Helm rendering.
*/}}
{{- if not (.Values.monitoring.rules.disabled.LokiRequestErrors | default false) }}
      - alert: "LokiRequestErrors"
        annotations:
          message: |
            {{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}% errors.
        expr: |
          100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
            /
          sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
            > 10
        for: "15m"
        labels:
          severity: "critical"
{{- /* Optional user-supplied labels, rendered at column 10 as siblings of "severity". */}}
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}
{{- /*
LokiRequestPanics: critical alert raised as soon as loki_panic_total increased
within the last 10m for a (namespace, job) pair. There is no "for" clause, so
the alert fires on the first evaluation where the expression is true.
Omitted when .Values.monitoring.rules.disabled.LokiRequestPanics is truthy.
The alert value is the summed counter increase (a count, not a percentage),
and the message reports it as such.
*/}}
{{- if not (.Values.monitoring.rules.disabled.LokiRequestPanics | default false) }}
      - alert: "LokiRequestPanics"
        annotations:
          message: |
            {{`{{`}} $labels.job {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}} panics over the last 10 minutes.
        expr: |
          sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
        labels:
          severity: "critical"
{{- /* Optional user-supplied labels, rendered at column 10 as siblings of "severity". */}}
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}
{{- /*
LokiRequestLatency: critical alert raised when the series
namespace_job_route:loki_request_duration_seconds:99quantile stays above 1
for 15m. Routes whose name contains "tail" (case-insensitive) are excluded.
Omitted when .Values.monitoring.rules.disabled.LokiRequestLatency is truthy.
NOTE(review): the series name looks like a recording rule defined outside this
file - confirm it is deployed alongside these alerts.
*/}}
{{- if not (.Values.monitoring.rules.disabled.LokiRequestLatency | default false) }}
      - alert: "LokiRequestLatency"
        annotations:
          message: |
            {{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency.
        expr: |
          namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1
        for: "15m"
        labels:
          severity: "critical"
{{- /* Optional user-supplied labels, rendered at column 10 as siblings of "severity". */}}
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}
{{- /*
LokiTooManyCompactorsRunning: warning alert raised when the sum of
loki_boltdb_shipper_compactor_running per (namespace, cluster) stays above 1
for 5m.
Omitted when .Values.monitoring.rules.disabled.LokiTooManyCompactorsRunning
is truthy.
*/}}
{{- if not (.Values.monitoring.rules.disabled.LokiTooManyCompactorsRunning | default false) }}
      - alert: "LokiTooManyCompactorsRunning"
        annotations:
          message: |
            {{`{{`}} $labels.cluster {{`}}`}} {{`{{`}} $labels.namespace {{`}}`}} has had {{`{{`}} printf "%.0f" $value {{`}}`}} compactors running for more than 5m. Only one compactor should run at a time.
        expr: |
          sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
        for: "5m"
        labels:
          severity: "warning"
{{- /* Optional user-supplied labels, rendered at column 10 as siblings of "severity". */}}
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}
{{- /*
loki_canaries_alerts group with its single rule, LokiCanaryLatency: warning
alert raised when the 0.99 histogram quantile of
loki_canary_response_latency_seconds_bucket (5m rate, grouped by namespace and
job) stays above 5 for 15m.
The guard wraps the group header as well as the rule, so a truthy
.Values.monitoring.rules.disabled.LokiCanaryLatency removes the whole group
instead of leaving a group with no rules.
*/}}
{{- if not (.Values.monitoring.rules.disabled.LokiCanaryLatency | default false) }}
  - name: "loki_canaries_alerts"
    rules:
      - alert: "LokiCanaryLatency"
        annotations:
          message: |
            {{`{{`}} $labels.job {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency.
        expr: |
          histogram_quantile(0.99, sum(rate(loki_canary_response_latency_seconds_bucket[5m])) by (le, namespace, job)) > 5
        for: "15m"
        labels:
          severity: "warning"
{{- /* Optional user-supplied labels, rendered at column 10 as siblings of "severity". */}}
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}