---
groups:
- name: "loki_alerts"
rules:
{{- if not (.Values.monitoring.rules.disabled.LokiRequestErrors | default false) }}
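      # Error-rate alert: fires when more than 10% of requests to a route return 5xx status codes over the last 2 minutes.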
- alert: "LokiRequestErrors"
annotations:
message: |
{{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}% errors.
expr: |
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
/
sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
> 10
for: "15m"
labels:
severity: "critical"
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}
{{- if not (.Values.monitoring.rules.disabled.LokiRequestPanics | default false) }}
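      # Panic alert: fires as soon as any panics are observed within a 10 minute window.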
- alert: "LokiRequestPanics"
annotations:
message: |
{{`{{`}} $labels.job {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}% increase of panics.
expr: |
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
labels:
severity: "critical"
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}
{{- if not (.Values.monitoring.rules.disabled.LokiRequestLatency | default false) }}
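      # Latency alert: fires when the 99th percentile request latency (excluding tail routes) exceeds 1s, based on the namespace_job_route:loki_request_duration_seconds:99quantile recording rule.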
- alert: "LokiRequestLatency"
annotations:
message: |
{{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency.
expr: |
namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1
for: "15m"
labels:
severity: "critical"
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}
{{- if not (.Values.monitoring.rules.disabled.LokiTooManyCompactorsRunning | default false) }}
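      # Compactor alert: fires when more than one boltdb-shipper compactor is running in the same namespace for over 5 minutes.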
- alert: "LokiTooManyCompactorsRunning"
annotations:
message: |
{{`{{`}} $labels.cluster {{`}}`}} {{`{{`}} $labels.namespace {{`}}`}} has had {{`{{`}} printf "%.0f" $value {{`}}`}} compactors running for more than 5m. Only one compactor should run at a time.
expr: |
sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
for: "5m"
labels:
severity: "warning"
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}
{{- if not (.Values.monitoring.rules.disabled.LokiCanaryLatency | default false) }}
- name: "loki_canaries_alerts"
rules:
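      # Canary latency alert: fires when the 99th percentile of loki-canary response latency exceeds 5s over a 5 minute window.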
- alert: "LokiCanaryLatency"
annotations:
message: |
{{`{{`}} $labels.job {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency.
expr: |
histogram_quantile(0.99, sum(rate(loki_canary_response_latency_seconds_bucket[5m])) by (le, namespace, job)) > 5
for: "15m"
labels:
severity: "warning"
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}