// NOTE(mnaser): The following is a list of disabled alerts that are known
// to be noisy or not useful.
local disabledAlerts = [
// * Dropped `CephNodeDiskspaceWarning` because we already have similar
// alerts such as `NodeFilesystemSpaceFillingUp`.
'CephNodeDiskspaceWarning',
// * Dropped `CephNodeNetworkPacketDrops` because it is noisy and offers
// nothing actionable to fix the drops.
'CephNodeNetworkPacketDrops',
// * Dropped `CephHealthWarning` and `CephHealthError` since they are
// superseded by the `CephHealthDetail*` alerts defined below.
'CephHealthWarning',
'CephHealthError',
// * Dropped `CephPGImbalance` because the balancer module already takes
// care of rebalancing placement groups.
'CephPGImbalance',
// * Dropped `MySQLDown` because it fires even when the number of healthy
// replicas is still above the minimum; see the `MysqlClusterDown` rules
// below instead.
'MySQLDown',
];
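// The alerts above are filtered out of every mixin in the final rendering
// at the bottom of this file.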
// NOTE(mnaser): This is the default mapping for severities:
// - P1: Full service disruption or significant loss of
// functionality. Requires immediate action.
// - P2: Major functionality broken, affecting a large group of
// users or critical components. Prompt attention needed.
// - P3: Issues affecting a smaller group of users or a single
// system. Attention required during business hours.
// - P4: Minor issues with limited impact. Attention and potential
// action needed during standard business hours.
// - P5: Normal activities or minor issues. Typically no immediate
// attention or action required.
local defaultSeverityMapping = {
critical: 'P1',
warning: 'P3',
info: 'P5',
};
// NOTE(mnaser): The mapping here follows the format 'AlertName:Severity',
// where 'Severity' is the alert's original severity label. Matching
// alerts are assigned the listed priority instead of the one from
// defaultSeverityMapping.
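// For example, 'KubeJobFailed:warning': 'P4' lowers `KubeJobFailed` from
// the default warning -> 'P3' mapping down to 'P4'.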
local customSeverityMapping = {
'CephMgrPrometheusModuleInactive:critical': 'P4',
'CephMonDown:warning': 'P4',
'CephMonDownQuorumAtRisk:critical': 'P3',
'CephOSDTimeoutsClusterNetwork:warning': 'P4',
'CephOSDTimeoutsPublicNetwork:warning': 'P4',
'KubeJobFailed:warning': 'P4',
};
local getSeverity(rule) =
// Severities that are already explicit priority levels (e.g. 'P1') pass
// through unchanged.
if std.startsWith(rule.labels.severity, 'P') then rule.labels.severity
else
local key = rule.alert + ':' + rule.labels.severity;
if key in customSeverityMapping then customSeverityMapping[key]
else defaultSeverityMapping[rule.labels.severity];
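// NOTE: A few compile-time sanity checks for getSeverity. 'ExampleAlert'
// is a stand-in name with no custom mapping, so it falls through to
// defaultSeverityMapping; these fail the build if the mapping logic
// regresses.
assert getSeverity({ alert: 'CephMonDown', labels: { severity: 'warning' } }) == 'P4' : 'custom mapping should win';
assert getSeverity({ alert: 'ExampleAlert', labels: { severity: 'critical' } }) == 'P1' : 'default mapping should apply';
assert getSeverity({ alert: 'ExampleAlert', labels: { severity: 'P2' } }) == 'P2' : 'explicit priorities pass through unchanged';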
local mixins = {
alertmanager: (import 'vendor/github.com/prometheus/alertmanager/doc/alertmanager-mixin/mixin.libsonnet') + {
_config+:: {
alertmanagerSelector: 'job="kube-prometheus-stack-alertmanager",namespace="monitoring"',
alertmanagerClusterLabels: 'namespace,service,cluster',
},
},
ceph: (import 'vendor/github.com/ceph/ceph/monitoring/ceph-mixin/mixin.libsonnet') + {
prometheusAlerts+:: {
groups+: [
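// NOTE: These rules replace the upstream `CephHealthWarning` and
// `CephHealthError` alerts disabled above, surfacing the specific
// failing health check via {{ $labels.name }}.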
{
name: 'cluster health detail',
rules: [
{
alert: 'CephHealthDetailError',
'for': '5m',
expr: 'ceph_health_detail{severity="HEALTH_ERR"} == 1',
labels: { severity: 'critical' },
annotations: {
summary: 'Ceph is in the ERROR state',
description: "Health check {{ $labels.name }} has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information.",
},
},
{
alert: 'CephHealthDetailWarning',
'for': '15m',
expr: 'ceph_health_detail{severity="HEALTH_WARN"} == 1',
labels: { severity: 'warning' },
annotations: {
summary: 'Ceph is in the WARNING state',
description: "Health check {{ $labels.name }} has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information.",
},
},
],
},
],
},
},
coredns: (import 'vendor/github.com/povilasv/coredns-mixin/mixin.libsonnet') + {
_config+:: {
corednsSelector: 'job="coredns"',
},
},
kube: (import 'vendor/github.com/kubernetes-monitoring/kubernetes-mixin/mixin.libsonnet') + {
_config+:: {
kubeApiserverSelector: 'job="apiserver"',
},
},
memcached: (import 'vendor/github.com/grafana/jsonnet-libs/memcached-mixin/mixin.libsonnet'),
mysqld: (import 'vendor/github.com/prometheus/mysqld_exporter/mysqld-mixin/mixin.libsonnet') + {
prometheusAlerts+:: {
groups+: [
{
name: 'mysqld-extras',
rules: [
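// Capacity-style alerts layered on top of the upstream mixin:
// connection saturation (>80% of max_connections), thread saturation,
// and slow queries.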
{
alert: 'MysqlTooManyConnections',
'for': '1m',
expr: |||
max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80
|||,
labels: {
severity: 'warning',
},
},
{
alert: 'MysqlHighThreadsRunning',
'for': '1m',
expr: |||
max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60
|||,
labels: {
severity: 'warning',
},
},
{
alert: 'MysqlSlowQueries',
'for': '2m',
expr: |||
increase(mysql_global_status_slow_queries[1m]) > 0
|||,
labels: {
severity: 'warning',
},
},
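// NOTE: The three `MysqlClusterDown` rules below escalate by scope: a
// single replica down is informational, half or more of the replicas
// down is a warning, and all replicas down is critical.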
{
alert: 'MysqlClusterDown',
'for': '5m',
expr: 'mysql_up == 0',
labels: { severity: 'info' },
annotations: {
summary: 'Percona XtraDB Cluster replica is down',
description: "{{ $labels.instance }} replica is down.",
},
},
{
alert: 'MysqlClusterDown',
'for': '5m',
expr: 'round(count(mysql_up==1) / count(mysql_up) * 100) <= 50',
labels: { severity: 'warning' },
annotations: {
summary: 'Percona XtraDB Cluster replicas are down',
description: "{{ $value }}% of replicas are online.",
},
},
{
alert: 'MysqlClusterDown',
'for': '1m',
expr: 'count(mysql_up==0) == count(mysql_up)',
labels: { severity: 'critical' },
annotations: {
summary: 'Percona XtraDB Cluster is down',
description: "All replicas are down.",
},
},
],
},
],
},
},
node: (import 'vendor/github.com/prometheus/node_exporter/docs/node-mixin/mixin.libsonnet'),
openstack: (import 'openstack.libsonnet'),
} + (import 'legacy.libsonnet');
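// NOTE: Render every mixin's alert groups, dropping the alerts listed in
// disabledAlerts and rewriting each rule's severity label to a P1-P5
// priority via getSeverity.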
{
[key]: mixins[key] {
prometheusAlerts: {
groups: [
{
name: group.name,
rules: [
rule {
labels+: {
severity: getSeverity(rule),
},
}
for rule in group.rules
if !std.member(disabledAlerts, rule.alert)
],
}
for group in mixins[key].prometheusAlerts.groups
],
},
}
for key in std.objectFields(mixins)
}