blob: b11d8a6f073d901e549672530e0fb3fbdf9251c5 [file] [log] [blame]
{
'ipmi-exporter': {
  // Alerting rules for the IPMI exporter: one collector-health alert plus a
  // warning-level and a critical-level alert for every *_state metric.
  prometheusAlerts+:: {
    // One entry per state metric. `warning` / `critical` hold the selector
    // used at that severity; they differ only for IpmiSensor, whose critical
    // rule leaves out the sensor named "TPM Presence".
    local collectors = [
      { alert: 'IpmiCurrent', warning: 'ipmi_current_state', critical: 'ipmi_current_state' },
      { alert: 'IpmiFanSpeed', warning: 'ipmi_fan_speed_state', critical: 'ipmi_fan_speed_state' },
      { alert: 'IpmiPower', warning: 'ipmi_power_state', critical: 'ipmi_power_state' },
      { alert: 'IpmiSensor', warning: 'ipmi_sensor_state', critical: 'ipmi_sensor_state{name!="TPM Presence"}' },
      { alert: 'IpmiTemperature', warning: 'ipmi_temperature_state', critical: 'ipmi_temperature_state' },
      { alert: 'IpmiVoltage', warning: 'ipmi_voltage_state', critical: 'ipmi_voltage_state' },
    ],

    // Builds the group 'collectors-state-<severity>': one rule per collector,
    // firing while the selector for that severity equals `state`.
    local stateGroup(severity, state) = {
      name: 'collectors-state-' + severity,
      rules: [
        {
          alert: c.alert,
          expr: '%s == %d' % [c[severity], state],
          labels: { severity: severity },
        }
        for c in collectors
      ],
    },

    groups: [
      {
        name: 'rules',
        rules: [
          {
            // Fires once ipmi_up has been 0 for 15 minutes.
            alert: 'IpmiCollectorDown',
            expr: 'ipmi_up == 0',
            'for': '15m',
            labels: { severity: 'warning' },
          },
        ],
      },
      // State value 1 raises a warning, state value 2 raises a critical alert.
      stateGroup('warning', 1),
      stateGroup('critical', 2),
    ],
  },
},
'node-exporter-local': {
  // Recording rules: per-instance softnet aggregates. The 'softnet' alert
  // group further down queries these node:softnet:<metric>:1m series.
  prometheusRules+:: {
    groups: [
      {
        name: 'softnet.rules',
        rules:
          // Returns a recording rule that stores `expr` under the series
          // name node:softnet:<metric>:1m.
          local recordingRule(metric, expr) = {
            record: 'node:softnet:' + metric + ':1m',
            expr: expr,
          };
          [
            recordingRule('backlog', 'sum(node_softnet_backlog_len) by (instance)'),
            recordingRule('squeezed', 'sum(rate(node_softnet_times_squeezed_total[1m])) by (instance)'),
            recordingRule('dropped', 'sum(rate(node_softnet_dropped_total[1m])) by (instance)'),
          ],
      },
    ],
  },
  // Alerting rules for node health, network multicast load and softnet pressure.
  prometheusAlerts+:: {
    groups: [
      {
        name: 'node',
        rules: [
          {
            // 5-minute load divided by the number of per-CPU series
            // (mode="system" selects one series per CPU) above 1.5 for 30m.
            alert: 'NodeHighLoadAverage',
            expr: 'node_load5 / count(node_cpu_seconds_total{mode="system"}) without (cpu, mode) > 1.5',
            'for': '30m',
            labels: {
              severity: 'warning',
            },
          },
          {
            // Available memory below 2.5% of total memory for 2m.
            alert: 'NodeHighMemoryUsage',
            expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 2.5',
            'for': '2m',
            labels: {
              severity: 'critical',
            },
          },
          {
            // Idle rate summed over all series of an instance is below 1,
            // i.e. the instance as a whole has less than one CPU of idle time.
            // NOTE(review): irate() only looks at the last two samples; the
            // Prometheus docs recommend rate() for alerting expressions.
            // Left unchanged here - confirm whether the spikiness is intended.
            alert: 'NodeHighCpuUsage',
            expr: "sum by(instance)(irate(node_cpu_seconds_total{mode='idle'}[5m])) < 1",
            'for': '2m',
            labels: {
              severity: 'warning',
            },
          },
          {
            // Available entropy below 20% of the pool size for 5m.
            alert: 'NodeLowEntropy',
            expr: 'node_entropy_available_bits / node_entropy_pool_size_bits < 0.20',
            'for': '5m',
            labels: {
              severity: 'P5',
            },
          },
          {
            // Fires for every node whose kernel release is not 5.4 or 5.15.
            // The dot is matched literally ([.]) and the minor version must be
            // followed by a non-digit or the end of the string, so releases
            // such as "5.40..." or "5.150..." are not mistaken for 5.4 / 5.15.
            alert: 'NodeNonLTSKernel',
            expr: 'node_uname_info{release!~"^5[.](4|15)([^0-9].*)?$"}',
            labels: {
              severity: 'P5',
            },
          },
        ],
      },
      {
        name: 'network',
        rules: [
          {
            // Received multicast rate above 1000 packets/sec for 5m.
            alert: 'NodeNetworkMulticast',
            expr: 'rate(node_network_receive_multicast_total[1m]) > 1000',
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'High multicast traffic on node {{ $labels.instance }}: {{ $value }} packets/sec',
              description: 'This can result in high software interrupt load on the node which can bring network performance down.',
              runbook_url: 'https://github.com/vexxhost/atmosphere/tree/main/roles/kube_prometheus_stack#NodeNetworkMulticast',
            },
          },
        ],
      },
      {
        name: 'softnet',
        rules:
          // Upper-cases the first character of `s`; used to build alert names.
          local capitalize(s) = std.asciiUpper(std.substr(s, 0, 1)) + std.substr(s, 1, std.length(s) - 1);
          // Builds an alert on the recorded series node:softnet:<metric>:1m.
          //   metric        - recorded softnet metric name ('backlog', 'squeezed', 'dropped').
          //   threshold     - per-node value (as a string) above which a node counts as affected.
          //   nodesAffected - fraction of nodes, as the string '0', '0.5' or '0.75'.
          // The rule fires when the number of affected nodes exceeds
          // nodesAffected * (number of nodes reporting the metric) for 1m.
          // nodesAffected also selects the alert-name prefix and the severity;
          // any other value fails evaluation because the lookup key is missing.
          local alertRule(metric, threshold, nodesAffected) = {
            alert: {
              '0': 'SingleNodeSoftNet' + capitalize(metric),
              '0.5': 'MultipleNodesSoftNet' + capitalize(metric),
              '0.75': 'MajorityNodesSoftNet' + capitalize(metric),
            }[nodesAffected],
            expr: 'count(node:softnet:%s:1m > %s) > (count(node:softnet:%s:1m) * %s)' % [metric, threshold, metric, nodesAffected],
            'for': '1m',
            labels: {
              severity: {
                '0': 'P3',
                '0.5': 'P2',
                '0.75': 'P1',
              }[nodesAffected],
            },
          };
          [
            alertRule('backlog', '5000', '0'),
            alertRule('backlog', '5000', '0.5'),
            alertRule('backlog', '5000', '0.75'),
            alertRule('squeezed', '0', '0'),
            alertRule('dropped', '0', '0'),
            alertRule('dropped', '0', '0.5'),
            alertRule('dropped', '0', '0.75'),
          ],
      },
    ],
  },
},
rabbitmq: {
  // Recording rule rabbitmq:usage:memory: resident memory of the RabbitMQ
  // process divided by the matching pod's container memory limit.
  prometheusRules+:: {
    // Numerator: resident memory with the `job` label aggregated away.
    local residentMemory = 'sum without (job) ( rabbitmq_process_resident_memory_bytes )',
    // Denominator: the pod limit series with `pod` copied into `instance`
    // (label_replace) and the listed labels aggregated away, so both sides of
    // the division can be matched on the labels that remain.
    local memoryLimit = 'sum without ( container, pod, job, namespace, node, resource, uid, unit ) ( label_replace( cluster:namespace:pod_memory:active:kube_pod_container_resource_limits, "instance", "$1", "pod", "(.*)" ) )',

    groups: [
      {
        name: 'recording',
        rules: [
          {
            record: 'rabbitmq:usage:memory',
            expr: residentMemory + ' / ' + memoryLimit,
            labels: { job: 'rabbitmq' },
          },
        ],
      },
    ],
  },
  // Alerting rules: broker alarms, resource-usage limits and unacked messages.
  prometheusAlerts+:: {
    // Critical alert that fires while the given alarm metric equals 1.
    local alarm(name, metric) = {
      alert: name,
      expr: metric + ' == 1',
      labels: { severity: 'critical' },
    },

    // Warning/critical pair for a usage ratio: warning above 0.80,
    // critical above 0.95. Both rules share the same alert name.
    local usageAlerts(name, ratio) = [
      {
        alert: name,
        expr: ratio + ' > ' + level.threshold,
        labels: { severity: level.severity },
      }
      for level in [
        { severity: 'warning', threshold: '0.80' },
        { severity: 'critical', threshold: '0.95' },
      ]
    ],

    groups: [
      {
        name: 'alarms',
        rules: [
          alarm('RabbitmqAlarmFreeDiskSpace', 'rabbitmq_alarms_free_disk_space_watermark'),
          alarm('RabbitmqAlarmMemoryUsedWatermark', 'rabbitmq_alarms_memory_used_watermark'),
          alarm('RabbitmqAlarmFileDescriptorLimit', 'rabbitmq_alarms_file_descriptor_limit'),
        ],
      },
      {
        name: 'limits',
        rules:
          usageAlerts('RabbitmqMemoryHigh', 'rabbitmq:usage:memory')
          + usageAlerts('RabbitmqFileDescriptorsUsage', 'rabbitmq_process_open_fds / rabbitmq_process_max_fds')
          + usageAlerts('RabbitmqTcpSocketsUsage', 'rabbitmq_process_open_tcp_sockets / rabbitmq_process_max_tcp_sockets'),
      },
      {
        name: 'msgs',
        // Same condition at both levels; only the hold time differs:
        // warning after 5m, critical after 1h.
        rules: [
          {
            alert: 'RabbitmqUnackedMessages',
            expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000',
            'for': level.hold,
            labels: { severity: level.severity },
          }
          for level in [
            { hold: '5m', severity: 'warning' },
            { hold: '1h', severity: 'critical' },
          ]
        ],
      },
    ],
  },
},
}