[ATMOSPHERE-369] [stable/2023.1] Add the NodeTimeSkewDetected alert (#2178)
This is an automated cherry-pick of #2151
/assign larainema
Depends-On #2157
diff --git a/roles/kube_prometheus_stack/files/jsonnet/tests.yml b/roles/kube_prometheus_stack/files/jsonnet/tests.yml
index 9ccba90..4775bb2 100644
--- a/roles/kube_prometheus_stack/files/jsonnet/tests.yml
+++ b/roles/kube_prometheus_stack/files/jsonnet/tests.yml
@@ -72,3 +72,36 @@
exp_annotations:
summary: "Nova service group down"
description: "All instances of a specific Nova service have been down for more than 5 minutes."
+
+ - interval: 1m  # positive case: one series is far enough off to trip NodeTimeSkewDetected
+ input_series:
+ - series: 'node_time_seconds{instance="instance1", job="node"}'
+ values: '0 60 120 180 240 300'  # steps of 60 starting at 0 (offset 0)
+ - series: 'node_time_seconds{instance="instance2", job="node"}'
+ values: '1 61 121 181 241 301'  # offset 1; the rule compares with '> 1', so not expected below
+ - series: 'node_time_seconds{instance="instance3", job="node"}'
+ values: '2 62 122 182 242 302'  # offset 2; the only instance listed in exp_alerts
+ alert_rule_test:
+ - eval_time: 5m  # same duration as the rule's 'for' clause
+ alertname: NodeTimeSkewDetected
+ exp_alerts:
+ - exp_labels:
+ severity: P3  # NOTE(review): rule sets 'warning'; P3 presumably comes from a remap not shown here - confirm
+ instance: instance3
+ job: node
+ exp_annotations:
+ summary: "Node instance3 has a time difference."
+ description: "Node instance3 has a time difference 2."  # trailing 2 is the rendered $value
+
+ - interval: 1m  # negative case: all three series carry identical values
+ input_series:
+ - series: 'node_time_seconds{instance="instance1", job="node"}'
+ values: '0 60 120 180 240 300'  # steps of 60 starting at 0 (offset 0)
+ - series: 'node_time_seconds{instance="instance2", job="node"}'
+ values: '0 60 120 180 240 300'  # same as instance1
+ - series: 'node_time_seconds{instance="instance3", job="node"}'
+ values: '0 60 120 180 240 300'  # same as instance1
+ alert_rule_test:
+ - eval_time: 5m  # same evaluation point as the positive case
+ alertname: NodeTimeSkewDetected
+ exp_alerts: []  # explicit empty list: no alert is expected for any instance
diff --git a/roles/kube_prometheus_stack/files/jsonnet/vendor/github.com/prometheus/node_exporter/docs/node-mixin/alerts/alerts.libsonnet b/roles/kube_prometheus_stack/files/jsonnet/vendor/github.com/prometheus/node_exporter/docs/node-mixin/alerts/alerts.libsonnet
index 1eaedd3..7712033 100644
--- a/roles/kube_prometheus_stack/files/jsonnet/vendor/github.com/prometheus/node_exporter/docs/node-mixin/alerts/alerts.libsonnet
+++ b/roles/kube_prometheus_stack/files/jsonnet/vendor/github.com/prometheus/node_exporter/docs/node-mixin/alerts/alerts.libsonnet
@@ -407,6 +407,20 @@
description: 'Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}',
},
},
+ {  // Alert on the gap between a node's reported time and the timestamp of that sample.
+ alert: 'NodeTimeSkewDetected',
+ expr: |||
+ abs(timestamp(node_time_seconds{%(nodeExporterSelector)s}) - node_time_seconds{%(nodeExporterSelector)s}) > 1
+ ||| % $._config,  // fills %(nodeExporterSelector)s from $._config
+ 'for': '5m',  // quoted because 'for' is a Jsonnet keyword
+ labels: {
+ severity: 'warning',  // NOTE(review): the accompanying test expects P3 - confirm where the remap happens
+ },
+ annotations: {
+ summary: 'Node {{ $labels.instance }} has a time difference.',
+ description: 'Node {{ $labels.instance }} has a time difference {{ $value }}.',  // $value is the abs() result
+ },
+ },
],
},
],