monitoring: upgrade kube-prometheus-stack
This also adds support for Ceph monitoring + alerts
Sem-Ver: feature
Change-Id: I01a3ceab040b5d8c4dfa33c423f4349c999fb88f
diff --git a/.ansible-lint b/.ansible-lint
new file mode 100644
index 0000000..9b58a7d
--- /dev/null
+++ b/.ansible-lint
@@ -0,0 +1,3 @@
+---
+exclude_paths:
+ - roles/kube_prometheus_stack/files/
\ No newline at end of file
diff --git a/releasenotes/notes/upgrade-kube-prometheus-stack-b5eac8346cc693b6.yaml b/releasenotes/notes/upgrade-kube-prometheus-stack-b5eac8346cc693b6.yaml
new file mode 100644
index 0000000..12209d7
--- /dev/null
+++ b/releasenotes/notes/upgrade-kube-prometheus-stack-b5eac8346cc693b6.yaml
@@ -0,0 +1,3 @@
+---
+fixes:
+ - Upgrade ``kube-prometheus-stack`` to ``36.2.0`` and add Ceph monitoring.
diff --git a/roles/ceph_repository/defaults/main.yml b/roles/ceph_repository/defaults/main.yml
index 85b5d7b..63c9290 100644
--- a/roles/ceph_repository/defaults/main.yml
+++ b/roles/ceph_repository/defaults/main.yml
@@ -26,6 +26,6 @@
# .. envvar:: ceph_repository_version [[[
#
# Ceph version to pin package manager to
-ceph_repository_version: 16.2.7
+ceph_repository_version: 16.2.9
# ]]]
diff --git a/roles/kube_prometheus_stack/files/prometheus_alerts.yml b/roles/kube_prometheus_stack/files/prometheus_alerts.yml
new file mode 100644
index 0000000..f1eb420
--- /dev/null
+++ b/roles/kube_prometheus_stack/files/prometheus_alerts.yml
@@ -0,0 +1,885 @@
+# NOTE(mnaser): Imported from upstream ceph/ceph, with the following changes:
+#
+# * Dropped `CephNodeNetworkPacketDrops` due to noisy alerts with
+# no actionable items to fix it.
+#
+# https://raw.githubusercontent.com/ceph/ceph/v16.2.9/monitoring/ceph-mixin/prometheus_alerts.yml
+
+groups:
+ - name: cluster health
+ rules:
+ - alert: CephHealthError
+ expr: ceph_health_status == 2
+ for: 5m
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.2.1
+ annotations:
+ summary: Cluster is in an ERROR state
+ description: >
+ Ceph in HEALTH_ERROR state for more than 5 minutes.
+ Please check "ceph health detail" for more information.
+
+ - alert: CephHealthWarning
+ expr: ceph_health_status == 1
+ for: 15m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ summary: Cluster is in a WARNING state
+ description: >
+ Ceph has been in HEALTH_WARN for more than 15 minutes.
+ Please check "ceph health detail" for more information.
+
+ - name: mon
+ rules:
+ - alert: CephMonDownQuorumAtRisk
+ expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1
+ for: 30s
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.3.1
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
+ summary: Monitor quorum is at risk
+ description: |
+ {{ $min := query "floor(count(ceph_mon_metadata) / 2) +1" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active
+ Without quorum the cluster will become inoperable, affecting all connected clients and services.
+
+ The following monitors are down:
+ {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
+ - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
+ {{- end }}
+ - alert: CephMonDown
+ expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1))
+ for: 30s
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
+ summary: One or more ceph monitors are down
+ description: |
+ {{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down.
+ Quorum is still intact, but the loss of further monitors will make your cluster inoperable.
+
+ The following monitors are down:
+ {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
+ - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
+ {{- end }}
+ - alert: CephMonDiskspaceCritical
+ expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1
+ for: 1m
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.3.2
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit
+ summary: Disk space on at least one monitor is critically low
+ description: |
+ The free space available to a monitor's store is critically low (<5% by default).
+ You should increase the space available to the monitor(s). The
+ default location for the store sits under /var/lib/ceph. Your monitor hosts are:
+ {{- range query "ceph_mon_metadata"}}
+ - {{ .Labels.hostname }}
+ {{- end }}
+
+ - alert: CephMonDiskspaceLow
+ expr: ceph_health_detail{name="MON_DISK_LOW"} == 1
+ for: 5m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low
+ summary: Disk space on at least one monitor is approaching full
+ description: |
+ The space available to a monitor's store is approaching full (>70% is the default).
+ You should increase the space available to the monitor store. The
+ default location for the store sits under /var/lib/ceph. Your monitor hosts are:
+ {{- range query "ceph_mon_metadata"}}
+ - {{ .Labels.hostname }}
+ {{- end }}
+
+ - alert: CephMonClockSkew
+ expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1
+ for: 1m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew
+ summary: Clock skew across the Monitor hosts detected
+ description: |
+ The ceph monitors rely on a consistent time reference to maintain
+ quorum and cluster consistency. This event indicates that at least
+ one of your mons is not sync'd correctly.
+
+ Review the cluster status with ceph -s. This will show which monitors
+ are affected. Check the time sync status on each monitor host.
+
+ - name: osd
+ rules:
+ - alert: CephOSDDownHigh
+ expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.4.1
+ annotations:
+ summary: More than 10% of OSDs are down
+ description: |
+ {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%).
+
+ The following OSDs are down:
+ {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
+ - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
+ {{- end }}
+ - alert: CephOSDHostDown
+ expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1
+ for: 5m
+ labels:
+ severity: warning
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.4.8
+ annotations:
+ summary: An OSD host is offline
+ description: |
+ The following OSDs are down:
+ {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
+ - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }}
+ {{- end }}
+ - alert: CephOSDDown
+ expr: ceph_health_detail{name="OSD_DOWN"} == 1
+ for: 5m
+ labels:
+ severity: warning
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.4.2
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down
+ summary: An OSD has been marked down/unavailable
+ description: |
+ {{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5 minutes.
+
+ The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down:
+ {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}}
+ - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
+ {{- end }}
+
+ - alert: CephOSDNearFull
+ expr: ceph_health_detail{name="OSD_NEARFULL"} == 1
+ for: 5m
+ labels:
+ severity: warning
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.4.3
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull
+ summary: OSD(s) running low on free space (NEARFULL)
+ description: |
+ One or more OSDs have reached their NEARFULL threshold
+
+ Use 'ceph health detail' to identify which OSDs have reached this threshold.
+ To resolve, either add capacity to the cluster, or delete unwanted data
+ - alert: CephOSDFull
+ expr: ceph_health_detail{name="OSD_FULL"} > 0
+ for: 1m
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.4.6
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full
+ summary: OSD(s) is full, writes blocked
+ description: |
+ An OSD has reached its full threshold. Writes from all pools that share the
+ affected OSD will be blocked.
+
+ To resolve, either add capacity to the cluster, or delete unwanted data
+ - alert: CephOSDBackfillFull
+ expr: ceph_health_detail{name="OSD_BACKFILLFULL"} > 0
+ for: 1m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull
+ summary: OSD(s) too full for backfill operations
+ description: |
+ An OSD has reached its BACKFILL FULL threshold. This will prevent rebalance operations
+ from completing for some pools. Check the current capacity utilisation with 'ceph df'
+
+ To resolve, either add capacity to the cluster, or delete unwanted data
+ - alert: CephOSDTooManyRepairs
+ expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1
+ for: 30s
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs
+ summary: OSD has hit a high number of read errors
+ description: |
+ Reads from an OSD have used a secondary PG to return data to the client, indicating
+ a potential failing disk.
+ - alert: CephOSDTimeoutsPublicNetwork
+ expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 1
+ for: 1m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ summary: Network issues delaying OSD heartbeats (public network)
+ description: |
+ OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network
+ for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs.
+ - alert: CephOSDTimeoutsClusterNetwork
+ expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 1
+ for: 1m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ summary: Network issues delaying OSD heartbeats (cluster network)
+ description: |
+ OSD heartbeats on the cluster's 'cluster' network (backend) are running slow. Investigate the network
+ for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs.
+ - alert: CephOSDInternalDiskSizeMismatch
+ expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 1
+ for: 1m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch
+ summary: OSD size inconsistency error
+ description: |
+ One or more OSDs have an internal inconsistency between the size of the physical device and its metadata.
+ This could lead to the OSD(s) crashing in the future. You should redeploy the affected OSDs.
+ - alert: CephDeviceFailurePredicted
+ expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1
+ for: 1m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2
+ summary: Device(s) have been predicted to fail soon
+ description: |
+ The device health module has determined that one or more devices will fail
+ soon. To review the device states use 'ceph device ls'. To show a specific
+ device use 'ceph device info <dev id>'.
+
+ Mark the OSD as out (so data may migrate to other OSDs in the cluster). Once
+ the OSD is empty, remove and replace it.
+ - alert: CephDeviceFailurePredictionTooHigh
+ expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1
+ for: 1m
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.4.7
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany
+ summary: Too many devices have been predicted to fail, unable to resolve
+ description: |
+ The device health module has determined that the number of devices predicted to
+ fail cannot be remediated automatically, since it would take too many OSDs out of
+ the cluster, impacting performance and potentially availability. You should add new
+ OSDs to the cluster to allow data to be relocated and avoid data integrity issues.
+ - alert: CephDeviceFailureRelocationIncomplete
+ expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1
+ for: 1m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use
+ summary: A device failure is predicted, but unable to relocate data
+ description: |
+ The device health module has determined that one or more devices will fail
+ soon, but the normal process of relocating the data on the device to other
+ OSDs in the cluster is blocked.
+
+ Check that the cluster has available free space. It may be necessary to add
+ more disks to the cluster to allow the data from the failing device to
+ successfully migrate.
+
+ - alert: CephOSDFlapping
+ expr: |
+ (
+ rate(ceph_osd_up[5m])
+ * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
+ ) * 60 > 1
+ labels:
+ severity: warning
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.4.4
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds
+ summary: Network issues are causing OSDs to flap (mark each other out)
+ description: >
+ OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was
+ marked down and back up {{ $value | humanize }} times a
+ minute for 5 minutes. This could indicate a network issue (latency,
+ packet drop, disruption) on the cluster's "cluster network". Check the
+ network environment on the listed host(s).
+
+ - alert: CephOSDReadErrors
+ expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1
+ for: 30s
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors
+ summary: Device read errors detected
+ description: >
+ An OSD has encountered read errors, but the OSD has recovered by retrying
+ the reads. This may indicate an issue with the Hardware or Kernel.
+ # alert on high deviation from average PG count
+ - alert: CephPGImbalance
+ expr: |
+ abs(
+ (
+ (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)
+ ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
+ ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
+ for: 5m
+ labels:
+ severity: warning
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.4.5
+ annotations:
+ summary: PG allocations are not balanced across devices
+ description: >
+ OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates
+ by more than 30% from average PG count.
+ # alert on high commit latency...but how high is too high
+
+ - name: mds
+ rules:
+ - alert: CephFilesystemDamaged
+ expr: ceph_health_detail{name="MDS_DAMAGE"} > 0
+ for: 1m
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.5.1
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
+ summary: Ceph filesystem is damaged.
+ description: >
+ The filesystem's metadata has been corrupted. Data access
+ may be blocked.
+
+ Either analyse the output from the mds daemon admin socket, or
+ escalate to support
+ - alert: CephFilesystemOffline
+ expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0
+ for: 1m
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.5.3
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down
+ summary: Ceph filesystem is offline
+ description: >
+ All MDS ranks are unavailable. The ceph daemons providing the metadata
+ for the Ceph filesystem are all down, rendering the filesystem offline.
+ - alert: CephFilesystemDegraded
+ expr: ceph_health_detail{name="FS_DEGRADED"} > 0
+ for: 1m
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.5.4
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded
+ summary: Ceph filesystem is degraded
+ description: >
+ One or more metadata daemons (MDS ranks) are failed or in a
+ damaged state. At best the filesystem is partially available,
+ worst case is the filesystem is completely unusable.
+ - alert: CephFilesystemMDSRanksLow
+ expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0
+ for: 1m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max
+ summary: Ceph MDS daemon count is lower than configured
+ description: >
+ The filesystem's "max_mds" setting defined the number of MDS ranks in
+ the filesystem. The current number of active MDS daemons is less than
+ this setting.
+ - alert: CephFilesystemInsufficientStandby
+ expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0
+ for: 1m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby
+ summary: Ceph filesystem standby daemons too low
+ description: >
+ The number of standby daemons available is lower than the number requested
+ by the standby_count_wanted setting. Adjust the standby count or increase
+ the number of mds daemons within the filesystem.
+ - alert: CephFilesystemFailureNoStandby
+ expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0
+ for: 1m
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.5.5
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds
+ summary: Ceph MDS daemon failed, no further standby available
+ description: >
+ An MDS daemon has failed, leaving only one active rank without
+ further standby. Investigate the cause of the failure or add a
+ standby daemon
+ - alert: CephFilesystemReadOnly
+ expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0
+ for: 1m
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.5.2
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
+ summary: Ceph filesystem in read only mode, due to write error(s)
+ description: >
+ The filesystem has switched to READ ONLY due to an unexpected
+ write error when writing to the metadata pool
+
+ Either analyse the output from the mds daemon admin socket, or
+ escalate to support
+
+ - name: mgr
+ rules:
+ - alert: CephMgrModuleCrash
+ expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1
+ for: 5m
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.6.1
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash
+ summary: A mgr module has recently crashed
+ description: >
+ One or more mgr modules have crashed and are yet to be acknowledged by the administrator. A
+ crashed module may impact functionality within the cluster. Use the 'ceph crash' commands to
+ investigate which module has failed, and archive it to acknowledge the failure.
+ - alert: CephMgrPrometheusModuleInactive
+ expr: up{job="ceph"} == 0
+ for: 1m
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.6.2
+ annotations:
+ summary: Ceph's mgr/prometheus module is not available
+ description: >
+ The mgr/prometheus module at {{ $labels.instance }} is unreachable. This
+ could mean that the module has been disabled or the mgr itself is down.
+
+ Without the mgr/prometheus module, metrics and alerts will no longer
+ function. Open a shell to ceph and use 'ceph -s' to determine whether the
+ mgr is active. If the mgr is not active, restart it; otherwise check that
+ the mgr/prometheus module is loaded with 'ceph mgr module ls' and, if it is
+ not listed as enabled, enable it with 'ceph mgr module enable prometheus'
+
+ - name: pgs
+ rules:
+ - alert: CephPGsInactive
+ expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0
+ for: 5m
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.7.1
+ annotations:
+ summary: One or more Placement Groups are inactive
+ description: >
+ {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}.
+ Inactive placement groups aren't able to serve read/write
+ requests.
+ - alert: CephPGsUnclean
+ expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0
+ for: 15m
+ labels:
+ severity: warning
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.7.2
+ annotations:
+ summary: One or more placement groups are marked unclean
+ description: >
+ {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}.
+ Unclean PGs haven't been able to completely recover from a previous failure.
+ - alert: CephPGsDamaged
+ expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1
+ for: 5m
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.7.4
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged
+ summary: Placement group damaged, manual intervention needed
+ description: >
+ During data consistency checks (scrub), at least one PG has been flagged as being
+ damaged or inconsistent.
+
+ Check to see which PG is affected, and attempt a manual repair if necessary. To list
+ problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use
+ the 'ceph pg repair <pg_num>' command.
+ - alert: CephPGRecoveryAtRisk
+ expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 1
+ for: 1m
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.7.5
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full
+ summary: OSDs are too full for automatic recovery
+ description: >
+ Data redundancy may be reduced, or is at risk, since one or more OSDs are at or above their
+ 'full' threshold. Add more capacity to the cluster, or delete unwanted data.
+ - alert: CephPGUnavilableBlockingIO
+ # PG_AVAILABILITY, but an OSD is not in a DOWN state
+ expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"})) == 1
+ for: 1m
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.7.3
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability
+ summary: Placement group is unavailable, blocking some I/O
+ description: >
+ Data availability is reduced, impacting the cluster's ability to service I/O to some data. One or
+ more placement groups (PGs) are in a state that blocks IO.
+ - alert: CephPGBackfillAtRisk
+ expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 1
+ for: 1m
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.7.6
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full
+ summary: Backfill operations are blocked, due to lack of freespace
+ description: >
+ Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs
+ have breached their 'backfillfull' threshold. Add more capacity, or delete unwanted data.
+ - alert: CephPGNotScrubbed
+ expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
+ for: 5m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed
+ summary: Placement group(s) have not been scrubbed
+ description: |
+ One or more PGs have not been scrubbed recently. The scrub process is a data integrity
+ feature, protecting against bit-rot. It checks that objects and their metadata (size and
+ attributes) match across object replicas. When PGs miss their scrub window, it may
+ indicate the scrub window is too small, or PGs were not in a 'clean' state during the
+ scrub window.
+
+ You can manually initiate a scrub with: ceph pg scrub <pgid>
+ - alert: CephPGsHighPerOSD
+ expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1
+ for: 1m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs
+ summary: Placement groups per OSD is too high
+ description: |
+ The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).
+
+ Check that the pg_autoscaler hasn't been disabled for any of the pools, with 'ceph osd pool autoscale-status'
+ and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide
+ the autoscaler based on the expected relative size of the pool
+ (i.e. 'ceph osd pool set cephfs.cephfs.meta target_size_ratio .1')
+ - alert: CephPGNotDeepScrubbed
+ expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
+ for: 5m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed
+ summary: Placement group(s) have not been deep scrubbed
+ description: |
+ One or more PGs have not been deep scrubbed recently. Deep scrub is a data integrity
+ feature, protecting against bit-rot. It compares the contents of objects and their
+ replicas for inconsistency. When PGs miss their deep scrub window, it may indicate
+ that the window is too small or PGs were not in a 'clean' state during the deep-scrub
+ window.
+
+ You can manually initiate a deep scrub with: ceph pg deep-scrub <pgid>
+
+ - name: nodes
+ rules:
+ - alert: CephNodeRootFilesystemFull
+ expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
+ for: 5m
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.8.1
+ annotations:
+ summary: Root filesystem is dangerously full
+ description: >
+ Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free.
+
+ - alert: CephNodeNetworkPacketErrors
+ expr: |
+ (
+ increase(node_network_receive_errs_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_errs_total{device!="lo"}[1m])
+ ) / (
+ increase(node_network_receive_packets_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_packets_total{device!="lo"}[1m])
+ ) >= 0.0001 or (
+ increase(node_network_receive_errs_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_errs_total{device!="lo"}[1m])
+ ) >= 10
+ labels:
+ severity: warning
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.8.3
+ annotations:
+ summary: One or more NICs are seeing packet errors
+ description: >
+ Node {{ $labels.instance }} experiences packet errors > 0.01% or
+ > 10 packets/s on interface {{ $labels.device }}.
+
+ # Restrict to device names beginning with '/' to skip false alarms from
+ # tmpfs, overlay type filesystems
+ - alert: CephNodeDiskspaceWarning
+ expr: |
+ predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *
+ on(instance) group_left(nodename) node_uname_info < 0
+ labels:
+ severity: warning
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.8.4
+ annotations:
+ summary: Host filesystem freespace is getting low
+ description: >
+ Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }}
+ will be full in less than 5 days assuming the average fill-up
+ rate of the past 48 hours.
+
+ - alert: CephNodeInconsistentMTU
+ expr: |
+ node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
+ scalar(
+ max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
+ quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
+ )
+ or
+ node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
+ scalar(
+ min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
+ quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
+ )
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ summary: MTU settings across Ceph hosts are inconsistent
+ description: >
+ Node {{ $labels.instance }} has a different MTU size ({{ $value }})
+ than the median of devices named {{ $labels.device }}.
+
+ - name: pools
+ rules:
+ - alert: CephPoolGrowthWarning
+ expr: |
+ (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)
+ group_right ceph_pool_metadata) >= 95
+ labels:
+ severity: warning
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.9.2
+ annotations:
+ summary: Pool growth rate may soon exceed its capacity
+ description: >
+ Pool '{{ $labels.name }}' will be full in less than 5 days
+ assuming the average fill-up rate of the past 48 hours.
+ - alert: CephPoolBackfillFull
+ expr: ceph_health_detail{name="POOL_BACKFILLFULL"} > 0
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ summary: Freespace in a pool is too low for recovery/rebalance
+ description: >
+ A pool is approaching its near full threshold, which will
+ prevent rebalance operations from completing. You should
+ consider adding more capacity to the pool.
+
+ - alert: CephPoolFull
+ expr: ceph_health_detail{name="POOL_FULL"} > 0
+ for: 1m
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.9.1
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full
+ summary: Pool is full - writes are blocked
+ description: |
+ A pool has reached its MAX quota, or the OSDs supporting the pool
+ have reached their FULL threshold. Until this is resolved, writes to
+ the pool will be blocked.
+ Pool Breakdown (top 5)
+ {{- range query "topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))" }}
+ - {{ .Labels.name }} at {{ .Value }}%
+ {{- end }}
+ Either increase the pool's quota, or add capacity to the cluster first,
+ then increase its quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)
+ - alert: CephPoolNearFull
+ expr: ceph_health_detail{name="POOL_NEAR_FULL"} > 0
+ for: 5m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ summary: One or more Ceph pools are getting full
+ description: |
+ A pool has exceeded its warning (percent full) threshold, or the OSDs
+ supporting the pool have reached their NEARFULL thresholds. Writes may
+ continue, but you are at risk of the pool going read only if more capacity
+ isn't made available.
+
+ Determine the affected pool with 'ceph df detail', for example looking
+ at QUOTA BYTES and STORED. Either increase the pool's quota, or add
+ capacity to the cluster first, then increase its quota
+ (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)
+ - name: healthchecks
+ rules:
+ - alert: CephSlowOps
+ expr: ceph_healthcheck_slow_ops > 0
+ for: 30s
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
+ summary: MON/OSD operations are slow to complete
+ description: >
+ {{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)
+# cephadm alerts
+ - name: cephadm
+ rules:
+ - alert: CephadmUpgradeFailed
+ expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0
+ for: 30s
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.11.2
+ annotations:
+ summary: Ceph version upgrade has failed
+ description: >
+ The cephadm cluster upgrade process has failed. The cluster remains in
+ an undetermined state.
+
+ Please review the cephadm logs to understand the nature of the issue
+ - alert: CephadmDaemonFailed
+ expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0
+ for: 30s
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.11.1
+ annotations:
+ summary: A ceph daemon managed by cephadm is down
+ description: >
+ A daemon managed by cephadm is no longer active. Determine which
+ daemon is down with 'ceph health detail'. You may start daemons with
+ 'ceph orch daemon start <daemon_id>'
+ - alert: CephadmPaused
+ expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0
+ for: 1m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused
+ summary: Orchestration tasks via cephadm are PAUSED
+ description: >
+ Cluster management has been paused manually. This will prevent the
+ orchestrator from performing service management and reconciliation. If this
+ is not intentional, resume cephadm operations with 'ceph orch resume'
+
+# prometheus alerts
+ - name: PrometheusServer
+ rules:
+ - alert: PrometheusJobMissing
+ expr: absent(up{job="ceph"})
+ for: 30s
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.12.1
+ annotations:
+ summary: The scrape job for Ceph is missing from Prometheus
+ description: |
+ The prometheus job that scrapes from Ceph is no longer defined. This
+ effectively means you'll have no metrics or alerts for the cluster.
+
+ Please review the job definitions in the prometheus.yml file of the prometheus
+ instance.
+# Object related events
+ - name: rados
+ rules:
+ - alert: CephObjectMissing
+ expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1
+ for: 30s
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.10.1
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound
+ summary: Object(s) has been marked UNFOUND
+ description: |
+ A version of a RADOS object cannot be found, even though all OSDs are up. I/O
+ requests for this object from clients will block (hang). Resolving this issue may
+ require the object to be rolled back to a prior version manually, and manually verified.
+# Generic
+ - name: generic
+ rules:
+ - alert: CephDaemonCrash
+ expr: ceph_health_detail{name="RECENT_CRASH"} == 1
+ for: 1m
+ labels:
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.1.2
+ annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash
+ summary: One or more Ceph daemons have crashed, and are pending acknowledgement
+ description: |
+ One or more daemons have crashed recently, and need to be acknowledged. This notification
+ ensures that software crashes don't go unseen. To acknowledge a crash, use the
+ 'ceph crash archive <id>' command.
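Before wiring the file into the chart (see the additionalPrometheusRulesMap change further down), the rules can be sanity-checked offline with `promtool check rules prometheus_alerts.yml` and `promtool test rules <test file>`. The snippet below is a minimal, hypothetical unit-test sketch (the file name and input series are assumptions, not part of this change) asserting that CephHealthError fires once ceph_health_status has held 2 for longer than the 5-minute "for" clause:

# prometheus_alerts_test.yml -- illustrative only, not shipped by this change
rule_files:
  - prometheus_alerts.yml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # ceph_health_status == 2 corresponds to HEALTH_ERR; hold it long enough to satisfy "for: 5m"
      - series: 'ceph_health_status{job="ceph"}'
        values: '2x10'
    alert_rule_test:
      - eval_time: 6m
        alertname: CephHealthError
        exp_alerts:
          - exp_labels:
              job: ceph
              severity: critical
              type: ceph_default
              oid: 1.3.6.1.4.1.50495.1.2.1.2.1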
diff --git a/roles/kube_prometheus_stack/tasks/main.yml b/roles/kube_prometheus_stack/tasks/main.yml
index d32ea15..4e866a9 100644
--- a/roles/kube_prometheus_stack/tasks/main.yml
+++ b/roles/kube_prometheus_stack/tasks/main.yml
@@ -50,11 +50,33 @@
healthcheck-client.crt: "{{ _etcd_healthcheck_client_crt.content }}"
healthcheck-client.key: "{{ _etcd_healthcheck_client_key.content }}"
+- name: Create CRDs for Prometheus Operator
+ kubernetes.core.k8s:
+ state: present
+ definition: "{{ lookup('ansible.builtin.url', 'https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.57.0/example/prometheus-operator-crd/monitoring.coreos.com_' ~ item ~ '.yaml', split_lines=false) | regex_replace('- =$', '- \"=\"', multiline=True) | from_yaml_all }}" # yamllint disable-line rule:line-length
+ apply: true
+ server_side_apply:
+ field_manager: Atmosphere
+ force_conflicts: true
+ loop:
+ - alertmanagerconfigs
+ - alertmanagers
+ - podmonitors
+ - probes
+ - prometheuses
+ - prometheusrules
+ - servicemonitors
+ - thanosrulers
+ # NOTE(mnaser): We replace `- =` with `- "="` to avoid a YAML error; this also
+ # breaks idempotency, so we set `changed_when: false`.
+ # See: https://github.com/yaml/pyyaml/issues/619
+ changed_when: false
+
- name: Deploy Helm chart
kubernetes.core.helm:
name: kube-prometheus-stack
chart_ref: prometheus-community/kube-prometheus-stack
- chart_version: 30.2.0
+ chart_version: 36.2.0
release_namespace: monitoring
kubeconfig: /etc/kubernetes/admin.conf
values: "{{ _kube_prometheus_stack_values | combine(kube_prometheus_stack_values, recursive=True) }}"
diff --git a/roles/kube_prometheus_stack/vars/main.yml b/roles/kube_prometheus_stack/vars/main.yml
index 8de845f..2e098c5 100644
--- a/roles/kube_prometheus_stack/vars/main.yml
+++ b/roles/kube_prometheus_stack/vars/main.yml
@@ -135,6 +135,25 @@
openstack-control-plane: enabled
secrets:
- kube-prometheus-stack-etcd-client-cert
+ additionalServiceMonitors:
+ - name: ceph
+ selector:
+ matchLabels:
+ application: ceph
+ jobLabel: application
+ namespaceSelector:
+ matchNames:
+ - openstack
+ endpoints:
+ - port: metrics
+ honorLabels: true
+ relabelings:
+ - action: replace
+ regex: (.*)
+ replacement: ceph
+ targetLabel: cluster
+ - action: "labeldrop"
+ regex: "^(container|endpoint|namespace|pod|service)$"
prometheusOperator:
admissionWebhooks:
patch:
@@ -153,8 +172,8 @@
- --collector.diskstats.ignored-devices=^(ram|loop|nbd|fd|(h|s|v|xv)d[a-z]|nvme\\d+n\\d+p)\\d+$
- --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|fuse.squashfuse_ll|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$
- --collector.filesystem.mount-points-exclude=^/(dev|proc|run/credentials/.+|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+|var/lib/kubelet/plugins/kubernetes.io/csi/.+|run/containerd/.+)($|/)
- - --collector.netclass.ignored-devices=^(lxc|cilium_|qbr|qvb|qvo|ovs-system).*$
- - --collector.netdev.device-exclude=^(lxc|cilium_|qbr|qvb|qvo|ovs-system).*$
+ - --collector.netclass.ignored-devices=^(lxc|cilium_|qbr|qvb|qvo|tap|ovs-system|br).*$
+ - --collector.netdev.device-exclude=^(lxc|cilium_|qbr|qvb|qvo|tap|ovs-system|br).*$
prometheus:
monitor:
relabelings:
@@ -163,6 +182,7 @@
- action: "labeldrop"
regex: "^(container|endpoint|namespace|pod|service)$"
additionalPrometheusRulesMap:
+ ceph: "{{ lookup('ansible.builtin.file', 'prometheus_alerts.yml') | from_yaml }}"
coredns:
groups:
- name: coredns
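For reference, the additionalServiceMonitors entry added in this hunk is rendered by kube-prometheus-stack into a ServiceMonitor roughly like the hand-written sketch below (the metadata is an approximation; the exact name depends on how the chart templates it). It scrapes the Service labelled application=ceph in the openstack namespace, forces a cluster="ceph" label onto every sample, and drops the per-target Kubernetes labels:

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: ceph                  # the chart may prefix this with the release name; approximation
  namespace: monitoring
spec:
  jobLabel: application
  selector:
    matchLabels:
      application: ceph
  namespaceSelector:
    matchNames:
      - openstack
  endpoints:
    - port: metrics
      honorLabels: true
      relabelings:
        - action: replace
          regex: (.*)
          replacement: ceph
          targetLabel: cluster
        - action: labeldrop
          regex: "^(container|endpoint|namespace|pod|service)$"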
diff --git a/tools/generate-galaxy-yml.py b/tools/generate-galaxy-yml.py
index e3e6c29..d8e2163 100644
--- a/tools/generate-galaxy-yml.py
+++ b/tools/generate-galaxy-yml.py
@@ -14,7 +14,7 @@
'ansible.utils': '2.5.2',
'community.crypto': '2.2.3',
'community.general': '4.5.0',
- 'kubernetes.core': '2.2.3',
+ 'kubernetes.core': '2.3.2',
'openstack.cloud': '1.7.0',
},
'build_ignore': [