fix: add alert for a high ratio of HTTP 5xx responses
diff --git a/roles/kube_prometheus_stack/files/jsonnet/openstack.libsonnet b/roles/kube_prometheus_stack/files/jsonnet/openstack.libsonnet
index 4e78c21..9d05f36 100644
--- a/roles/kube_prometheus_stack/files/jsonnet/openstack.libsonnet
+++ b/roles/kube_prometheus_stack/files/jsonnet/openstack.libsonnet
@@ -2,6 +2,29 @@
prometheusAlerts+: {
groups+: [
{
+ name: 'api',
+ rules: [
+ {  // Fires per ingress `service` label when 5xx responses exceed 1% of all requests.
+   alert: 'HighInternalServerErrors',
+   expr: |||
+     (
+       sum(rate(nginx_ingress_controller_requests{status=~"5[0-9]{2}"}[5m])) by (service)
+       /
+       sum(rate(nginx_ingress_controller_requests[5m])) by (service)
+     ) > 0.01
+   |||,
+   'for': '2m',  // key is quoted because `for` is a reserved word in Jsonnet
+   labels: {
+     severity: 'P2',
+   },
+   annotations: {
+     summary: 'High percentage of HTTP 5xx errors',
+     description: 'The service {{ $labels.service }} is returning HTTP 5xx errors for more than 1% of requests (5m rate, sustained for 2m); current ratio: {{ $value | humanizePercentage }}.',
+   },
+ },
+ ],
+ },
+ {
name: 'cinder',
rules: [
{