- name: alert.rules | |
rules: | |
- alert: HAproxyMysqlClusterHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="mysql_cluster"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: ContrailNamedProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-named"} == 0) >= count(procstat_running{process_name="contrail-named"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-named" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: DockerServiceDockerRegistryCriticalReplicasNumber | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="docker_registry"}[1m])) <= 3 * 0.4 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "docker_registry" | |
annotations: | |
description: "{{ $value }}/3 replicas are running for the Docker Swarn service 'docker_registry' for 2 minutes." | |
summary: "Docker Swarm service docker_registry invalid number of replicas for 2 minutes" | |
- alert: HAproxyNovaApiBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="nova_api"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="nova_api"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="nova_api"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: DockerServiceMonitoringServerWarningReplicasNumber | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_server"}[1m])) <= 2 * 0.7 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "monitoring_server" | |
annotations: | |
description: "{{ $value }}/2 replicas are running for the Docker Swarn service 'monitoring_server' for 2 minutes." | |
summary: "Docker Swarm service monitoring_server invalid number of replicas for 2 minutes" | |
- alert: ContrailWebServerProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-web-server"} == 0) >= count(procstat_running{process_name="contrail-web-server"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-web-server" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: HAproxyContrailApiHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="contrail_api"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: ContrailControlProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-control"} == 0) >= count(procstat_running{process_name="contrail-control"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-control" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: HAproxyHeatApiBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="heat_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="heat_api"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: ContrailIrondProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-irond"} == 0) >= count(procstat_running{process_name="contrail-irond"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-irond" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailJobServerProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-job-server"} == 0) == count(procstat_running{process_name="contrail-job-server"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-job-server" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: CassandraServerProcessInfo | |
expr: >- | |
procstat_running{process_name="cassandra-server"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "cassandra-server" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: ContrailNodemgrConfigProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-nodemgr-config"} == 0) >= count(procstat_running{process_name="contrail-nodemgr-config"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-nodemgr-config" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailSupervisordAnalyticsProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-supervisord-analytics"} == 0) >= count(procstat_running{process_name="contrail-supervisord-analytics"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-supervisord-analytics" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: DockerServiceElasticsearchElasticsearchclusterReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="elasticsearch_elasticsearch-cluster"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="elasticsearch_elasticsearch-cluster"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "elasticsearch_elasticsearch-cluster" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'elasticsearch_elasticsearch-cluster'. for 2 minutes" | |
summary: "Docker Swarm service elasticsearch_elasticsearch-cluster down for 2 minutes" | |
- alert: HAproxyRabbitmqClusterBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="rabbitmq_cluster"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="rabbitmq_cluster"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailSupervisordConfigProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-supervisord-config"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-supervisord-config" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: ContrailVrouterAgentProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-vrouter-agent"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-vrouter-agent" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: KibanaProcessWarning | |
expr: >- | |
count(procstat_running{process_name="kibana"} == 0) >= count(procstat_running{process_name="kibana"}) * 0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "kibana" | |
annotations: | |
description: "More than 30.0% of Kibana services are down" | |
summary: "More than 30.0% of Kibana services are down" | |
- alert: ContrailDiscoveryProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-discovery"} == 0) >= count(procstat_running{process_name="contrail-discovery"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-discovery" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: HAproxyKeystoneAdminApiHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="keystone_admin_api"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: CinderServicesInfo | |
expr: >- | |
openstack_cinder_service == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "'{{ $labels.service }}' is down on {{ $labels.hostname }} for the last 2 minutes." | |
summary: "'{{ $labels.service }}' is down" | |
- alert: DockerServiceMonitoringPushgatewayReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_pushgateway"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_pushgateway"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "monitoring_pushgateway" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'monitoring_pushgateway'. for 2 minutes" | |
summary: "Docker Swarm service monitoring_pushgateway down for 2 minutes" | |
- alert: SystemMemoryAvailableTooLow | |
expr: >- | |
avg_over_time(mem_available_percent[5m]) < 5.0 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "system" | |
annotations: | |
description: "The percentage of free memory is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold=5.0%)." | |
summary: "Free memory too low on {{ $labels.host }}" | |
- alert: HAproxyGlanceRegistryApiBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="glance_registry_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="glance_registry_api"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: InfluxdbCritical | |
expr: >- | |
count(influxdb_up == 0) >= count(influxdb_up) * 0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "influxdb" | |
annotations: | |
description: "More than 60.0% of InfluxDB services are down" | |
summary: "More than 60.0% of InfluxDB services are down" | |
- alert: DockerServiceMonitoringAlertmanagerReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_alertmanager"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_alertmanager"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "monitoring_alertmanager" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'monitoring_alertmanager'. for 2 minutes" | |
summary: "Docker Swarm service monitoring_alertmanager down for 2 minutes" | |
- alert: NovaAPIDown | |
expr: >- | |
openstack_api_check_status{service=~"nova.*|placement"} == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "Endpoint check for '{{ $labels.service }}' is down for the last 2 minutes" | |
summary: "Endpoint check for '{{ $labels.service }}' is down" | |
- alert: DockerServiceMonitoringAlertmanagerCriticalReplicasNumber | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_alertmanager"}[1m])) <= 2 * 0.4 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "monitoring_alertmanager" | |
annotations: | |
description: "{{ $value }}/2 replicas are running for the Docker Swarn service 'monitoring_alertmanager' for 2 minutes." | |
summary: "Docker Swarm service monitoring_alertmanager invalid number of replicas for 2 minutes" | |
- alert: RedisServerProcessDown | |
expr: >- | |
count(procstat_running{process_name="redis-server"} == 0) == count(procstat_running{process_name="redis-server"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "redis-server" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: HAproxyInfluxdbRelayBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="influxdb_relay"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="influxdb_relay"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="influxdb_relay"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailCollectorAPIDown | |
expr: >- | |
count(http_response_status{service=~"contrail.collector"} == 0) by (service) == count(http_response_status{service=~"contrail.collector"}) by (service) | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "All '{{ $labels.service }}' APIs are down" | |
summary: "All '{{ $labels.service }}' APIs are down" | |
- alert: KafkaServerProcessDown | |
expr: >- | |
count(procstat_running{process_name="kafka-server"} == 0) == count(procstat_running{process_name="kafka-server"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "kafka-server" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: ContrailVrouterAgentProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-vrouter-agent"} == 0) >= count(procstat_running{process_name="contrail-vrouter-agent"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-vrouter-agent" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailNodemgrControlProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-nodemgr-control"} == 0) >= count(procstat_running{process_name="contrail-nodemgr-control"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-nodemgr-control" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: HAproxyInfluxdbRelayBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="influxdb_relay"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="influxdb_relay"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: HAproxyNovaApiBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="nova_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="nova_api"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: HeatAPIDown | |
expr: >- | |
openstack_api_check_status{service=~"heat.*"} == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "Endpoint check for '{{ $labels.service }}' is down for 2 minutes" | |
summary: "Endpoint check for '{{ $labels.service }}' is down" | |
- alert: HAproxyKibanaBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="kibana"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="kibana"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: ContrailXMPPSessionsTooManyVariations | |
expr: >- | |
abs(delta(contrail_xmpp_session_count[2m])) >= 100 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-control" | |
annotations: | |
description: "There are too many XMPP sessions changes on node {{ $labels.host }} (current value={{ $value }}, threshold=100)" | |
summary: "Number of XMPP sessions changed between checks is too high" | |
- alert: ContrailWebServerProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-web-server"} == 0) >= count(procstat_running{process_name="contrail-web-server"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-web-server" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: InfluxdbHTTPPointsWrittenFail | |
expr: >- | |
rate(influxdb_httpd_pointsWrittenFail[2m]) / rate(influxdb_httpd_pointsWrittenOK[2m]) * 100 > 5 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "influxdb" | |
annotations: | |
description: "{{ printf `%.1f` $value }}% of written points have failed on {{ $labels.host }} (threshold=5)." | |
summary: "Influxdb too many failed writes" | |
- alert: ContrailCollectorAPICritical | |
expr: >- | |
count(http_response_status{service=~"contrail.collector"} == 0) by (service) >= count(http_response_status{service=~"contrail.collector"}) by (service) *0.6 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: NovaServicesCritical | |
expr: >- | |
openstack_nova_services{state="down",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} >= on (service) sum(openstack_nova_services{service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"}) by (service) * 0.6 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "More than 60.0% of {{ $labels.service }} services are down for the last 2 minutes" | |
summary: "More than 60.0% of {{ $labels.service }} services are down" | |
- alert: DockerServiceAptlyPublicCriticalReplicasNumber | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="aptly_public"}[1m])) <= 3 * 0.4 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "aptly_public" | |
annotations: | |
description: "{{ $value }}/3 replicas are running for the Docker Swarn service 'aptly_public' for 2 minutes." | |
summary: "Docker Swarm service aptly_public invalid number of replicas for 2 minutes" | |
- alert: NovaTotalFreeMemoryShortage | |
expr: >- | |
(100.0 * openstack_nova_total_free_ram) / (openstack_nova_total_free_ram + openstack_nova_total_used_ram) < 2.0 | |
for: 1m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "nova" | |
annotations: | |
description: "Memory shortage for 1 minutes" | |
summary: "Memory shortage for new instances" | |
- alert: ContrailVrouterDNSXMPPSessionsNone | |
expr: >- | |
max(contrail_vrouter_dns_xmpp) by (host) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-compute" | |
annotations: | |
description: "There are no vRouter DNS-XMPP sessions on node {{ $labels.host }}" | |
summary: "No vRouter DNS-XMPP sessions" | |
- alert: ContrailNamedProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-named"} == 0) >= count(procstat_running{process_name="contrail-named"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-named" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: RabbitMQTooManyMessages | |
expr: >- | |
rabbitmq_overview_messages > 1048576 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "rabbitmq" | |
annotations: | |
description: "The number of outstanding messages in RabbitMQ is too high on node {{ $labels.host }} (current value={{ $value }}, threshold=1048576)." | |
summary: "Too many messages in RabbitMQ" | |
- alert: SystemMemoryAvailableLow | |
expr: >- | |
avg_over_time(mem_available_percent[5m]) < 10.0 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "system" | |
annotations: | |
description: "The percentage of free memory is low on node {{ $labels.host }} (current value={{ $value }}%, threshold=10.0%)." | |
summary: "Free memory low on {{ $labels.host }}" | |
- alert: DockerServiceDockerRegistryWarningReplicasNumber | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="docker_registry"}[1m])) <= 3 * 0.7 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "docker_registry" | |
annotations: | |
description: "{{ $value }}/3 replicas are running for the Docker Swarn service 'docker_registry' for 2 minutes." | |
summary: "Docker Swarm service docker_registry invalid number of replicas for 2 minutes" | |
- alert: GlusterFSDown | |
expr: >- | |
glusterfs_up != 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "glusterfs" | |
annotations: | |
description: "GlusterFS service is down on node {{ $labels.host }}" | |
summary: "GlusterFS service down" | |
- alert: ContrailJobServerProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-job-server"} == 0) >= count(procstat_running{process_name="contrail-job-server"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-job-server" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailXMPPSessionsNoneUp | |
expr: >- | |
max(contrail_xmpp_session_up_count) by (host) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-control" | |
annotations: | |
description: "There are no active XMPP sessions on node {{ $labels.host }}" | |
summary: "no active XMPP sessions" | |
- alert: ZookeeperCritical | |
expr: >- | |
count(zookeeper_up == 0) >= count(zookeeper_up) * 0.6 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "zookeeper" | |
annotations: | |
description: "More than 60.0% of Zookeeper services are down" | |
summary: "More than 60.0% of Zookeeper services are down" | |
- alert: SystemRxPacketsDroppedTooHigh | |
expr: >- | |
rate(net_drop_in[1m]) > 100 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "system" | |
annotations: | |
description: "The rate of received packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}/sec, threshold=100/sec)" | |
summary: "Too many received packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}" | |
- alert: ContrailSchemaProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-schema"} == 0) == count(procstat_running{process_name="contrail-schema"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-schema" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: HAproxyElasticsearchBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="elasticsearch"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: KeystoneErrorLogsTooHigh | |
expr: >- | |
sum(rate(log_messages{service="keystone",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > 0.2 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold=0.2)." | |
summary: "Too many errors in {{ $labels.service }} logs" | |
- alert: HAproxyContrailAnalyticsBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="contrail_analytics"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="contrail_analytics"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: HAproxyHeatCloudwatchApiBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="heat_cloudwatch_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="heat_cloudwatch_api"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: ContrailSvcMonitorProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-svc-monitor"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-svc-monitor" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: NovaTotalFreeVCPUsShortage | |
expr: >- | |
(100.0 * openstack_nova_total_free_vcpus) / (openstack_nova_total_free_vcpus + openstack_nova_total_used_vcpus) < 2.0 | |
for: 1m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "nova" | |
annotations: | |
description: "VPCU shortage for 1 minutes" | |
summary: "VCPU shortage for new instances" | |
- alert: ContrailFlowsInvalidLabelTooMany | |
expr: >- | |
min(contrail_vrouter_flows_invalid_label) by (host) >= 100 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-compute" | |
annotations: | |
description: "There are too many vRouter flows with invalid label on node {{ $labels.host }} (current value={{ $value }}, threshold=100)" | |
summary: "Too many vRouter flows with invalid label" | |
- alert: SystemFreeOpenFilesTooLow | |
expr: >- | |
predict_linear(linux_sysctl_fs_file_nr[1h], 8*3600) > linux_sysctl_fs_file_max | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "system" | |
annotations: | |
description: "Host {{ $labels.host }}) will run out of free open files in less than 8 hours." | |
summary: "Free open files for {{ $labels.path }} too low on {{ $labels.host }}" | |
- alert: SaltMasterProcessDown | |
expr: >- | |
procstat_running{process_name="salt-master"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "salt-master" | |
annotations: | |
description: "Salt-master service is down on node {{ $labels.host }}" | |
summary: "Salt-master service is down" | |
- alert: DockerServiceMonitoringPushgatewayCriticalReplicasNumber | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_pushgateway"}[1m])) <= 2 * 0.4 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "monitoring_pushgateway" | |
annotations: | |
description: "{{ $value }}/2 replicas are running for the Docker Swarn service 'monitoring_pushgateway' for 2 minutes." | |
summary: "Docker Swarm service monitoring_pushgateway invalid number of replicas for 2 minutes" | |
- alert: ContrailNodemgrProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-nodemgr"} == 0) == count(procstat_running{process_name="contrail-nodemgr"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-nodemgr" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: ContrailCollectorAPIWarning | |
expr: >- | |
count(http_response_status{service=~"contrail.collector"} == 0) by (service) >= count(http_response_status{service=~"contrail.collector"}) by (service) *0.3 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: HAproxyNovaMetadataApiBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="nova_metadata_api"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="nova_metadata_api"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="nova_metadata_api"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: SshFailedLoginsTooHigh | |
expr: >- | |
rate(failed_logins_total[5m]) > 0.2 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "ssh" | |
annotations: | |
description: "The rate of failed logins is too high on node {{ $labels.host }} (current value={{ $value }}, threshold=0.2)." | |
summary: "Too many failed SSH logins" | |
- alert: HAproxyInfluxdbRelayHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="influxdb_relay"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: HAproxyHeatCfnApiHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="heat_cfn_api"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: DockerServiceRundeckRundeckapiReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="rundeck_rundeck-api"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="rundeck_rundeck-api"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "rundeck_rundeck-api" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'rundeck_rundeck-api'. for 2 minutes" | |
summary: "Docker Swarm service rundeck_rundeck-api down for 2 minutes" | |
- alert: ContrailTopologyProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-topology"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-topology" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: ContrailBGPSessionsSomeDown | |
expr: >- | |
min(contrail_bgp_session_down_count) by (host) > 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-control" | |
annotations: | |
description: "There are inactive BGP sessions on node {{ $labels.host }}" | |
summary: "inactive BGP sessions" | |
- alert: NovaServicesDown | |
expr: >- | |
openstack_nova_services{state="up",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down for the last 2 minutes" | |
summary: "All {{ $labels.service }} services down" | |
- alert: ContrailSchemaProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-schema"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-schema" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: RedisServerProcessCritical | |
expr: >- | |
count(procstat_running{process_name="redis-server"} == 0) >= count(procstat_running{process_name="redis-server"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "redis-server" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailNodemgrProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-nodemgr"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-nodemgr" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: HAproxyContrailAnalyticsBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="contrail_analytics"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="contrail_analytics"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: ContrailSvcMonitorProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-svc-monitor"} == 0) >= count(procstat_running{process_name="contrail-svc-monitor"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-svc-monitor" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: InfluxdbHTTPClientErrors | |
expr: >- | |
rate(influxdb_httpd_clientError[2m]) / rate(influxdb_httpd_req[2m]) * 100 > 5 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "influxdb" | |
annotations: | |
description: "{{ printf `%.1f` $value }}% of client requests are in error on {{ $labels.host }} (threshold=5)." | |
summary: "Influxdb number of client errors is high" | |
- alert: HAproxyNovaMetadataApiBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="nova_metadata_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="nova_metadata_api"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: CinderServicesWarning | |
expr: >- | |
openstack_cinder_services{service=~"cinder-volume|cinder-scheduler", state="down"} >= on (service) sum(openstack_cinder_services{service=~"cinder-volume|cinder-scheduler"}) by (service) * 0.3 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "{{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than 30.0%)" | |
summary: "More than 30.0% of {{ $labels.service }} services are down" | |
- alert: HAproxyGlanceApiHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="glance_api"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: ContrailVrouterLLSSessionsTooMany | |
expr: >- | |
min(contrail_vrouter_lls) by (host) >= 10 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-compute" | |
annotations: | |
description: "There are too many vRouter LLS sessions on node {{ $labels.host }} (current value={{ $value }}, threshold=10)" | |
summary: "Too many vRouter LLS sessions" | |
- alert: ContrailQueryEngineProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-query-engine"} == 0) >= count(procstat_running{process_name="contrail-query-engine"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-query-engine" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: HAproxyElasticsearchBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailAnalyticsApiProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-analytics-api"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-analytics-api" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: ContrailTopologyProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-topology"} == 0) == count(procstat_running{process_name="contrail-topology"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-topology" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: CinderServicesDown | |
expr: >- | |
openstack_cinder_services{state="up",service=~"cinder-volume|cinder-scheduler"} == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "All {{ $labels.service }} services are down for the last 2 minutes" | |
summary: "All {{ $labels.service }} services are down" | |
- alert: HAproxyElasticsearchBinaryHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="elasticsearch_binary"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: NovaComputesCritical | |
expr: >- | |
openstack_nova_services{state="down",service=~"nova-compute"} >= on (service) sum(openstack_nova_services{service=~"nova-compute"}) by (service) * 0.5
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "More than 50.0% of {{ $labels.service }} services are down for the last 2 minutes" | |
summary: "More than 50.0% of {{ $labels.service }} services are down" | |
- alert: ContrailSchemaProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-schema"} == 0) >= count(procstat_running{process_name="contrail-schema"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-schema" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: GlanceErrorLogsTooHigh | |
expr: >- | |
sum(rate(log_messages{service="glance",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > 0.2 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold=0.2)." | |
summary: "Too many errors in {{ $labels.service }} logs" | |
- alert: DockerServiceMonitoringServerCriticalReplicasNumber | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_server"}[1m])) <= 2 * 0.4 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "monitoring_server" | |
annotations: | |
description: "{{ $value }}/2 replicas are running for the Docker Swarn service 'monitoring_server' for 2 minutes." | |
summary: "Docker Swarm service monitoring_server invalid number of replicas for 2 minutes" | |
- alert: InfluxdbRelayFailedRequests | |
expr: >- | |
rate(influxdb_relay_failed_requests_total[5m]) / rate(influxdb_relay_requests_total[5m]) * 100 > 5 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "influxdb-relay" | |
annotations: | |
description: "{{ printf `%.1f` $value }}% of requests have been dropped on {{ $labels.instance }} (threshold=5)." | |
summary: "InfluxDB Relay too many failed requests" | |
- alert: ContrailVrouterDNSXMPPSessionsTooMany | |
expr: >- | |
min(contrail_vrouter_dns_xmpp) by (host) >= 10 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-compute" | |
annotations: | |
description: "There are too many vRouter DNS-XMPP sessions on node {{ $labels.host }} (current value={{ $value }}, threshold=10)" | |
summary: "Too many vRouter DNS-XMPP sessions" | |
- alert: ContrailAPICritical | |
expr: >- | |
count(http_response_status{service=~"contrail.api"} == 0) by (service) >= count(http_response_status{service=~"contrail.api"}) by (service) *0.6 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailSupervisordAnalyticsProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-supervisord-analytics"} == 0) == count(procstat_running{process_name="contrail-supervisord-analytics"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-supervisord-analytics" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: NovaAPIServiceDown | |
expr: >- | |
http_response_status{service=~"nova-api"} == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for the last 2 minutes." | |
summary: "HTTP check for '{{ $labels.service }}' down" | |
- alert: ContrailNodemgrDatabaseProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-nodemgr-database"} == 0) >= count(procstat_running{process_name="contrail-nodemgr-database"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-nodemgr-database" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailSupervisordConfigProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-supervisord-config"} == 0) == count(procstat_running{process_name="contrail-supervisord-config"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-supervisord-config" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: HAproxyGlanceRegistryApiBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="glance_registry_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="glance_registry_api"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: HAproxyKibanaBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="kibana"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="kibana"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="kibana"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailNodeManagerAPIWarning | |
expr: >- | |
count(http_response_status{service=~"contrail.node.manager"} == 0) by (service) >= count(http_response_status{service=~"contrail.node.manager"}) by (service) *0.3 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: KibanaProcessDown | |
expr: >- | |
count(procstat_running{process_name="kibana"} == 0) == count(procstat_running{process_name="kibana"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "kibana" | |
annotations: | |
description: "All Kibana services are down" | |
summary: "All Kibana services are down" | |
- alert: HAproxyNovaMetadataApiBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="nova_metadata_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="nova_metadata_api"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: NovaTotalFreeMemoryLow | |
expr: >- | |
(100.0 * openstack_nova_total_free_ram) / (openstack_nova_total_free_ram + openstack_nova_total_used_ram) < 10.0 | |
for: 1m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "nova" | |
annotations: | |
description: "Memory low limit for 1 minutes" | |
summary: "Memory low limit for new instances" | |
- alert: ElasticsearchInfo | |
expr: >- | |
elasticsearch_up{host=~'.*'} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "elasticsearch" | |
annotations: | |
description: "Elasticsearch service is down on node {{ $labels.host }}" | |
summary: "Elasticsearch service is down" | |
- alert: KafkaServerProcessWarning | |
expr: >- | |
count(procstat_running{process_name="kafka-server"} == 0) >= count(procstat_running{process_name="kafka-server"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "kafka-server" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailSupervisordControlProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-supervisord-control"} == 0) >= count(procstat_running{process_name="contrail-supervisord-control"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-supervisord-control" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: DockerServiceDevopsportalFrontendReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="devops-portal_frontend"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="devops-portal_frontend"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "devops-portal_frontend" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'devops-portal_frontend'. for 2 minutes" | |
summary: "Docker Swarm service devops-portal_frontend down for 2 minutes" | |
- alert: ContrailVrouterAPIDown | |
expr: >- | |
count(http_response_status{service=~"contrail.vrouter"} == 0) by (service) == count(http_response_status{service=~"contrail.vrouter"}) by (service) | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "All '{{ $labels.service }}' APIs are down" | |
summary: "All '{{ $labels.service }}' APIs are down" | |
- alert: NovaComputesWarning | |
expr: >- | |
openstack_nova_services{state="down",service=~"nova-compute"} >= on (service) sum(openstack_nova_services{service=~"nova-compute"}) by (service) * 0.25 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "More than 25.0% of {{ $labels.service }} services are down for the last 2 minutes" | |
summary: "More than 25.0% of {{ $labels.service }} services are down" | |
- alert: NovaErrorLogsTooHigh | |
expr: >- | |
sum(rate(log_messages{service="nova",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > 0.2 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold=0.2)." | |
summary: "Too many errors in {{ $labels.service }} logs" | |
- alert: NtpOffset | |
expr: >- | |
ntpq_offset >= 250 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "ntp" | |
annotations: | |
description: "NTP offset is higher than 250ms on node {{ $labels.host }}" | |
summary: "NTP offset is too high" | |
- alert: KafkaServerProcessCritical | |
expr: >- | |
count(procstat_running{process_name="kafka-server"} == 0) >= count(procstat_running{process_name="kafka-server"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "kafka-server" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailSupervisordControlProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-supervisord-control"} == 0) >= count(procstat_running{process_name="contrail-supervisord-control"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-supervisord-control" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailNodemgrVrouterProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-nodemgr-vrouter"} == 0) >= count(procstat_running{process_name="contrail-nodemgr-vrouter"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-nodemgr-vrouter" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailNamedProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-named"} == 0) == count(procstat_running{process_name="contrail-named"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-named" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: ContrailSnmpCollectorProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-snmp-collector"} == 0) == count(procstat_running{process_name="contrail-snmp-collector"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-snmp-collector" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: ContrailDeviceManagerProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-device-manager"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-device-manager" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: CassandraServerProcessWarning | |
expr: >- | |
count(procstat_running{process_name="cassandra-server"} == 0) >= count(procstat_running{process_name="cassandra-server"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "cassandra-server" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailVrouterXMPPSessionsTooManyVariations | |
expr: >- | |
abs(delta(contrail_vrouter_xmpp[2m])) >= 5 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-compute" | |
annotations: | |
description: "There are too many vRouter XMPP sessions changes on node {{ $labels.host }} (current value={{ $value }}, threshold=5)" | |
summary: "Number of vRouter XMPP sessions changed between checks is too high" | |
- alert: KeystoneAPIDown | |
expr: >- | |
openstack_api_check_status{service=~"keystone.*"} == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "Endpoint check for '{{ $labels.service }}' is down for 2 minutes" | |
summary: "Endpoint check for '{{ $labels.service }}' is down" | |
- alert: HAproxyNovaNovncHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="nova_novnc"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: ContrailFlowsDiscardTooMany | |
expr: >- | |
rate(contrail_vrouter_flows_discard[5m]) >= 0.1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-compute" | |
annotations: | |
description: "There are too many discarded vRouter flows on node {{ $labels.host }} (current value={{ $value }}, threshold=0.1)" | |
summary: "Too many vRouter discarded flows" | |
- alert: HAproxyElasticsearchBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="elasticsearch"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: HAproxyCinderApiHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="cinder_api"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: HAproxyContrailAnalyticsBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="contrail_analytics"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="contrail_analytics"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="contrail_analytics"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailFlowsQueueLimitExceededTooMany | |
expr: >- | |
rate(contrail_vrouter_flows_flow_queue_limit_exceeded[5m]) >= 0.1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-compute" | |
annotations: | |
description: "There are too many vRouter flows with queue limit exceeded on node {{ $labels.host }} (current value={{ $value }}, threshold=0.1)" | |
summary: "Too many vRouter flows with queue limit exceeded" | |
- alert: ElasticsearchDown | |
expr: >- | |
count(elasticsearch_up{host=~'.*'} == 0) == count(elasticsearch_up{host=~'.*'}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "elasticsearch" | |
annotations: | |
description: "All Elasticsearch services are down" | |
summary: "All Elasticsearch services are down" | |
- alert: ContrailSnmpCollectorProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-snmp-collector"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-snmp-collector" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: ElasticsearchClusterHealthStatusRed | |
expr: >- | |
elasticsearch_cluster_health_status == 3 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "elasticsearch" | |
annotations: | |
description: "The Elasticsearch cluster status is RED for the last 5 minutes." | |
summary: "Elasticsearch cluster status is RED" | |
- alert: HAproxyElasticsearchHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="elasticsearch"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: ContrailFlowsFragErrTooMany | |
expr: >- | |
min(contrail_vrouter_flows_frag_err) by (host) >= 100 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-compute" | |
annotations: | |
description: "There are too many vRouter flows with fragment errors on node {{ $labels.host }} (current value={{ $value }}, threshold=100)" | |
summary: "Too many vRouter flows with fragment errors" | |
- alert: ContrailApiProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-api"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-api" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: HAproxyMysqlClusterBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="mysql_cluster"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="mysql_cluster"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: HAproxyCinderApiBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="cinder_api"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="cinder_api"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="cinder_api"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailSupervisordConfigProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-supervisord-config"} == 0) >= count(procstat_running{process_name="contrail-supervisord-config"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-supervisord-config" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: HAproxyAodhApiBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="aodh-api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="aodh-api"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: HAproxyHeatCfnApiBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="heat_cfn_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="heat_cfn_api"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: DockerServiceJanitorMonkeyCleanupservicemongodbReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="janitor_monkey_cleanup-service-mongodb"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="janitor_monkey_cleanup-service-mongodb"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "janitor_monkey_cleanup-service-mongodb" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'janitor_monkey_cleanup-service-mongodb'. for 2 minutes" | |
summary: "Docker Swarm service janitor_monkey_cleanup-service-mongodb down for 2 minutes" | |
- alert: InfluxdbHTTPPointsWrittenDropped | |
expr: >- | |
rate(influxdb_httpd_pointsWrittenDropped[2m]) / rate(influxdb_httpd_pointsWrittenOK[2m]) * 100 > 5 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "influxdb" | |
annotations: | |
description: "{{ printf `%.1f` $value }}% of written points have been dropped on {{ $labels.host }} (threshold=5)." | |
summary: "Influxdb too many dropped writes" | |
- alert: MemcachedProcessDown | |
expr: >- | |
procstat_running{process_name="memcached"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "memcached" | |
annotations: | |
description: "Memcached service is down on node {{ $labels.host }}" | |
summary: "Memcached service is down" | |
- alert: InfluxdbRelayBufferNearFull | |
expr: >- | |
influxdb_relay_backend_buffer_bytes > 536870912.0 * 70 / 100 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "influxdb-relay" | |
annotations: | |
description: "The buffer size for the {{ $labels.instance }}/{{ $labels.backend }} backend is getting full (current value={{ $value }} bytes, threshold=375809638.4)." | |
summary: "InfluxDB Relay buffer almost full" | |
- alert: HeatAPIServicesInfo | |
expr: >- | |
http_response_status{service=~"heat.*-api"} == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for the last 2 minutes." | |
summary: "HTTP check for '{{ $labels.service }}' down" | |
- alert: ContrailFlowsActiveTooMany | |
expr: >- | |
deriv(contrail_vrouter_flows_active[5m]) >= 100 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-compute" | |
annotations: | |
description: "There are too many active vRouter flows on node {{ $labels.host }} (current value={{ $value }}, threshold=100)" | |
summary: "Too many vRouter active flows" | |
- alert: ContrailNodemgrControlProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-nodemgr-control"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-nodemgr-control" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: NovaTotalFreeVCPUsLow | |
expr: >- | |
(100.0 * openstack_nova_total_free_vcpus) / (openstack_nova_total_free_vcpus + openstack_nova_total_used_vcpus) < 10.0 | |
for: 1m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "nova" | |
annotations: | |
description: "VPCU low limit for 1 minutes" | |
summary: "VCPU low limit for new instances" | |
- alert: ContrailTopologyProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-topology"} == 0) >= count(procstat_running{process_name="contrail-topology"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-topology" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailNodemgrVrouterProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-nodemgr-vrouter"} == 0) == count(procstat_running{process_name="contrail-nodemgr-vrouter"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-nodemgr-vrouter" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: ContrailVrouterXMPPSessionsTooMany | |
expr: >- | |
min(contrail_vrouter_xmpp) by (host) >= 10 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-compute" | |
annotations: | |
description: "There are too many vRouter XMPP sessions on node {{ $labels.host }} (current value={{ $value }}, threshold=10)" | |
summary: "Too many vRouter XMPP sessions" | |
- alert: ZookeeperWarning | |
expr: >- | |
count(zookeeper_up == 0) >= count(zookeeper_up) * 0.3 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "zookeeper" | |
annotations: | |
description: "More than 30.0% of Zookeeper services are down" | |
summary: "More than 30.0% of Zookeeper services are down" | |
- alert: ContrailAlarmGenProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-alarm-gen"} == 0) >= count(procstat_running{process_name="contrail-alarm-gen"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-alarm-gen" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: SystemDiskInodesFull | |
expr: >- | |
disk_inodes_used / disk_inodes_total >= 0.99 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "system" | |
annotations: | |
description: "The disk inodes ({{ $labels.path }}) are used at {{ $value }}% on {{ $labels.host }}." | |
summary: "Inodes for {{ $labels.path }} full on {{ $labels.host }}" | |
- alert: ElasticsearchClusterDiskLowWaterMark | |
expr: >- | |
(max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 85 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "elasticsearch" | |
annotations: | |
description: "Elasticsearch will not allocate new shards to node {{ $labels.host }}" | |
summary: "Elasticsearch low disk watermark [85%] exceeded on node {{ $labels.host}} instance {{ $labels.instance }}" | |
- alert: ContrailQueryEngineProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-query-engine"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-query-engine" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: ContrailDiscoveryAPIWarning | |
expr: >- | |
count(http_response_status{service=~"contrail.discovery"} == 0) by (service) >= count(http_response_status{service=~"contrail.discovery"}) by (service) *0.3 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailVrouterLLSSessionsTooManyVariations | |
expr: >- | |
abs(delta(contrail_vrouter_lls[2m])) >= 5 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-compute" | |
annotations: | |
description: "There are too many vRouter LLS sessions changes on node {{ $labels.host }} (current value={{ $value }}, threshold=5)" | |
summary: "Number of vRouter LLS sessions changed between checks is too high" | |
- alert: ContrailAnalyticsApiProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-analytics-api"} == 0) == count(procstat_running{process_name="contrail-analytics-api"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-analytics-api" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: HAproxyNovaMetadataApiHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="nova_metadata_api"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: RabbitMQDown | |
expr: >- | |
rabbitmq_up != 1 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "rabbitmq" | |
annotations: | |
description: "RabbitMQ service is down on node {{ $labels.host }}" | |
summary: "RabbitMQ service down" | |
- alert: GlanceAPIDown | |
expr: >- | |
max(openstack_api_check_status{service=~"glance.*"}) by (service) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "Endpoint check for '{{ $labels.service }}' is down for 2 minutes" | |
summary: "Endpoint check for '{{ $labels.service }}' is down" | |
- alert: ContrailVrouterAgentProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-vrouter-agent"} == 0) >= count(procstat_running{process_name="contrail-vrouter-agent"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-vrouter-agent" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailXMPPSessionsNone | |
expr: >- | |
max(contrail_xmpp_session_count) by (host) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-control" | |
annotations: | |
description: "There are no XMPP sessions on node {{ $labels.host }}" | |
summary: "No XMPP sessions" | |
- alert: HAproxyInfluxdbRelayBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="influxdb_relay"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="influxdb_relay"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailIrondProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-irond"} == 0) >= count(procstat_running{process_name="contrail-irond"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-irond" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailDiscoveryProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-discovery"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-discovery" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: ContrailAPIWarning | |
expr: >- | |
count(http_response_status{service=~"contrail.api"} == 0) by (service) >= count(http_response_status{service=~"contrail.api"}) by (service) *0.3 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailApiProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-api"} == 0) >= count(procstat_running{process_name="contrail-api"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-api" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: HAproxyHeatApiBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="heat_api"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="heat_api"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="heat_api"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: HeatAPIServicesWarning | |
expr: >- | |
count(http_response_status{service=~"heat.*-api"} == 0) by (service) >= on (service) count(http_response_status{service=~"heat.*-api"}) by (service) * 0.3 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "{{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than 30.0%)" | |
summary: "More than 30.0% of {{ $labels.service }} services are down" | |
- alert: RabbitMQDiskLow | |
expr: >- | |
predict_linear(rabbitmq_node_disk_free[8h], 8*3600) <= rabbitmq_node_disk_free_limit | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "rabbitmq" | |
annotations: | |
description: "The RabbitMQ disk partition will be full in less than 8 hours on node {{ $labels.host }}." | |
summary: "RabbitMQ disk free space too low" | |
- alert: ApacheIdleWorkersShortage | |
expr: >- | |
apache_IdleWorkers == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "apache" | |
annotations: | |
description: "Apache idle workers shortage on node {{ $labels.host }}" | |
summary: "Apache idle workers shortage" | |
- alert: InfluxdbInfo | |
expr: >- | |
influxdb_up == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "influxdb" | |
annotations: | |
description: "InfluxDB service is down on node {{ $labels.host }}" | |
summary: "InfluxDB service down" | |
- alert: ContrailAPIInfo | |
expr: >- | |
http_response_status{service=~"contrail.api"} == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "Endpoint check for '{{ $labels.service }}' is failed for 2 minutes on node {{ $labels.host }}" | |
summary: "Endpoint check for '{{ $labels.service }}' is failed" | |
- alert: InfluxdbWarning | |
expr: >- | |
count(influxdb_up == 0) >= count(influxdb_up) * 0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "influxdb" | |
annotations: | |
description: "More than 30.0% of InfluxDB services are down" | |
summary: "More than 30.0% of InfluxDB services are down" | |
- alert: ContrailDnsProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-dns"} == 0) == count(procstat_running{process_name="contrail-dns"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-dns" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: ContrailDnsProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-dns"} == 0) >= count(procstat_running{process_name="contrail-dns"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-dns" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: SystemTxPacketsDroppedTooHigh | |
expr: >- | |
rate(net_drop_out[1m]) > 100 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "system" | |
annotations: | |
description: "The rate of transmitted packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}/sec, threshold=100/sec)" | |
summary: "Too many transmitted packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}" | |
- alert: ContrailFlowsDropTooMany | |
expr: >- | |
rate(contrail_vrouter_flows_flow_action_drop[5m]) >= 0.2 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-compute" | |
annotations: | |
description: "There are too many dropped vRouter flows on node {{ $labels.host }} (current value={{ $value }} flows/s, threshold=0.2 flows/s)" | |
summary: "Too many vRouter dropped flows" | |
- alert: ContrailNodeManagerAPIInfo | |
expr: >- | |
http_response_status{service=~"contrail.node.manager"} == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "Endpoint check for '{{ $labels.service }}' is failed for 2 minutes on node {{ $labels.host }}" | |
summary: "Endpoint check for '{{ $labels.service }}' is failed" | |
- alert: ContrailJobServerProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-job-server"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-job-server" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: ContrailCollectorProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-collector"} == 0) >= count(procstat_running{process_name="contrail-collector"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-collector" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: HAproxyGlanceRegistryApiBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="glance_registry_api"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="glance_registry_api"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="glance_registry_api"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailDeviceManagerProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-device-manager"} == 0) >= count(procstat_running{process_name="contrail-device-manager"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-device-manager" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailSupervisordControlProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-supervisord-control"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-supervisord-control" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: ContrailNodemgrControlProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-nodemgr-control"} == 0) >= count(procstat_running{process_name="contrail-nodemgr-control"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-nodemgr-control" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailAnalyticsApiProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-analytics-api"} == 0) >= count(procstat_running{process_name="contrail-analytics-api"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-analytics-api" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: NovaComputesDown | |
expr: >- | |
openstack_nova_services{state="up",service=~"nova-compute"} == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down for the last 2 minutes" | |
summary: "All {{ $labels.service }} services are down" | |
- alert: ContrailSupervisordDatabaseProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-supervisord-database"} == 0) >= count(procstat_running{process_name="contrail-supervisord-database"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-supervisord-database" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailSupervisordVrouterProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-supervisord-vrouter"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-supervisord-vrouter" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: ContrailAlarmGenProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-alarm-gen"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-alarm-gen" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: ContrailNodemgrProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-nodemgr"} == 0) >= count(procstat_running{process_name="contrail-nodemgr"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-nodemgr" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailDnsProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-dns"} == 0) >= count(procstat_running{process_name="contrail-dns"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-dns" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: RabbitMQMemoryLow | |
expr: >- | |
(rabbitmq_node_mem_limit - rabbitmq_node_mem_used) <= 104857600 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "rabbitmq" | |
annotations: | |
description: "The amount of free memory is too low on node {{ $labels.host }} (current value={{ $value }}B, threshold=104857600B)." | |
summary: "RabbitMQ free memory too low" | |
- alert: GaleraNodeNotReady | |
expr: >- | |
mysql_wsrep_ready != 1 | |
for: 1m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "mysql" | |
annotations: | |
description: "The Galera service on {{ $labels.host }} is not ready to serve queries." | |
summary: "Galera on {{ $labels.host }} not ready" | |
- alert: HAproxyNovaNovncBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="nova_novnc"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="nova_novnc"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: ContrailNamedProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-named"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-named" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: HAproxyHeatCfnApiBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="heat_cfn_api"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="heat_cfn_api"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="heat_cfn_api"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: CassandraServerProcessDown | |
expr: >- | |
count(procstat_running{process_name="cassandra-server"} == 0) == count(procstat_running{process_name="cassandra-server"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "cassandra-server" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: HAproxyKeystoneAdminApiBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="keystone_admin_api"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="keystone_admin_api"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="keystone_admin_api"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailDiscoveryProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-discovery"} == 0) == count(procstat_running{process_name="contrail-discovery"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-discovery" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: HAproxyGlanceRegistryApiHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="glance_registry_api"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: RabbitMQMemoryFull | |
expr: >- | |
rabbitmq_node_mem_used >= rabbitmq_node_mem_limit | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "rabbitmq" | |
annotations: | |
description: "All producers are blocked because the memory is full on node {{ $labels.host }}." | |
summary: "RabbitMQ producers blocked due to full memory" | |
- alert: HAproxyContrailApiBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="contrail_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="contrail_api"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: KeystoneAPIServiceDown | |
expr: >- | |
http_response_status{service=~"keystone.*"} == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for 2 minutes." | |
summary: "HTTP check for '{{ $labels.service }}' down" | |
- alert: ContrailIfmapServerProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-ifmap-server"} == 0) == count(procstat_running{process_name="contrail-ifmap-server"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-ifmap-server" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: NovaAggregatesFreeMemoryShortage | |
expr: >- | |
(100.0 * openstack_nova_aggregate_free_ram) / (openstack_nova_aggregate_free_ram + openstack_nova_aggregate_used_ram) < 2.0 | |
for: 1m | |
labels: | |
aggregate: "{{ $labels.aggregate }}" | |
route: "email,salesforce" | |
severity: "critical" | |
service: "nova" | |
annotations: | |
description: "Memory shortage for 1 minutes on aggregate {{ $labels.aggregate }}" | |
summary: "Memory shortage for new instances on aggregate {{ $labels.aggregate }}" | |
- alert: HAproxyNovaApiBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="nova_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="nova_api"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: DockerServiceMonitoringAlertmanagerWarningReplicasNumber | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_alertmanager"}[1m])) <= 2 * 0.7 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "monitoring_alertmanager" | |
annotations: | |
description: "{{ $value }}/2 replicas are running for the Docker Swarn service 'monitoring_alertmanager' for 2 minutes." | |
summary: "Docker Swarm service monitoring_alertmanager invalid number of replicas for 2 minutes" | |
- alert: HAproxyKeystoneAdminApiBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="keystone_admin_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="keystone_admin_api"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: HAproxyHeatCfnApiBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="heat_cfn_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="heat_cfn_api"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: ElasticsearchCritical | |
expr: >- | |
count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * 0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "elasticsearch" | |
annotations: | |
description: "More than 60.0% of Elasticsearch services are down" | |
summary: "More than 60.0% of Elasticsearch services are down" | |
- alert: SystemDiskErrors | |
expr: >- | |
increase(hdd_errors_total[5m]) > 0 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "system" | |
annotations: | |
description: "The disk ({{ $labels.device }}) is reporting errors on {{ $labels.host }}." | |
summary: "Disk {{ $labels.device }} is failing" | |
- alert: ContrailApiProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-api"} == 0) == count(procstat_running{process_name="contrail-api"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-api" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: ContrailIfmapServerProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-ifmap-server"} == 0) >= count(procstat_running{process_name="contrail-ifmap-server"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-ifmap-server" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: HAproxyKeystonePublicApiBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="keystone_public_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="keystone_public_api"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: ContrailCollectorProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-collector"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-collector" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: ElasticsearchWarning | |
expr: >- | |
count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * 0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "elasticsearch" | |
annotations: | |
description: "More than 30.0% of Elasticsearch services are down" | |
summary: "More than 30.0% of Elasticsearch services are down" | |
- alert: ContrailVrouterAPIInfo | |
expr: >- | |
http_response_status{service=~"contrail.vrouter"} == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "Endpoint check for '{{ $labels.service }}' is failed for 2 minutes on node {{ $labels.host }}" | |
summary: "Endpoint check for '{{ $labels.service }}' is failed" | |
- alert: SystemDiskSpaceFull | |
expr: >- | |
disk_used_percent >= 99 and disk_inodes_total > 0 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "system" | |
annotations: | |
description: "The disk partition ({{ $labels.path }}) is used at {{ $value }}% on {{ $labels.host }}." | |
summary: "Disk partition {{ $labels.path }} full on {{ $labels.host }}" | |
- alert: ElasticsearchClusterHealthStatusYellow | |
expr: >- | |
elasticsearch_cluster_health_status == 2 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "elasticsearch" | |
annotations: | |
description: "The Elasticsearch cluster status is YELLOW for the last 5 minutes." | |
summary: "Elasticsearch cluster status is YELLOW" | |
- alert: HAproxyKeystonePublicApiBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="keystone_public_api"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="keystone_public_api"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="keystone_public_api"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: HAproxyKeystoneAdminApiBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="keystone_admin_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="keystone_admin_api"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: DockerServicePostgresqlPostgresqldbReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="postgresql_postgresql-db"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="postgresql_postgresql-db"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "postgresql_postgresql-db" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'postgresql_postgresql-db'. for 2 minutes" | |
summary: "Docker Swarm service postgresql_postgresql-db down for 2 minutes" | |
- alert: HAproxyNovaNovncBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="nova_novnc"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="nova_novnc"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: HAproxyMysqlClusterBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="mysql_cluster"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="mysql_cluster"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="mysql_cluster"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailControlProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-control"} == 0) >= count(procstat_running{process_name="contrail-control"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-control" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailXMPPSessionsTooMany | |
expr: >- | |
min(contrail_xmpp_session_count) by (host) >= 500 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-control" | |
annotations: | |
description: "There are too many XMPP sessions on node {{ $labels.host }} (current value={{ $value }}, threshold=500)" | |
summary: "Too many XMPP sessions" | |
- alert: HAproxyContrailApiBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="contrail_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="contrail_api"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: ContrailFlowsInvalidNHTooMany | |
expr: >- | |
rate(contrail_vrouter_flows_invalid_nh[5m]) >= 0.1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-compute" | |
annotations: | |
description: "There are too many vRouter flows with invalid next hop on node {{ $labels.host }} (current value={{ $value }}, threshold=0.1)" | |
summary: "Too many vRouter flows with invalid next hop" | |
- alert: HAproxyHeatApiBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="heat_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="heat_api"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailSupervisordVrouterProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-supervisord-vrouter"} == 0) >= count(procstat_running{process_name="contrail-supervisord-vrouter"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-supervisord-vrouter" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: SystemSwapIn | |
expr: >- | |
rate(swap_in[2m]) > 1048576 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "system" | |
annotations: | |
description: "The rate of swap input bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold=1048576b/s)." | |
summary: "Swap input throughput too high on {{ $labels.host }}" | |
- alert: ContrailNodemgrDatabaseProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-nodemgr-database"} == 0) == count(procstat_running{process_name="contrail-nodemgr-database"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-nodemgr-database" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: ContrailVrouterAPICritical | |
expr: >- | |
count(http_response_status{service=~"contrail.vrouter"} == 0) by (service) >= count(http_response_status{service=~"contrail.vrouter"}) by (service) *0.6 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: HAproxyContrailDiscoveryBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="contrail_discovery"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="contrail_discovery"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="contrail_discovery"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailNodemgrDatabaseProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-nodemgr-database"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-nodemgr-database" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: OutOfMemoryTooHigh | |
expr: >- | |
rate(out_of_memory_total[5m]) > 0.0011 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "system" | |
annotations: | |
description: "The rate of out-of-memory errors is too high on node {{ $labels.host }} (current value={{ $value }}, threshold=0.0011)." | |
summary: "Too many out-of-memory errors" | |
- alert: ContrailSnmpCollectorProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-snmp-collector"} == 0) >= count(procstat_running{process_name="contrail-snmp-collector"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-snmp-collector" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailCollectorProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-collector"} == 0) == count(procstat_running{process_name="contrail-collector"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-collector" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: InfluxdbDown | |
expr: >- | |
count(influxdb_up == 0) == count(influxdb_up) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "influxdb" | |
annotations: | |
description: "All InfluxDB services are down" | |
summary: "All InfluxDB services are down" | |
- alert: DockerServiceMonitoringPushgatewayWarningReplicasNumber | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_pushgateway"}[1m])) <= 2 * 0.7 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "monitoring_pushgateway" | |
annotations: | |
description: "{{ $value }}/2 replicas are running for the Docker Swarn service 'monitoring_pushgateway' for 2 minutes." | |
summary: "Docker Swarm service monitoring_pushgateway invalid number of replicas for 2 minutes" | |
- alert: HAproxyNovaNovncBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="nova_novnc"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="nova_novnc"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="nova_novnc"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailSupervisordConfigProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-supervisord-config"} == 0) >= count(procstat_running{process_name="contrail-supervisord-config"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-supervisord-config" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: DockerServiceSecurityMonkeySecurityauditschedulerReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="security_monkey_security-audit-scheduler"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="security_monkey_security-audit-scheduler"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "security_monkey_security-audit-scheduler" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'security_monkey_security-audit-scheduler'. for 2 minutes" | |
summary: "Docker Swarm service security_monkey_security-audit-scheduler down for 2 minutes" | |
- alert: HAproxyAodhApiBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="aodh-api"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="aodh-api"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="aodh-api"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailSnmpCollectorProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-snmp-collector"} == 0) >= count(procstat_running{process_name="contrail-snmp-collector"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-snmp-collector" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: HAproxyGlanceApiBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="glance_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="glance_api"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: ContrailApiProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-api"} == 0) >= count(procstat_running{process_name="contrail-api"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-api" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: DockerServicePushkinPushkinapiReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="pushkin_pushkin-api"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="pushkin_pushkin-api"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "pushkin_pushkin-api" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'pushkin_pushkin-api'. for 2 minutes" | |
summary: "Docker Swarm service pushkin_pushkin-api down for 2 minutes" | |
- alert: HaproxyDown | |
expr: >- | |
haproxy_up != 1 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy" | |
annotations: | |
description: "Haproxy service is down on node {{ $labels.host }}" | |
summary: "Haproxy service down" | |
- alert: HAproxyKeystonePublicApiBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="keystone_public_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="keystone_public_api"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: SystemCpuIdleTooLow | |
expr: >- | |
avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) < 10.0 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "system" | |
annotations: | |
description: "The average idle CPU usage is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold=10.0%)." | |
summary: "Idle CPU usage too low on {{ $labels.host }}" | |
- alert: CassandraServerProcessCritical | |
expr: >- | |
count(procstat_running{process_name="cassandra-server"} == 0) >= count(procstat_running{process_name="cassandra-server"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "cassandra-server" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailFlowsTableFullTooMany | |
expr: >- | |
min(contrail_vrouter_flows_flow_table_full) by (host) >= 100 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-compute" | |
annotations: | |
description: "There are too many vRouter flows with table full on node {{ $labels.host }} (current value={{ $value }}, threshold=100)" | |
summary: "Too many vRouter flows with table full" | |
- alert: HeatErrorLogsTooHigh | |
expr: >- | |
sum(rate(log_messages{service="heat",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > 0.2 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold=0.2)." | |
summary: "Too many errors in {{ $labels.service }} logs" | |
- alert: DockerServiceJanitorMonkeyCleanupserviceapiReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="janitor_monkey_cleanup-service-api"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="janitor_monkey_cleanup-service-api"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "janitor_monkey_cleanup-service-api" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'janitor_monkey_cleanup-service-api'. for 2 minutes" | |
summary: "Docker Swarm service janitor_monkey_cleanup-service-api down for 2 minutes" | |
- alert: ContrailCollectorProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-collector"} == 0) >= count(procstat_running{process_name="contrail-collector"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-collector" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: HAproxyNeutronApiBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="neutron_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="neutron_api"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: ContrailFlowsInvalidITFTooMany | |
expr: >- | |
rate(contrail_vrouter_flows_composite_invalid_interface[5m]) >= 0.05 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-compute" | |
annotations: | |
description: "There are too many vRouter flows with composite invalid interface on node {{ $labels.host }} (current value={{ $value }}, threshold=0.05)" | |
summary: "Too many vRouter flows with composite invalid interface" | |
- alert: DockerServiceAptlyPublicReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="aptly_public"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="aptly_public"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "aptly_public" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'aptly_public'. for 2 minutes" | |
summary: "Docker Swarm service aptly_public down for 2 minutes" | |
- alert: NovaAggregatesFreeMemoryLow | |
expr: >- | |
(100.0 * openstack_nova_aggregate_free_ram) / (openstack_nova_aggregate_free_ram + openstack_nova_aggregate_used_ram) < 10.0 | |
for: 1m | |
labels: | |
aggregate: "{{ $labels.aggregate }}" | |
route: "email,salesforce" | |
severity: "warning" | |
service: "nova" | |
annotations: | |
description: "Memory low limit for 1 minutes on aggregate {{ $labels.aggregate }}" | |
summary: "Memory low limit for new instances on aggregate {{ $labels.aggregate }}" | |
- alert: KeystoneFailedAuthsTooHigh | |
expr: >- | |
rate(authentications_total_failed[5m]) > rate(authentications_total_all[5m]) * 50 / 100 and rate(authentications_total_all[5m]) > 0.1 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "keystone" | |
annotations: | |
description: "The rate of failed authentications in Keystone over the last 5 minutes is too high (current value={{ $value }}, threshold=50)." | |
summary: "Too many failed authentications in Keystone" | |
- alert: NovaAggregatesFreeVCPUsLow | |
expr: >- | |
(100.0 * openstack_nova_aggregate_free_vcpus) / (openstack_nova_aggregate_free_vcpus + openstack_nova_aggregate_used_vcpus) < 10.0 | |
for: 1m | |
labels: | |
aggregate: "{{ $labels.aggregate }}" | |
route: "email,salesforce" | |
severity: "warning" | |
service: "nova" | |
annotations: | |
description: "VPCU low limit for 1 minutes on aggregate {{ $labels.aggregate }}" | |
summary: "VCPU low limit for new instances on aggregate {{ $labels.aggregate }}" | |
- alert: CinderAPIServiceInfo | |
expr: >- | |
http_response_status{service=~"cinder-api"} == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for the last 2 minutes." | |
summary: "HTTP check for '{{ $labels.service }}' down" | |
- alert: HAproxyNeutronApiBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="neutron_api"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="neutron_api"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="neutron_api"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailAlarmGenProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-alarm-gen"} == 0) >= count(procstat_running{process_name="contrail-alarm-gen"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-alarm-gen" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: PrometheusRemoteStorageQueue | |
expr: >- | |
prometheus_remote_storage_queue_length / prometheus_remote_storage_queue_capacity * 100 > 75.0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "prometheus" | |
annotations: | |
description: "The Prometheus {{ $labels.instance }} remote storage queue almost full (current value={{ $value }}%, threshold=75.0%)" | |
summary: "Prometheus {{ $labels.instance }} remote storage queue is filling" | |
- alert: HAproxyContrailApiBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="contrail_api"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="contrail_api"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="contrail_api"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: HAproxyKeystonePublicApiHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="keystone_public_api"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: DockerServiceMonitoringRemoteAgentReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_remote_agent"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_remote_agent"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "monitoring_remote_agent" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'monitoring_remote_agent'. for 2 minutes" | |
summary: "Docker Swarm service monitoring_remote_agent down for 2 minutes" | |
- alert: ContrailAPIDown | |
expr: >- | |
count(http_response_status{service=~"contrail.api"} == 0) by (service) == count(http_response_status{service=~"contrail.api"}) by (service) | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "All '{{ $labels.service }}' APIs are down" | |
summary: "All '{{ $labels.service }}' APIs are down" | |
- alert: DockerServiceMonitoringRemoteStorageAdapterReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_remote_storage_adapter"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_remote_storage_adapter"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "monitoring_remote_storage_adapter" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'monitoring_remote_storage_adapter'. for 2 minutes" | |
summary: "Docker Swarm service monitoring_remote_storage_adapter down for 2 minutes" | |
- alert: HAproxyElasticsearchBinaryBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch_binary"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch_binary"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailJobServerProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-job-server"} == 0) >= count(procstat_running{process_name="contrail-job-server"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-job-server" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: GlanceAPIServiceDown | |
expr: >- | |
http_response_status{service=~"glance-api"} == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for 2 minutes." | |
summary: "HTTP check for '{{ $labels.service }}' down" | |
- alert: ContrailQueryEngineProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-query-engine"} == 0) >= count(procstat_running{process_name="contrail-query-engine"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-query-engine" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: DockerdProcessDown | |
expr: >- | |
procstat_running{process_name="dockerd"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "docker" | |
annotations: | |
description: "Dockerd service is down on node {{ $labels.host }}" | |
summary: "Dockerd service is down" | |
- alert: ContrailVrouterXMPPSessionsNone | |
expr: >- | |
max(contrail_vrouter_xmpp) by (host) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-compute" | |
annotations: | |
description: "There are no vRouter XMPP sessions on node {{ $labels.host }}" | |
summary: "No vRouter XMPP sessions" | |
- alert: ContrailQueryEngineProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-query-engine"} == 0) == count(procstat_running{process_name="contrail-query-engine"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-query-engine" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: HAproxyHeatCloudwatchApiBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="heat_cloudwatch_api"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="heat_cloudwatch_api"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="heat_cloudwatch_api"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: DockerServiceMonitoringRelayCriticalReplicasNumber | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_relay"}[1m])) <= 2 * 0.4 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "monitoring_relay" | |
annotations: | |
description: "{{ $value }}/2 replicas are running for the Docker Swarn service 'monitoring_relay' for 2 minutes." | |
summary: "Docker Swarm service monitoring_relay invalid number of replicas for 2 minutes" | |
- alert: NovaLibvirtDown | |
expr: >- | |
max(libvirt_up) by (host) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "libvirt" | |
annotations: | |
description: "libvirt check on '{{ $labels.host }}' is down for 2 minutes" | |
summary: "libvirt check on '{{ $labels.host }}' is down" | |
- alert: HAproxyContrailAnalyticsHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="contrail_analytics"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: NovaAggregatesFreeVCPUsShortage | |
expr: >- | |
(100.0 * openstack_nova_aggregate_free_vcpus) / (openstack_nova_aggregate_free_vcpus + openstack_nova_aggregate_used_vcpus) < 2.0 | |
for: 1m | |
labels: | |
aggregate: "{{ $labels.aggregate }}" | |
route: "email,salesforce" | |
severity: "critical" | |
service: "nova" | |
annotations: | |
description: "VPCU shortage for 1 minutes on aggregate {{ $labels.aggregate }}" | |
summary: "VCPU shortage for new instances on aggregate {{ $labels.aggregate }}" | |
- alert: ContrailDiscoveryProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-discovery"} == 0) >= count(procstat_running{process_name="contrail-discovery"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-discovery" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailSupervisordControlProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-supervisord-control"} == 0) == count(procstat_running{process_name="contrail-supervisord-control"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-supervisord-control" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: ContrailSvcMonitorProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-svc-monitor"} == 0) >= count(procstat_running{process_name="contrail-svc-monitor"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-svc-monitor" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: ZookeeperDown | |
expr: >- | |
count(zookeeper_up == 0) == count(zookeeper_up) | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "zookeeper" | |
annotations: | |
description: "All Zookeeper services are down" | |
summary: "All Zookeeper services are down" | |
- alert: ContrailAlarmGenProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-alarm-gen"} == 0) == count(procstat_running{process_name="contrail-alarm-gen"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-alarm-gen" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: CinderAPIDown | |
expr: >- | |
max(openstack_api_check_status{service=~"cinder.*"}) by (service) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "Endpoint check for '{{ $labels.service }}' is down for the last 2 minutes" | |
summary: "Endpoint check for '{{ $labels.service }}' is down" | |
- alert: HAproxyRabbitmqClusterBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="rabbitmq_cluster"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="rabbitmq_cluster"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: ContrailControlProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-control"} == 0) == count(procstat_running{process_name="contrail-control"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-control" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: CinderServicesCritical | |
expr: >- | |
openstack_cinder_services{service=~"cinder-volume|cinder-scheduler", state="down"} >= on (service) sum(openstack_cinder_services{service=~"cinder-volume|cinder-scheduler"}) by (service) * 0.6 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "{{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than 60.0%)" | |
summary: "More than 60.0% of {{ $labels.service }} services are down" | |
- alert: RedisServerProcessInfo | |
expr: >- | |
procstat_running{process_name="redis-server"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "redis-server" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: CinderErrorLogsTooHigh | |
expr: >- | |
sum(rate(log_messages{service="cinder",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > 0.2 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold=0.2)." | |
summary: "Too many errors in {{ $labels.service }} logs" | |
- alert: AlertmanagerNotificationFailed | |
expr: >- | |
rate(alertmanager_notifications_failed_total[5m]) > 0.3 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "alertmanager" | |
annotations: | |
description: "Alertmanager {{ $labels.instance }} failed notifications for {{ $labels.integration }} (current value={{ $value }}, threshold=0.3)" | |
summary: "Alertmanager {{ $labels.instance }} failed notifications" | |
- alert: RedisServerProcessWarning | |
expr: >- | |
count(procstat_running{process_name="redis-server"} == 0) >= count(procstat_running{process_name="redis-server"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "redis-server" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: NeutronAPIDown | |
expr: >- | |
openstack_api_check_status{service=~"neutron.*"} == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "Endpoint check for '{{ $labels.service }}' is down for 2 minutes" | |
summary: "Endpoint check for '{{ $labels.service }}' is down" | |
- alert: GlanceRegistryServiceDown | |
expr: >- | |
http_response_status{service=~"glance-registry"} == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for 2 minutes." | |
summary: "HTTP check for '{{ $labels.service }}' down" | |
- alert: ContrailIfmapServerProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-ifmap-server"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-ifmap-server" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: HAproxyElasticsearchBinaryBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch_binary"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="elasticsearch_binary"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: ContrailWebServerProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-web-server"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-web-server" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: ContrailWebServerProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-web-server"} == 0) == count(procstat_running{process_name="contrail-web-server"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-web-server" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: SystemDiskSpaceTooLow | |
expr: >- | |
predict_linear(disk_free[1h], 8*3600) < 0 | |
for: 15m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "system" | |
annotations: | |
description: "The disk partition ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}." | |
summary: "Free space for {{ $labels.path }} too low on {{ $labels.host }}" | |
- alert: ContrailSupervisordAnalyticsProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-supervisord-analytics"} == 0) >= count(procstat_running{process_name="contrail-supervisord-analytics"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-supervisord-analytics" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailIrondProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-irond"} == 0) == count(procstat_running{process_name="contrail-irond"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-irond" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: ContrailNodemgrConfigProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-nodemgr-config"} == 0) == count(procstat_running{process_name="contrail-nodemgr-config"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-nodemgr-config" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: PrometheusTargetDown | |
expr: >- | |
up != 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "prometheus" | |
annotations: | |
description: "The Prometheus target {{ $labels.instance }} is down for the job {{ $labels.job }}." | |
summary: "Prometheus endpoint {{ $labels.instance }} down" | |
- alert: ContrailIfmapServerProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-ifmap-server"} == 0) >= count(procstat_running{process_name="contrail-ifmap-server"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-ifmap-server" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: HAproxyKibanaBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="kibana"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="kibana"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: HAproxyContrailDiscoveryHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="contrail_discovery"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: KibanaProcessCritical | |
expr: >- | |
count(procstat_running{process_name="kibana"} == 0) >= count(procstat_running{process_name="kibana"}) * 0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "kibana" | |
annotations: | |
description: "More than 60.0% of Kibana services are down" | |
summary: "More than 60.0% of Kibana services are down" | |
- alert: ContrailSupervisordDatabaseProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-supervisord-database"} == 0) == count(procstat_running{process_name="contrail-supervisord-database"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-supervisord-database" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: DockerServiceAptlyPublicWarningReplicasNumber | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="aptly_public"}[1m])) <= 3 * 0.7 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "aptly_public" | |
annotations: | |
description: "{{ $value }}/3 replicas are running for the Docker Swarn service 'aptly_public' for 2 minutes." | |
summary: "Docker Swarm service aptly_public invalid number of replicas for 2 minutes" | |
- alert: ContrailDeviceManagerProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-device-manager"} == 0) >= count(procstat_running{process_name="contrail-device-manager"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-device-manager" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailNodemgrConfigProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-nodemgr-config"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-nodemgr-config" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: HAproxyRabbitmqClusterHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="rabbitmq_cluster"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: ContrailNodeManagerAPIDown | |
expr: >- | |
count(http_response_status{service=~"contrail.node.manager"} == 0) by (service) == count(http_response_status{service=~"contrail.node.manager"}) by (service) | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "All '{{ $labels.service }}' APIs are down" | |
summary: "All '{{ $labels.service }}' APIs are down" | |
- alert: HeatAPIServicesCritical | |
expr: >- | |
count(http_response_status{service=~"heat.*-api"} == 0) by (service) >= on (service) count(http_response_status{service=~"heat.*-api"}) by (service) * 0.6 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "{{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than 60.0%)" | |
summary: "More than 60.0% of {{ $labels.service }} services are down" | |
- alert: SaltMinionProcessDown | |
expr: >- | |
procstat_running{process_name="salt-minion"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "salt-minion" | |
annotations: | |
description: "Salt-minion service is down on node {{ $labels.host }}" | |
summary: "Salt-minion service is down" | |
- alert: ContrailXMPPSessionsSomeDown | |
expr: >- | |
min(contrail_xmpp_session_down_count) by (host) > 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-control" | |
annotations: | |
description: "There are inactive XMPP sessions on node {{ $labels.host }}" | |
summary: "inactive XMPP sessions" | |
- alert: ContrailVrouterAPIWarning | |
expr: >- | |
count(http_response_status{service=~"contrail.vrouter"} == 0) by (service) >= count(http_response_status{service=~"contrail.vrouter"}) by (service) *0.3 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: HAproxyContrailDiscoveryBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="contrail_discovery"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="contrail_discovery"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: DockerServiceDashboardGrafanaReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="dashboard_grafana"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="dashboard_grafana"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "dashboard_grafana" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'dashboard_grafana'. for 2 minutes" | |
summary: "Docker Swarm service dashboard_grafana down for 2 minutes" | |
- alert: HAproxyNeutronApiBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="neutron_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="neutron_api"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: InfluxdbSeriesNumberTooHigh | |
expr: >- | |
influxdb_database_numSeries >= 1000000 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "influxdb" | |
annotations: | |
description: "The InfluxDB {{ $labels.database }} database has exceeded the maximum number of series (value={{ $value }},threshold=1000000)." | |
summary: "InfluxDB too many series for {{ $labels.database }}" | |
- alert: KafkaServerProcessInfo | |
expr: >- | |
procstat_running{process_name="kafka-server"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "kafka-server" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: HAproxyGlanceApiBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="glance_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="glance_api"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: DockerServiceMonitoringRelayReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_relay"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_relay"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "monitoring_relay" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'monitoring_relay'. for 2 minutes" | |
summary: "Docker Swarm service monitoring_relay down for 2 minutes" | |
- alert: DockerServiceMonitoringRemoteCollectorReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_remote_collector"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_remote_collector"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "monitoring_remote_collector" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'monitoring_remote_collector'. for 2 minutes" | |
summary: "Docker Swarm service monitoring_remote_collector down for 2 minutes" | |
- alert: HAproxyNeutronApiHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="neutron_api"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: HAproxyCinderApiBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="cinder_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="cinder_api"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: HAproxyAodhApiBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="aodh-api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="aodh-api"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailTopologyProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-topology"} == 0) >= count(procstat_running{process_name="contrail-topology"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-topology" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailDiscoveryAPICritical | |
expr: >- | |
count(http_response_status{service=~"contrail.discovery"} == 0) by (service) >= count(http_response_status{service=~"contrail.discovery"}) by (service) *0.6 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: KibanaProcessInfo | |
expr: >- | |
procstat_running{process_name="kibana"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "kibana" | |
annotations: | |
description: "Kibana service is down on node {{ $labels.host }}" | |
summary: "Kibana service is down" | |
- alert: HAproxyCinderApiBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="cinder_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="cinder_api"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: SystemDiskInodesTooLow | |
expr: >- | |
predict_linear(disk_inodes_free[1h], 8*3600) < 0 | |
for: 15m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "system" | |
annotations: | |
description: "The disk inodes ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}." | |
summary: "Free inodes for {{ $labels.path }} too low on {{ $labels.host }}" | |
- alert: KeystoneAPITooSlow | |
expr: >- | |
max by(host) (openstack_http_response_times{service='keystone',quantile="0.9",http_method=~"^(GET|POST)$",http_status=~"^2..$"}) >= 3.0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "keystone" | |
annotations: | |
description: "The 90th percentile of the Keystone API response times for GET and POST requests is too high on node {{ $labels.host }} (current value={{ $value }}s, threshold=3.0s)." | |
summary: "Keystone API too slow" | |
- alert: ContrailNodemgrControlProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-nodemgr-control"} == 0) == count(procstat_running{process_name="contrail-nodemgr-control"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-nodemgr-control" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: RemoteStorageAdapterSendingTooSlow | |
expr: >- | |
100.0 - (100.0 * sent_samples_total{job="remote_storage_adapter"} / on (job, instance) received_samples_total) > 10.0 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "remote_storage_adapter" | |
annotations: | |
description: "Remote storage adapter can not ingest samples fast enough on {{ $labels.instance }} (current value={{ $value }}%, threshold=10.0%)." | |
summary: "Remote storage adapter too slow on {{ $labels.instance }}" | |
- alert: HeatAPIServicesDown | |
expr: >- | |
count(http_response_status{service=~"heat.*-api"} == 0) by (service) == on (service) count(http_response_status{service=~"heat.*-api"}) by (service) | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "All {{ $labels.service }} services are down for the last 2 minutes" | |
summary: "All {{ $labels.service }} services are down" | |
- alert: HAproxyMysqlClusterBackendDown | |
expr: >- | |
max(haproxy_active_servers{sv="BACKEND",proxy="mysql_cluster"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="mysql_cluster"}) by (proxy) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "The proxy '{{ $labels.proxy }}' has no active backend" | |
summary: "All backends are down for the '{{ $labels.proxy }}' proxy" | |
- alert: NeutronAPIServiceDown | |
expr: >- | |
http_response_status{service=~"neutron-api"} == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for 2 minutes." | |
summary: "HTTP check for '{{ $labels.service }}' down" | |
- alert: ContrailNodeManagerAPICritical | |
expr: >- | |
count(http_response_status{service=~"contrail.node.manager"} == 0) by (service) >= count(http_response_status{service=~"contrail.node.manager"}) by (service) *0.6 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: HAproxyKibanaHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="kibana"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: ContrailBGPSessionsNoneUp | |
expr: >- | |
max(contrail_bgp_session_up_count) by (host) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-control" | |
annotations: | |
description: "There are no active BGP sessions on node {{ $labels.host }}" | |
summary: "no active BGP sessions" | |
- alert: ContrailSchemaProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-schema"} == 0) >= count(procstat_running{process_name="contrail-schema"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-schema" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ZookeeperInfo | |
expr: >- | |
zookeeper_up != 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "zookeeper" | |
annotations: | |
description: "Zookeeper service is down on node {{ $labels.host }}." | |
summary: "Zookeeper service down" | |
- alert: DockerServiceMonitoringServerReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_server"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_server"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "monitoring_server" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'monitoring_server'. for 2 minutes" | |
summary: "Docker Swarm service monitoring_server down for 2 minutes" | |
- alert: ContrailNodemgrVrouterProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-nodemgr-vrouter"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-nodemgr-vrouter" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: RemoteStorageAdapterIgnoredTooHigh | |
expr: >- | |
100.0 * prometheus_influxdb_ignored_samples_total{job="remote_storage_adapter"} / on (job, instance) sent_samples_total > 5.0 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "remote_storage_adapter" | |
annotations: | |
description: "Remote storage adapter is receiving too many invalid metrics on {{ $labels.instance }} (current value={{ $value }}%, threshold=5.0%)." | |
summary: "Remote storage adapter receiving too many invalid metrics on {{ $labels.instance }}" | |
- alert: ContrailNodemgrDatabaseProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-nodemgr-database"} == 0) >= count(procstat_running{process_name="contrail-nodemgr-database"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-nodemgr-database" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailCollectorAPIInfo | |
expr: >- | |
http_response_status{service=~"contrail.collector"} == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "Endpoint check for '{{ $labels.service }}' is failed for 2 minutes on node {{ $labels.host }}" | |
summary: "Endpoint check for '{{ $labels.service }}' is failed" | |
- alert: ContrailNodemgrConfigProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-nodemgr-config"} == 0) >= count(procstat_running{process_name="contrail-nodemgr-config"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-nodemgr-config" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailAnalyticsApiProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-analytics-api"} == 0) >= count(procstat_running{process_name="contrail-analytics-api"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-analytics-api" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: DockerServiceDockerRegistryReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="docker_registry"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="docker_registry"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "docker_registry" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'docker_registry'. for 2 minutes" | |
summary: "Docker Swarm service docker_registry down for 2 minutes" | |
- alert: ContrailIrondProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-irond"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-irond" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: RabbitMQDiskFull | |
expr: >- | |
rabbitmq_node_disk_free <= rabbitmq_node_disk_free_limit | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "rabbitmq" | |
annotations: | |
description: "All producers are blocked because the RabbitMQ disk partition is full on node {{ $labels.host }}." | |
summary: "RabbitMQ producers blocked due to full disk" | |
- alert: ContrailSupervisordVrouterProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-supervisord-vrouter"} == 0) >= count(procstat_running{process_name="contrail-supervisord-vrouter"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-supervisord-vrouter" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailDiscoveryAPIDown | |
expr: >- | |
count(http_response_status{service=~"contrail.discovery"} == 0) by (service) == count(http_response_status{service=~"contrail.discovery"}) by (service) | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "All '{{ $labels.service }}' APIs are down" | |
summary: "All '{{ $labels.service }}' APIs are down" | |
- alert: ContrailVrouterDNSXMPPSessionsTooManyVariations | |
expr: >- | |
abs(delta(contrail_vrouter_dns_xmpp[2m])) >= 5 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-compute" | |
annotations: | |
description: "There are too many vRouter DNS-XMPP sessions changes on node {{ $labels.host }} (current value={{ $value }}, threshold=5)" | |
summary: "Number of vRouter DNS-XMPP sessions changed between checks is too high" | |
- alert: DockerServiceSecurityMonkeySecurityauditapiReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="security_monkey_security-audit-api"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="security_monkey_security-audit-api"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "security_monkey_security-audit-api" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'security_monkey_security-audit-api'. for 2 minutes" | |
summary: "Docker Swarm service security_monkey_security-audit-api down for 2 minutes" | |
- alert: GaleraServiceDown | |
expr: >- | |
mysql_up != 1 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "mysql" | |
annotations: | |
description: "Galera service is down on node {{ $labels.host }}" | |
summary: "Galera service down" | |
- alert: ContrailNodemgrProcessCritical | |
expr: >- | |
count(procstat_running{process_name="contrail-nodemgr"} == 0) >= count(procstat_running{process_name="contrail-nodemgr"}) *0.6 | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "contrail-nodemgr" | |
annotations: | |
description: "More than 60.0% of '{{ $labels.service }}' is down" | |
summary: "More than 60.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailSupervisordAnalyticsProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-supervisord-analytics"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-supervisord-analytics" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: GaleraNodeNotConnected | |
expr: >- | |
mysql_wsrep_connected != 1 | |
for: 1m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "mysql" | |
annotations: | |
description: "The Galera service on {{ $labels.host }} is not connected to the cluster." | |
summary: "Galera on {{ $labels.host }} not connected" | |
- alert: ContrailDeviceManagerProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-device-manager"} == 0) == count(procstat_running{process_name="contrail-device-manager"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-device-manager" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: ContrailSvcMonitorProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-svc-monitor"} == 0) == count(procstat_running{process_name="contrail-svc-monitor"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-svc-monitor" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: DockerServiceMonitoringRelayWarningReplicasNumber | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_relay"}[1m])) <= 2 * 0.7 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "monitoring_relay" | |
annotations: | |
description: "{{ $value }}/2 replicas are running for the Docker Swarn service 'monitoring_relay' for 2 minutes." | |
summary: "Docker Swarm service monitoring_relay invalid number of replicas for 2 minutes" | |
- alert: HAproxyContrailDiscoveryBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="contrail_discovery"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="contrail_discovery"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailControlProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-control"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-control" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: ContrailSupervisordDatabaseProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-supervisord-database"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-supervisord-database" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: ElasticsearchClusterDiskHighWaterMark | |
expr: >- | |
(max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 90 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "elasticsearch" | |
annotations: | |
description: "Elasticsearch will not allocate new shards to node {{ $labels.host }} and will attempt to relocate shards to another node" | |
summary: "Elasticsearch high disk watermark [90%] exceeded on node {{ $labels.host}} instance {{ $labels.instance }}" | |
- alert: HAproxyHeatCloudwatchApiBackendWarning | |
expr: >- | |
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="heat_cloudwatch_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="heat_cloudwatch_api"}) by (proxy) >= 1 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: HAproxyGlanceApiBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="glance_api"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="glance_api"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="glance_api"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailDiscoveryAPIInfo | |
expr: >- | |
http_response_status{service=~"contrail.discovery"} == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "Endpoint check for '{{ $labels.service }}' is failed for 2 minutes on node {{ $labels.host }}" | |
summary: "Endpoint check for '{{ $labels.service }}' is failed" | |
- alert: ContrailDnsProcessInfo | |
expr: >- | |
procstat_running{process_name="contrail-dns"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "info" | |
service: "contrail-dns" | |
annotations: | |
description: "{{ $labels.service }} service is down on node {{ $labels.host }}" | |
summary: "{{ $labels.service }} service is down" | |
- alert: DockerServiceHceHceapiReplicasDown | |
expr: >- | |
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="hce_hce-api"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="hce_hce-api"}) == 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "hce_hce-api" | |
annotations: | |
description: "No replicas are running for the Docker Swarn service 'hce_hce-api'. for 2 minutes" | |
summary: "Docker Swarm service hce_hce-api down for 2 minutes" | |
- alert: HAproxyElasticsearchBinaryBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch_binary"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="elasticsearch_binary"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch_binary"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailVrouterAgentProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-vrouter-agent"} == 0) == count(procstat_running{process_name="contrail-vrouter-agent"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-vrouter-agent" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" | |
- alert: NovaServicesWarning | |
expr: >- | |
openstack_nova_services{state="down",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} >= on (service) sum(openstack_nova_services{service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"}) by (service) * 0.3 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "{{ $labels.service }}" | |
annotations: | |
description: "More than 30.0% of {{ $labels.service }} services are down for the last 2 minutes" | |
summary: "More than 30.0% of {{ $labels.service }} services are down" | |
- alert: HAproxyAodhApiHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="aodh-api"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: HAproxyNovaApiHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="nova_api"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: NginxDown | |
expr: >- | |
nginx_up != 1 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "nginx" | |
annotations: | |
description: "Nginx service is down on node {{ $labels.host }}" | |
summary: "Nginx service down" | |
- alert: ContrailBGPSessionsNone | |
expr: >- | |
max(contrail_bgp_session_count) by (host) == 0 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-control" | |
annotations: | |
description: "There are no BGP sessions on node {{ $labels.host }}" | |
summary: "No BGP sessions" | |
- alert: HAproxyHeatApiHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="heat_api"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: KeepalivedProcessDown | |
expr: >- | |
procstat_running{process_name="keepalived"} == 0 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "keepalived" | |
annotations: | |
description: "Keepalived service is down on node {{ $labels.host }}" | |
summary: "Keepalived service is down" | |
- alert: InfluxdbSeriesNumberHigh | |
expr: >- | |
influxdb_database_numSeries >= 950000.0 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "influxdb" | |
annotations: | |
description: "The InfluxDB {{ $labels.database }} database is getting close to the maximum number of series (value={{ $value }},threshold=950000.0)." | |
summary: "InfluxDB high number of series for {{ $labels.database }}" | |
- alert: SystemSwapOut | |
expr: >- | |
rate(swap_out[2m]) > 1048576 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "system" | |
annotations: | |
description: "The rate of swap output bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold=1048576b/s)." | |
summary: "Swap output throughput too high on {{ $labels.host }}" | |
- alert: HAproxyHeatCloudwatchApiHTTPResponse5xx | |
expr: >- | |
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="heat_cloudwatch_api"}[1m]) > 1 | |
for: 2m | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)" | |
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})" | |
- alert: SystemLoad5TooHigh | |
expr: >- | |
system_load5 / system_n_cpus > 3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "system" | |
annotations: | |
description: "The 5-minutes system load is too high on node {{ $labels.host }} (current value={{ $value }}, threshold=3)." | |
summary: "High system load (5m) on {{ $labels.host }}" | |
- alert: HAproxyRabbitmqClusterBackendCritical | |
expr: >- | |
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="rabbitmq_cluster"}[12h])) by (proxy) | |
- min (haproxy_active_servers{sv="BACKEND",proxy="rabbitmq_cluster"}) by (proxy) | |
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="rabbitmq_cluster"}[12h])) by (proxy) * 100 >= 50 | |
for: 5m | |
labels: | |
route: "email,salesforce" | |
severity: "critical" | |
service: "haproxy/{{ $labels.proxy }}" | |
annotations: | |
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy" | |
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes" | |
- alert: ContrailSupervisordDatabaseProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-supervisord-database"} == 0) >= count(procstat_running{process_name="contrail-supervisord-database"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-supervisord-database" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: ContrailNodemgrVrouterProcessWarning | |
expr: >- | |
count(procstat_running{process_name="contrail-nodemgr-vrouter"} == 0) >= count(procstat_running{process_name="contrail-nodemgr-vrouter"}) *0.3 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "contrail-nodemgr-vrouter" | |
annotations: | |
description: "More than 30.0% of '{{ $labels.service }}' is down" | |
summary: "More than 30.0% of '{{ $labels.service }}' is down" | |
- alert: ApacheDown | |
expr: >- | |
apache_up != 1 | |
labels: | |
route: "email,salesforce" | |
severity: "warning" | |
service: "apache" | |
annotations: | |
description: "Apache service is down on node {{ $labels.host }}" | |
summary: "Apache service down" | |
- alert: ContrailSupervisordVrouterProcessDown | |
expr: >- | |
count(procstat_running{process_name="contrail-supervisord-vrouter"} == 0) == count(procstat_running{process_name="contrail-supervisord-vrouter"}) | |
labels: | |
route: "email,salesforce" | |
severity: "down" | |
service: "contrail-supervisord-vrouter" | |
annotations: | |
description: "All '{{ $labels.service }}' services are down" | |
summary: "All '{{ $labels.service }}' services are down" |