# We ended up doing something like this:
# Fires when, averaged across all scraped `pilot` targets, the 1m average of
# `up` stays below 0.5 (fewer than half the targets up) for 5 minutes.
- alert: IstioPilotAvailabilityDrop
  annotations:
    summary: 'Istio Pilot Availability Drop'
    # $value is a 0-1 ratio; humanizePercentage scales it to a percentage
    # (the previous printf "%2.0f%%" printed the unscaled ratio, e.g. " 0%").
    description: >-
      Pilot pods have dropped during the last 5m (current value:
      *{{ $value | humanizePercentage }}*). Envoy sidecars might have
      outdated configuration
  # NOTE(review): this rule averages over 1m while the other availability
  # rules in this list use 5m and the description says "last 5m" - confirm
  # whether the shorter window is intentional.
  expr: >-
    avg(avg_over_time(up{job="pilot"}[1m])) < 0.5
  for: 5m
# Fires when, averaged across the istio-telemetry Mixer monitoring targets,
# the 5m average of `up` stays below 0.5 for 5 minutes.
- alert: IstioMixerTelemetryAvailabilityDrop
  annotations:
    summary: 'Istio Mixer Telemetry Drop'
    # $value is a 0-1 ratio; humanizePercentage scales it to a percentage
    # (the previous printf "%2.0f%%" printed the unscaled ratio).
    description: >-
      Mixer pods have dropped during the last 5m (current value:
      *{{ $value | humanizePercentage }}*). Istio metrics will not work
      correctly
  expr: >-
    avg(avg_over_time(
    up{job="mixer", service="istio-telemetry", endpoint="http-monitoring"}[5m]
    )) < 0.5
  for: 5m
# Fires when, averaged across all scraped `galley` targets, the 5m average of
# `up` stays below 0.5 for 5 minutes.
- alert: IstioGalleyAvailabilityDrop
  annotations:
    summary: 'Istio Galley Availability Drop'
    # $value is a 0-1 ratio; humanizePercentage scales it to a percentage
    # (the previous printf "%2.0f%%" printed the unscaled ratio).
    description: >-
      Galley pods have dropped during the last 5m (current value:
      *{{ $value | humanizePercentage }}*). Istio config ingestion and
      processing will not work
  expr: >-
    avg(avg_over_time(up{job="galley"}[5m])) < 0.5
  for: 5m
# Fires when the istio-ingressgateway deployment in istio-system reports
# fewer than 2 available replicas for 5 minutes.
- alert: IstioGatewayAvailabilityDrop
  annotations:
    summary: 'Istio Gateway Availability Drop'
    # $value is a replica count, not a ratio, so it is printed as a plain
    # number instead of with a percent sign.
    description: >-
      Gateway pods have dropped during the last 5m (current value:
      *{{ printf "%.0f" $value }}* available replicas). Inbound traffic will
      likely be affected
  # The folded scalar joins these lines with spaces; PromQL ignores the
  # extra whitespace.
  expr: >-
    min(
    kube_deployment_status_replicas_available{deployment="istio-ingressgateway", namespace="istio-system"}
    ) without (instance, pod) < 2
  for: 5m
# Fires when the ratio of Pilot xDS push errors to xDS pushes stays above
# 0.05 (5%) for 5 minutes.
- alert: IstioPilotPushErrorsHigh
  annotations:
    summary: 'Number of Istio Pilot push errors is too high'
    # $value is a 0-1 ratio; humanizePercentage scales it to a percentage.
    description: >-
      Pilot has too many push errors during the last 5m (current value:
      *{{ $value | humanizePercentage }}*). Envoy sidecars might have
      outdated configuration
  # rate() averages over the whole 5m window. irate() only looks at the last
  # two samples, so a brief dip can reset the `for` timer; the Prometheus
  # docs recommend rate() for alerting.
  expr: >-
    sum(rate(pilot_xds_push_errors{job="pilot"}[5m]))
    /
    sum(rate(pilot_xds_pushes{job="pilot"}[5m]))
    > 0.05
  for: 5m
# Fires when the summed per-second rate of Mixer dispatches to the
# prometheus adapter stays below 180 for 5 minutes.
# NOTE(review): a comparison against an empty result returns no series, so
# this rule stays silent if the metric disappears entirely; consider a
# companion absent() rule.
- alert: IstioMixerPrometheusDispatchesLow
  annotations:
    summary: 'Number of Mixer dispatches to Prometheus is too low'
    # $value is a per-second rate, not a ratio, so it is printed as a plain
    # number with its unit instead of with a percent sign.
    description: >-
      Mixer dispatches to Prometheus have dropped below normal levels during
      the last 5m (current value: *{{ printf "%.2f" $value }}* dispatches/s).
      Istio metrics might not be being exported properly
  # rate() averages over the whole 5m window. irate() only looks at the last
  # two samples, so a brief spike can reset the `for` timer.
  expr: >-
    sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[5m])) < 180
  for: 5m
# Fires when the mesh-wide request rate (destination-reported), rounded to
# the nearest 0.001, stays above 1200 requests per second for 5 minutes.
- alert: IstioGlobalRequestRateHigh
  annotations:
    summary: 'Istio Global Request Rate High'
    # $value is a per-second request rate, not a ratio, so it is printed as
    # a plain number with its unit instead of with a percent sign.
    description: >-
      Istio global request rate is unusually high during the last 5m
      (current value: *{{ printf "%.2f" $value }}* req/s). The amount of
      traffic being generated inside the service mesh is higher than normal
  # rate() averages over the whole 5m window. irate() only looks at the last
  # two samples, so a brief dip can reset the `for` timer.
  expr: >-
    round(sum(rate(istio_requests_total{reporter="destination"}[5m])), 0.001)
    > 1200
  for: 5m
# Fires when the mesh-wide request rate (destination-reported), rounded to
# the nearest 0.001, stays below 300 requests per second for 5 minutes.
# NOTE(review): a comparison against an empty result returns no series, so
# this rule stays silent if the metric disappears entirely; consider a
# companion absent() rule.
- alert: IstioGlobalRequestRateLow
  annotations:
    summary: 'Istio global request rate too low'
    # $value is a per-second request rate, not a ratio, so it is printed as
    # a plain number with its unit instead of with a percent sign.
    description: >-
      Istio global request rate is unusually low during the last 5m
      (current value: *{{ printf "%.2f" $value }}* req/s). The amount of
      traffic being generated inside the service mesh has dropped below
      usual levels
  # rate() averages over the whole 5m window. irate() only looks at the last
  # two samples, so a brief spike can reset the `for` timer.
  expr: >-
    round(sum(rate(istio_requests_total{reporter="destination"}[5m])), 0.001)
    < 300
  for: 5m
# Fires when the share of destination-reported requests whose response code
# matches 5.* stays above 0.01 (1%) for 5 minutes.
- alert: IstioGlobalHTTP5xxRateHigh
  annotations:
    summary: 'Istio Percentage of HTTP 5xx responses is too high'
    # $value is a 0-1 ratio; humanizePercentage scales it to a percentage
    # (the previous printf "%2.0f%%" would have shown 0.02 as " 0%").
    description: >-
      Istio global HTTP 5xx rate is too high in last 5m (current value:
      *{{ $value | humanizePercentage }}*). The number of HTTP 5xx errors
      within the service mesh is unusually high
  # rate() averages over the whole 5m window. irate() only looks at the last
  # two samples, so a brief dip can reset the `for` timer.
  expr: >-
    sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m]))
    /
    sum(rate(istio_requests_total{reporter="destination"}[5m]))
    > 0.01
  for: 5m
# Fires when the share of non-5xx responses among source-reported,
# non-mutual-TLS requests from the istio-ingressgateway workload in
# istio-system stays below 0.995 (99.5%) for 5 minutes.
- alert: IstioGatewayOutgoingSuccessLow
  annotations:
    summary: 'Istio Gateway outgoing success rate is too low'
    # $value is a 0-1 ratio; humanizePercentage scales it to a percentage
    # (the previous printf "%2.0f%%" would have shown 0.99 as " 1%").
    description: >-
      Istio Gateway success to outbound destinations is too low in last 5m
      (current value: *{{ $value | humanizePercentage }}*). Inbound traffic
      may be affected
  # rate() averages over the whole 5m window. irate() only looks at the last
  # two samples, so a brief recovery can reset the `for` timer.
  expr: >-
    sum(rate(istio_requests_total{reporter="source", source_workload="istio-ingressgateway",source_workload_namespace="istio-system", connection_security_policy!="mutual_tls",response_code!~"5.*"}[5m]))
    /
    sum(rate(istio_requests_total{reporter="source", source_workload="istio-ingressgateway",source_workload_namespace="istio-system", connection_security_policy!="mutual_tls"}[5m]))
    < 0.995
  for: 5m