# Set to an empty mapping here; the rules themselves are declared under
# `alerts` below.
alerting_rules.yml: {}
# Alert rules organised by group key (Node, Pod, Volume, Set). Each rule also
# repeats its group name in `labels.group`.
# NOTE(review): presumably a chart template turns these groups into Prometheus
# rule groups - confirm against the consuming template.
alerts:
Node:
# Warns when a node's average non-idle CPU share stays above 90% for 5 minutes.
# NOTE(review): irate() only uses the last two samples inside the 5m window;
# the Prometheus docs recommend rate() for alerting rules - consider switching.
- alert: HostHighCpuLoad
  expr: 100 - (avg by(kubernetes_node) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
  for: 5m
  labels:
    severity: warning
    group: Node
  annotations:
    # Keep the percentage quoted in these texts in sync with the threshold in expr.
    summary: "CPU load is > 90%, VALUE = {{ $value }}, (node {{ $labels.kubernetes_node }})"
    description: "CPU load is > 90%\n VALUE = {{ $value }}"
# Warns when the filesystem mounted at "/" has had less than 10% of its size
# available for 5 minutes.
- alert: HostOutOfDiskSpace
  # Folded scalar: the two lines below are joined with a single space.
  expr: >-
    (node_filesystem_avail_bytes{mountpoint="/"} * 100)
    / node_filesystem_size_bytes{mountpoint="/"} < 10
  for: 5m
  labels:
    severity: warning
    group: Node
  annotations:
    summary: 'Host out of disk space (< 10% left), VALUE = {{ $value }}, (node {{ $labels.kubernetes_node }})'
    # Double quotes are required here so that \n is read as a newline.
    description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}"
# Warns when available memory has been below 10% of total memory for 5 minutes.
- alert: HostOutOfMemory
  expr: 'node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10'
  for: 5m
  labels:
    severity: warning
    group: Node
  annotations:
    summary: 'Host out of memory (< 10% left), VALUE = {{ $value }}, (node {{ $labels.kubernetes_node }})'
    # Double quotes are required here so that \n is read as a newline.
    description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}"
# Critical alert: the node's Ready condition has not been "true" (series value
# 0) for 5 minutes.
- alert: KubernetesNodeReady
  expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
  for: 5m
  labels:
    severity: critical
    group: Node
  annotations:
    summary: 'Kubernetes Node {{ $labels.node }} not ready'
    # Double quotes are required here so that \n is read as a newline.
    description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}"
Pod:
# Critical alert for pods stuck in the Pending, Unknown or Failed phase.
# The subquery takes the minimum of the per-pod phase sum over the last hour,
# so a pod that was healthy at any sample in that hour yields 0 and does not
# match; the condition must then also hold for the 5m `for` period.
- alert: KubernetesPodNotHealthy
  expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[1h:]) > 0
  for: 5m
  labels:
    severity: critical
    group: Pod
  annotations:
    summary: "Kubernetes Pod not healthy (namespace {{ $labels.namespace }}) (app {{ $labels.pod }})"
    # Duration quoted here follows the 1h lookback window used in expr.
    description: "Pod has been in a non-ready state for longer than 1 hour.\n VALUE = {{ $value }}"
# Warns on repeated container restarts: the per-second restart rate over 15m
# is scaled by 60 * 5 (restarts per 5 minutes) and must stay above 5 for
# 5 minutes.
- alert: KubernetesPodCrashLooping
  expr: 'rate(kube_pod_container_status_restarts_total[15m]) * 60 * 5 > 5'
  for: 5m
  labels:
    severity: warning
    group: Pod
  annotations:
    summary: 'Kubernetes pod crash looping (namespace {{ $labels.namespace }}) (app {{ $labels.pod }})'
    # Double quotes are required here so that \n is read as a newline.
    description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}"
# Warns when a container's CPU usage rate times 100 stays above 80 for
# 5 minutes. Series with container="POD" or an empty image label are excluded.
# NOTE(review): the value is presumably a percentage of one CPU core, not of
# the container's CPU limit - confirm that this is the intended baseline.
# NOTE(review): irate() only uses the last two samples inside the 1m window;
# the Prometheus docs recommend rate() for alerting rules - consider switching.
- alert: ContainerCpuUsage
  expr: (irate(container_cpu_usage_seconds_total{container!="POD", image!=""}[1m]) * 100) > 80
  for: 5m
  labels:
    severity: warning
    group: Pod
  annotations:
    # This is a usage alert, so the text states the usage threshold rather than
    # a "left" (remaining) amount.
    summary: "Container CPU usage (> 80%), VALUE = {{ $value }}, (namespace {{ $labels.namespace }}, pod {{ $labels.pod }})"
    description: "Container CPU usage is above 80%\n VALUE = {{ $value }}"
# Warns when a pod's summed container working-set memory stays above 80% of
# the pod's summed memory requests for 5 minutes. Both sides are aggregated by
# (pod, namespace); series with an empty or "POD" container label are excluded
# from the usage side.
- alert: ContainerMemoryUsage
  expr: (( sum(container_memory_working_set_bytes{container!="", container!="POD"}) by (pod,namespace) ) / ( sum(kube_pod_container_resource_requests_memory_bytes) by (pod,namespace) ) * 100 ) > 80
  for: 5m
  labels:
    severity: warning
    group: Pod
  annotations:
    # This is a usage alert, so the text states the usage threshold rather than
    # a "left" (remaining) amount.
    summary: "Container Memory usage, (> 80%), VALUE = {{ $value }}, (namespace {{ $labels.namespace }}, pod {{ $labels.pod }})"
    description: "Container Memory usage is above 80%\n VALUE = {{ $value }}"
Volume:
# Warns when a volume reported by the kubelet has had less than 10% of its
# capacity available for 5 minutes.
- alert: KubernetesVolumeOutOfDiskSpace
  expr: 'kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10'
  for: 5m
  labels:
    severity: warning
    group: Volume
  annotations:
    summary: 'Kubernetes Volume out of disk space (< 10% left), VALUE = {{ $value }}, (namespace {{ $labels.namespace }}, persistentvolumeclaim {{ $labels.persistentvolumeclaim }})'
    # Double quotes are required here so that \n is read as a newline.
    description: "Volume is almost full (< 10% left)\n VALUE = {{ $value }}"
# Critical alert: a PersistentVolume has reported the Failed or Pending phase
# (from the kube-state-metrics job) for 5 minutes.
- alert: KubernetesPersistentvolumeError
  expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
  for: 5m
  labels:
    severity: critical
    group: Volume
  annotations:
    # The interpolated label is `persistentvolume`, so the text names it as a
    # volume rather than a claim.
    summary: "Kubernetes PersistentVolume error (persistentvolume {{ $labels.persistentvolume }})"
    description: "Persistent volume is in bad state\n VALUE = {{ $value }}"
Set:
# Warns when a Deployment's desired replica count has differed from its
# available replica count for 5 minutes.
- alert: KubernetesDeploymentReplicasMismatch
  expr: 'kube_deployment_spec_replicas != kube_deployment_status_replicas_available'
  for: 5m
  labels:
    severity: warning
    group: Set
  annotations:
    summary: 'Kubernetes Deployment replicas mismatch (namespace {{ $labels.namespace }}, deployment {{ $labels.deployment }})'
    # Double quotes are required here so that \n is read as a newline.
    description: "Deployment Replicas mismatch\n VALUE = {{ $value }}"
# Warns when a StatefulSet's ready replica count has differed from its total
# replica count for 5 minutes.
- alert: KubernetesStatefulsetReplicasMismatch
  expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
  for: 5m
  labels:
    severity: warning
    group: Set
  annotations:
    summary: "Kubernetes StatefulSet replicas mismatch (namespace {{ $labels.namespace }}, statefulset {{ $labels.statefulset }})"
    # Keep the duration quoted here in sync with the `for` value above.
    description: "A StatefulSet has not matched the expected number of replicas for longer than 5 minutes.\n VALUE = {{ $value }}"
# Critical alert: a StatefulSet's ready replica count has differed from its
# current replica count for 5 minutes.
- alert: KubernetesStatefulsetDown
  expr: '(kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas_current)'
  for: 5m
  labels:
    severity: critical
    group: Set
  annotations:
    summary: 'Kubernetes StatefulSet down (namespace {{ $labels.namespace }}) (app {{ $labels.statefulset }})'
    # Double quotes are required here so that \n is read as a newline.
    description: "A StatefulSet went down\n VALUE = {{ $value }}"