MENU

报警规则

• 2021 年 12 月 02 日 • 默认分类

  # Empty mapping: no rules are supplied under this key in this snippet.
  alerting_rules.yml: {}
  # Alert rules keyed by group name; each group holds a list of rules, and
  # every rule's `group` label repeats the name of the key it sits under.
  # NOTE(review): how the consumer turns these groups into rule files is not
  # visible here — confirm against the chart/template that reads `alerts`.
  alerts:
    # Host-level and Kubernetes-node rules.
    Node:
    # HostHighCpuLoad: per-node CPU utilisation, computed as 100 minus the
    # averaged idle percentage, must stay above 90 for 5m before firing.
    # NOTE(review): irate() is driven by the last two samples in the window;
    # the Prometheus docs recommend rate() for alerting — confirm before
    # changing the expression.
    - alert: HostHighCpuLoad
      expr: >-
        100 - (avg by(kubernetes_node)
        (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
      for: 5m
      labels:
        severity: warning
        group: Node
      annotations:
        # The percentage quoted here must match the `> 90` threshold in expr.
        summary: >-
          CPU load is > 90%, VALUE = {{ $value }},
          (node {{ $labels.kubernetes_node }})
        description: |-
          CPU load is > 90%
            VALUE = {{ $value }}
    # HostOutOfDiskSpace: the filesystem mounted at "/" has had less than 10%
    # of its bytes available for 5m.
    - alert: HostOutOfDiskSpace
      expr: >-
        (node_filesystem_avail_bytes{mountpoint="/"}  * 100)
        / node_filesystem_size_bytes{mountpoint="/"} < 10
      for: 5m
      labels:
        severity: warning
        group: Node
      annotations:
        summary: >-
          Host out of disk space (< 10% left), VALUE = {{ $value }},
          (node {{ $labels.kubernetes_node }})
        # Literal block keeps the newline and the two-space indent before VALUE.
        description: |-
          Disk is almost full (< 10% left)
            VALUE = {{ $value }}
    # HostOutOfMemory: available memory has been below 10% of total memory
    # for 5m.
    - alert: HostOutOfMemory
      expr: >-
        node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
      for: 5m
      labels:
        severity: warning
        group: Node
      annotations:
        summary: >-
          Host out of memory (< 10% left), VALUE = {{ $value }},
          (node {{ $labels.kubernetes_node }})
        # Literal block keeps the newline and the two-space indent before VALUE.
        description: |-
          Node memory is filling up (< 10% left)
            VALUE = {{ $value }}
    # KubernetesNodeReady: the Ready/status="true" condition series of a node
    # has been 0 for 5m.
    - alert: KubernetesNodeReady
      expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
      for: 5m
      labels:
        severity: critical
        group: Node
      annotations:
        summary: 'Kubernetes Node {{ $labels.node }} not ready'
        # Literal block keeps the newline and the two-space indent before VALUE.
        description: |-
          Node {{ $labels.node }} has been unready for a long time
            VALUE = {{ $value }}

    Pod:
    # KubernetesPodNotHealthy: per (namespace, pod), the summed
    # Pending/Unknown/Failed phase series never dropped to 0 across the 1h
    # subquery window, and that condition has held for 5m.
    - alert: KubernetesPodNotHealthy
      expr: >-
        min_over_time(sum by (namespace, pod)
        (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[1h:]) > 0
      for: 5m
      labels:
        severity: critical
        group: Pod
      annotations:
        summary: >-
          Kubernetes Pod not healthy (namespace {{ $labels.namespace }})
          (app {{ $labels.pod }})
        # Literal block keeps the newline and the two-space indent before VALUE.
        description: |-
          Pod has been in a non-ready state for longer than 5min.
            VALUE = {{ $value }}
    # KubernetesPodCrashLooping: the 15m restart rate, scaled by 60 * 5 to a
    # per-five-minute figure, has been above 5 for 5m.
    - alert: KubernetesPodCrashLooping
      expr: 'rate(kube_pod_container_status_restarts_total[15m]) * 60 * 5 > 5'
      for: 5m
      labels:
        severity: warning
        group: Pod
      annotations:
        summary: >-
          Kubernetes pod crash looping (namespace {{ $labels.namespace }})
          (app {{ $labels.pod }})
        # Literal block keeps the newline and the two-space indent before VALUE.
        description: |-
          Pod {{ $labels.pod }} is crash looping
            VALUE = {{ $value }}
    # ContainerCpuUsage: the instantaneous CPU-seconds rate of a container
    # (excluding container="POD" and series without an image), times 100, has
    # been above 80 for 5m.
    # NOTE(review): the value is relative to one CPU core, not to the
    # container's limit, and irate() over [1m] needs at least two samples in
    # that window — confirm the scrape interval and the intended meaning.
    - alert: ContainerCpuUsage
      expr: >-
        (irate(container_cpu_usage_seconds_total{container!="POD", image!=""}[1m])
        * 100) > 80
      for: 5m
      labels:
        severity: warning
        group: Pod
      annotations:
        # This alert is about usage exceeding the threshold, so the summary
        # must not say "left" (that wording belongs to the free-space rules).
        summary: >-
          Container CPU usage (> 80%), VALUE = {{ $value }},
          (namespace {{ $labels.namespace }}, pod {{ $labels.pod }})
        description: |-
          Container CPU usage is above 80%
            VALUE = {{ $value }}
    # ContainerMemoryUsage: per (pod, namespace), the summed working-set bytes
    # divided by the summed memory requests, as a percentage, has been above
    # 80 for 5m.
    # NOTE(review): kube_pod_container_resource_requests_memory_bytes is not
    # exposed by kube-state-metrics v2 (replaced by
    # kube_pod_container_resource_requests{resource="memory"}) — confirm
    # against the deployed version.
    - alert: ContainerMemoryUsage
      expr: >-
        (( sum(container_memory_working_set_bytes{container!="", container!="POD"})
        by (pod,namespace) )
        / ( sum(kube_pod_container_resource_requests_memory_bytes) by (pod,namespace) )
        * 100  ) > 80
      for: 5m
      labels:
        severity: warning
        group: Pod
      annotations:
        # This alert is about usage exceeding the threshold, so the summary
        # must not say "left" (that wording belongs to the free-space rules).
        summary: >-
          Container Memory usage, (> 80%), VALUE = {{ $value }},
          (namespace {{ $labels.namespace }}, pod {{ $labels.pod }})
        description: |-
          Container Memory usage is above 80%
            VALUE = {{ $value }}

    Volume:
    # KubernetesVolumeOutOfDiskSpace: a volume's available bytes have been
    # below 10% of its capacity for 5m.
    - alert: KubernetesVolumeOutOfDiskSpace
      expr: >-
        kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes
        * 100 < 10
      for: 5m
      labels:
        severity: warning
        group: Volume
      annotations:
        summary: >-
          Kubernetes Volume out of disk space (< 10% left), VALUE = {{ $value }},
          (namespace {{ $labels.namespace }},
          persistentvolumeclaim {{ $labels.persistentvolumeclaim }})
        # Literal block keeps the newline and the two-space indent before VALUE.
        description: |-
          Volume is almost full (< 10% left)
            VALUE = {{ $value }}
    # KubernetesPersistentvolumeError: a PersistentVolume reported by the
    # kube-state-metrics job has been in the Failed or Pending phase for 5m.
    - alert: KubernetesPersistentvolumeError
      expr: >-
        kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"}
        > 0
      for: 5m
      labels:
        severity: critical
        group: Volume
      annotations:
        # The value printed is the `persistentvolume` label, so it is named as
        # such rather than as a persistentvolumeclaim.
        summary: >-
          Kubernetes PersistentVolume error
          (persistentvolume {{ $labels.persistentvolume }})
        description: |-
          Persistent volume is in bad state
            VALUE = {{ $value }}
    
    Set:
    # KubernetesDeploymentReplicasMismatch: a Deployment's desired replica
    # count has differed from its available replica count for 5m.
    - alert: KubernetesDeploymentReplicasMismatch
      expr: >-
        kube_deployment_spec_replicas != kube_deployment_status_replicas_available
      for: 5m
      labels:
        severity: warning
        group: Set
      annotations:
        summary: >-
          Kubernetes Deployment replicas mismatch
          (namespace {{ $labels.namespace }}, deployment {{ $labels.deployment }})
        # Literal block keeps the newline and the two-space indent before VALUE.
        description: |-
          Deployment Replicas mismatch
            VALUE = {{ $value }}
    # KubernetesStatefulsetReplicasMismatch: a StatefulSet's ready replica
    # count has differed from its replica count for 5m.
    - alert: KubernetesStatefulsetReplicasMismatch
      expr: >-
        kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
      for: 5m
      labels:
        severity: warning
        group: Set
      annotations:
        summary: >-
          Kubernetes StatefulSet replicas mismatch
          (namespace {{ $labels.namespace }}, statefulset {{ $labels.statefulset }})
        # The duration quoted here must match the `for: 5m` hold time above.
        description: |-
          A StatefulSet has not matched the expected number of replicas for longer than 5 minutes.
            VALUE = {{ $value }}
    # KubernetesStatefulsetDown: a StatefulSet's ready replica count has
    # differed from its current replica count for 5m.
    - alert: KubernetesStatefulsetDown
      expr: >-
        (kube_statefulset_status_replicas_ready
        != kube_statefulset_status_replicas_current)
      for: 5m
      labels:
        severity: critical
        group: Set
      annotations:
        summary: >-
          Kubernetes StatefulSet down (namespace {{ $labels.namespace }})
          (app {{ $labels.statefulset }})
        # Literal block keeps the newline and the two-space indent before VALUE.
        description: |-
          A StatefulSet went down
            VALUE = {{ $value }}