# Work in progress: these rules will continue to be refined and revised.
groups:
- name: kube-state-metrics.rules
rules:
# Fires when a node's Ready condition series (status="true") has been 0
# for 10 minutes.
- alert: KubernetesNodeNotReady
  expr: >-
    kube_node_status_condition{condition="Ready",status="true"} == 0
  for: 10m
  labels:
    severity: critical
    type: kube-state
  annotations:
    summary: 节点长时间未就绪
    # The message says "not ready" (未就绪) so that it matches the alert name,
    # the summary and the Ready condition checked by the expression.
    description: "节点 {{ $labels.node }} 已超过十分钟处于未就绪状态\n VALUE = {{ $value }}\n"
# Fires when the MemoryPressure condition series (status="true") equals 1
# for 2 minutes.
- alert: KubernetesMemoryPressure
  expr: >-
    kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
  for: 2m
  labels:
    severity: critical
    type: kube-state
  annotations:
    summary: Kubernetes memory pressure
    description: "{{ $labels.node }} 有内存压力情况\n VALUE = {{ $value }}\n"
# Fires when the DiskPressure condition series (status="true") equals 1
# for 2 minutes.
- alert: KubernetesDiskPressure
  expr: >-
    kube_node_status_condition{condition="DiskPressure",status="true"} == 1
  for: 2m
  labels:
    severity: critical
    type: kube-state
  annotations:
    summary: Kubernetes disk pressure
    description: "{{ $labels.node }} 有磁盘压力情况\n VALUE = {{ $value }}\n"
# Fires when the NetworkUnavailable condition series (status="true") equals 1
# for 2 minutes.
- alert: KubernetesNetworkUnavailable
  expr: >-
    kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1
  for: 2m
  labels:
    severity: critical
    type: kube-state
  annotations:
    summary: Kubernetes network unavailable
    description: "{{ $labels.node }} 网络不可用\n VALUE = {{ $value }}\n"
# Per node: Running pods (joined to kube_pod_info on uid to pick up the node
# label) as a percentage of the allocatable "pods" resource. Fires when that
# percentage stays above 90 for 2 minutes.
- alert: KubernetesOutOfCapacity
  expr: >-
    sum by (node) ((kube_pod_status_phase{phase="Running"} == 1)
    + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""}))
    / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90
  for: 2m
  labels:
    severity: warning
    type: kube-state
  annotations:
    summary: Kubernetes out of capacity
    description: "{{ $labels.node }} 容量不足\n VALUE = {{ $value }}\n"
# Fires immediately when a container's restart counter grew by at least 1
# compared with 10 minutes ago and its last-terminated reason was OOMKilled
# throughout the last 10 minutes.
- alert: KubernetesContainerOomKiller
  expr: >-
    (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1)
    and ignoring (reason)
    min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
  for: 0m
  labels:
    severity: warning
    type: kube-state
  annotations:
    summary: Kubernetes container oom killer
    description: "在过去10分钟内,pod{{ $labels.namespace }}/{{ $labels.pod }}中的容器{{ $labels.container }}已被OOMKilled{{ $value }}次。\n"
# Fires immediately when kube_job_status_failed is above 0.
- alert: KubernetesJobFailed
  expr: >-
    kube_job_status_failed > 0
  for: 0m
  labels:
    severity: warning
    type: kube-state
  annotations:
    summary: Kubernetes Job failed
    description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} 执行失败!状态值为{{ $value }}\n"
# Fires immediately when kube_cronjob_spec_suspend is non-zero.
- alert: KubernetesCronjobSuspended
  expr: >-
    kube_cronjob_spec_suspend != 0
  for: 0m
  labels:
    severity: warning
    type: kube-state
  annotations:
    summary: Kubernetes CronJob suspended
    description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }}被暂停!状态值为{{ $value }}\n"
# Fires when a PersistentVolumeClaim's phase="Pending" series equals 1
# for 2 minutes.
- alert: KubernetesPersistentvolumeclaimPending
  expr: >-
    kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
  for: 2m
  labels:
    severity: warning
    type: kube-state
  annotations:
    summary: Kubernetes PersistentVolumeClaim pending
    description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}被挂起。\n"
# - alert: KubernetesVolumeOutOfDiskSpace
# expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10
# for: 2m
# labels:
# severity: warning
# type: kube-state
# annotations:
# summary: Kubernetes Volume out of disk space
# description: "Volume 几乎已满(剩余<10%),当前值为{{ $value }}%\n"
#
# - alert: KubernetesVolumeFullInFourDays
# expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0
# for: 0m
# labels:
# severity: critical
# type: kube-state
# annotations:
# summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
# description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n"
# Fires immediately when a PersistentVolume's Failed or Pending phase series
# is above 0.
- alert: KubernetesPersistentvolumeError
  expr: >-
    kube_persistentvolume_status_phase{phase=~"Failed|Pending"} > 0
  for: 0m
  labels:
    severity: critical
    type: kube-state
  annotations:
    summary: Kubernetes PersistentVolume error
    description: "Persistent volume 处于错误状态。"
# Fires when kube_statefulset_replicas differs from the ready replica count
# and is itself above 0, for 1 minute.
- alert: KubernetesStatefulsetDown
  expr: >-
    kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0
  for: 1m
  labels:
    severity: critical
    type: kube-state
  annotations:
    summary: Kubernetes StatefulSet down
    description: "A StatefulSet 构建失败\n VALUE = {{ $value }}\n"
# Fires when the HPA condition AbleToScale has its status="false" series
# equal to 1 for 2 minutes.
- alert: KubernetesHpaScalingAbility
  expr: >-
    kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1
  for: 2m
  labels:
    severity: warning
    type: kube-state
  annotations:
    summary: Kubernetes HPA scaling ability
    description: "Pod 无法缩放。\n"
# Fires immediately when the HPA condition ScalingActive has its
# status="false" series equal to 1.
- alert: KubernetesHpaMetricAvailability
  expr: >-
    kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1
  for: 0m
  labels:
    severity: warning
    type: kube-state
  annotations:
    summary: Kubernetes HPA metric availability
    description: "HPA无法收集指标。\n"
# Fires when an HPA's desired replica count is at or above its configured
# maximum for 2 minutes.
- alert: KubernetesHpaScaleCapability
  expr: >-
    kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas
  for: 2m
  labels:
    severity: info
    type: kube-state
  annotations:
    summary: Kubernetes HPA scale capability
    description: "已达到所需Pod的最大数量。\n VALUE = {{ $value }}\n"
# Fires immediately when the 1-day median of desired replicas equals the
# HPA's minimum replica count and that value, taken as the max per
# horizontalpodautoscaler, is above 3.
- alert: KubernetesHpaUnderutilized
  expr: >-
    max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d])
    == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3
  for: 0m
  labels:
    severity: info
    type: kube-state
  annotations:
    summary: Kubernetes HPA underutilized
    description: "HPA在50%的时间内始终处于最低复制副本。这里可能节省成本。\n VALUE = {{ $value }}\n"
# Fires when the sum of a pod's Pending/Unknown/Failed phase series, grouped
# by job, env, namespace and pod, stays above 0 for 15 minutes.
- alert: KubernetesPodNotHealthy
  expr: >-
    sum(kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) by(job,env,namespace,pod) > 0
  for: 15m
  labels:
    severity: critical
    type: kube-state
  annotations:
    summary: Kubernetes Pod not healthy
    description: "Pod已处于非就绪状态超过15分钟。\n"
# Fires when a container's restart counter increased by more than 3 within
# a 1-minute window, sustained for 2 minutes.
- alert: KubernetesPodCrashLooping
  expr: >-
    increase(kube_pod_container_status_restarts_total[1m]) > 3
  for: 2m
  labels:
    severity: critical
    type: kube-state
  annotations:
    summary: Kubernetes pod crash looping
    # Label names are case-sensitive: the label is `pod`, so `$labels.Pod`
    # would render as an empty string. The namespace is included to match
    # the namespace/pod form used by the other pod alerts in this file.
    description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 正在循环崩溃中。1分钟内重启次数大于3次。\n"
# Fires when a ReplicaSet's ready replica count stays below its spec replica
# count for 10 minutes.
- alert: KubernetesReplicassetMismatch
  expr: >-
    kube_replicaset_status_ready_replicas<kube_replicaset_spec_replicas
  for: 10m
  labels:
    severity: warning
    type: kube-state
  annotations:
    summary: Kubernetes ReplicasSet mismatch
    description: "ReplicasSet 部署期望副本数与ready状态的副本数不符。当前ready状态的副本数为{{ $value }}。\n"
# Fires when a Deployment's available replica count stays below its spec
# replica count for 10 minutes.
- alert: KubernetesDeploymentReplicasMismatch
  expr: >-
    kube_deployment_status_replicas_available < kube_deployment_spec_replicas
  for: 10m
  labels:
    severity: warning
    type: kube-state
  annotations:
    summary: Kubernetes Deployment replicas mismatch
    description: "Deployment 实际可用副本数量小于定义的数量,可能存在Pod未准备好。当前可用的副本数为{{ $value }}。\n"
# Fires when a StatefulSet's ready replica count differs from its status
# replica count for 10 minutes.
- alert: KubernetesStatefulsetReplicasMismatch
  expr: >-
    kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
  for: 10m
  labels:
    severity: warning
    type: kube-state
  annotations:
    summary: Kubernetes StatefulSet replicas mismatch
    description: "StatefulSet准备就绪的副本数与预期的副本数不匹配。当前可用数量为{{ $value }}。\n"
# Fires when a Deployment's observed generation differs from its metadata
# generation for 10 minutes.
- alert: KubernetesDeploymentGenerationMismatch
  expr: >-
    kube_deployment_status_observed_generation != kube_deployment_metadata_generation
  for: 10m
  labels:
    severity: critical
    type: kube-state
  annotations:
    summary: Kubernetes Deployment generation mismatch
    description: "Deployment 已失败,但尚未回滚。\n"
# Fires when a StatefulSet's observed generation differs from its metadata
# generation for 10 minutes.
- alert: KubernetesStatefulsetGenerationMismatch
  expr: >-
    kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation
  for: 10m
  labels:
    severity: critical
    type: kube-state
  annotations:
    summary: Kubernetes StatefulSet generation mismatch
    description: "StatefulSet已失败,但尚未回滚。\n"
# Fires when, for 10 minutes, a StatefulSet has a current-revision series with
# no matching update-revision series and its replica count differs from the
# updated replica count.
- alert: KubernetesStatefulsetUpdateNotRolledOut
  expr: >-
    max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision)
    * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)
  for: 10m
  labels:
    severity: warning
    type: kube-state
  annotations:
    summary: Kubernetes StatefulSet update not rolled out
    description: "StatefulSet更新未展开.\n"
# Fires when, for 10 minutes, fewer than 100% of a DaemonSet's desired pods
# are ready, or the desired scheduled count exceeds the currently scheduled
# count.
- alert: KubernetesDaemonsetRolloutStuck
  expr: >-
    kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100
    or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
  for: 10m
  labels:
    severity: warning
    type: kube-state
  annotations:
    summary: Kubernetes DaemonSet rollout stuck
    # The message reads "not scheduled or not ready" (未准备好), which is what
    # the two branches of the expression detect.
    description: "DaemonSet的一些Pods未安排或是未准备好。\n"
# Fires when a DaemonSet's misscheduled pod count stays above 0 for 1 minute.
- alert: KubernetesDaemonsetMisscheduled
  expr: >-
    kube_daemonset_status_number_misscheduled > 0
  for: 1m
  labels:
    severity: critical
    type: kube-state
  annotations:
    summary: Kubernetes DaemonSet misscheduled
    description: "一些DaemonSet Pod正在不应该运行的地方运行。\n"
# Fires immediately when the current time is more than 3600 seconds past
# kube_cronjob_next_schedule_time.
# The threshold should be customized for each cronjob name.
- alert: KubernetesCronjobTooLong
  expr: >-
    time() - kube_cronjob_next_schedule_time > 3600
  for: 0m
  labels:
    severity: warning
    type: kube-state
  annotations:
    summary: Kubernetes CronJob too long
    description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} 需要超过1小时才能完成。\n"
# Fires when a Job's spec completions minus its succeeded count stays above 0
# for 12 hours.
- alert: KubernetesJobSlowCompletion
  expr: >-
    kube_job_spec_completions - kube_job_status_succeeded > 0
  for: 12h
  labels:
    severity: critical
    type: kube-state
  annotations:
    summary: Kubernetes job slow completion
    description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} 未及时完成。\n"
# Fires immediately when a pod's container restart counter changed within the
# last 2 minutes and the pod started more than five minutes ago. Both sides
# are grouped by env, namespace and pod; the alert value is the number of
# restart-counter changes.
- alert: KubernetesPodRestarts
  # time() and kube_pod_start_time are Unix timestamps in seconds, so the
  # five-minute threshold stated in the description is 5*60 seconds.
  expr: >-
    sum(changes(kube_pod_container_status_restarts_total[2m]))by(env,namespace,pod) >0
    and sum(time()-kube_pod_start_time>5*60)by(env,namespace,pod)
  for: 0m
  labels:
    severity: warning
    type: kube-state
  annotations:
    summary: Kubernetes Pods 重启告警
    description: "Kubernetes Pod {{ $labels.namespace }}/{{ $labels.pod }} 正常启动五分钟后,被重启了{{ $value }}次.\n"