# mirror of https://github.com/outbackdingo/cozystack.git
# synced 2026-03-22 04:41:28 +00:00
# 35 lines, 2.5 KiB, YAML
---
# VMRule (VictoriaMetrics operator CRD) defining one alerting group that
# detects Pods stuck in a NotReady state while all of their containers are
# Ready — a symptom of a known kubelet status-reporting bug.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: alerts-node
  namespace: cozy-monitoring
spec:
  groups:
    - name: kubernetes.pod_status_incorrect
      rules:
        - alert: PodStatusIsIncorrect
          # Pods reported NotReady (kube_pod_status_ready{condition="true"} == 0)
          # that nevertheless have only Ready containers: the `unless` clause
          # drops any Pod that still has at least one NotReady container.
          # `* on (namespace, pod) group_left(node) kube_pod_info` attaches the
          # node label to each series so alerts can be grouped per node.
          expr: >-
            (count by (node, namespace, pod) (kube_pod_status_ready{condition="true"} == 0) * on (namespace, pod) group_left(node) (max by (namespace, node, pod) (kube_pod_info)))
            and
            (
            (count by (namespace, pod) (kube_pod_container_status_ready==1) * on (namespace, pod) group_left(node) (max by (namespace, node, pod) (kube_pod_info)))
            unless
            (count by (namespace, pod) (kube_pod_container_status_ready==0) * on (namespace, pod) group_left(node) (max by (namespace, node, pod) (kube_pod_info)))
            )
          # Require the condition to hold for 10 minutes before firing, to
          # ride out transient status flaps during rollouts.
          for: 10m
          annotations:
            plk_markup_format: markdown
            plk_protocol_version: "1"
            # Deckhouse/Polk grouping annotations: fold individual Pod alerts
            # into one NodeHavePodsWithIncorrectStatus group per node.
            plk_create_group_if_not_exists__node_have_pods_with_incorrect_status: "NodeHavePodsWithIncorrectStatus,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
            plk_grouped_by__node_have_pods_with_incorrect_status: "NodeHavePodsWithIncorrectStatus,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
            description: |
              There is a {{ $labels.namespace }}/{{ $labels.pod }} Pod in the cluster that runs on the {{ $labels.node }} and listed as NotReady while all the Pod's containers are Ready.

              This could be due to the [Kubernetes bug](https://github.com/kubernetes/kubernetes/issues/80968).

              The recommended course of action:
              1. Find all the Pods having this state: `kubectl get pod -o json --all-namespaces | jq '.items[] | select(.status.phase == "Running") | select(.status.conditions[] | select(.type == "ContainersReady" and .status == "True")) | select(.status.conditions[] | select(.type == "Ready" and .status == "False")) | "\(.spec.nodeName)/\(.metadata.namespace)/\(.metadata.name)"'`;
              2. Find all the Nodes affected: `kubectl get pod -o json --all-namespaces | jq '.items[] | select(.status.phase == "Running") | select(.status.conditions[] | select(.type == "ContainersReady" and .status == "True")) | select(.status.conditions[] | select(.type == "Ready" and .status == "False")) | .spec.nodeName' -r | sort | uniq -c`;
              3. Restart `kubelet` on each Node: `systemctl restart kubelet`.
            summary: The state of the {{ $labels.namespace }}/{{ $labels.pod }} Pod running on the {{ $labels.node }} Node is incorrect. You need to restart `kubelet`.