cozystack/packages/system/monitoring/alerts/pod-status.yaml

apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: alerts-node
  namespace: cozy-monitoring
spec:
  groups:
  - name: kubernetes.pod_status_incorrect
    rules:
      - alert: PodStatusIsIncorrect
        expr: >
          (count by (node, namespace, pod) (kube_pod_status_ready{condition="true"} == 0) * on (namespace, pod) group_left(node) (max by (namespace, node, pod) (kube_pod_info)))
          and
          (
            (count by (namespace, pod) (kube_pod_container_status_ready==1) * on (namespace, pod) group_left(node) (max by (namespace, node, pod) (kube_pod_info)))
            unless
            (count by (namespace, pod) (kube_pod_container_status_ready==0) * on (namespace, pod) group_left(node) (max by (namespace, node, pod) (kube_pod_info)))
          )
        for: 10m
        annotations:
          plk_markup_format: markdown
          plk_protocol_version: "1"
          plk_create_group_if_not_exists__node_have_pods_with_incorrect_status: "NodeHavePodsWithIncorrectStatus,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
          plk_grouped_by__node_have_pods_with_incorrect_status: "NodeHavePodsWithIncorrectStatus,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
          description: |
            There is a {{ $labels.namespace }}/{{ $labels.pod }} Pod in the cluster that runs on the {{ $labels.node }} and listed as NotReady while all the Pod's containers are Ready.

            This could be due to the [Kubernetes bug](https://github.com/kubernetes/kubernetes/issues/80968).

            The recommended course of action:
            1. Find all the Pods having this state: `kubectl get pod -o json --all-namespaces | jq '.items[] | select(.status.phase == "Running") | select(.status.conditions[] | select(.type == "ContainersReady" and .status == "True")) | select(.status.conditions[] | select(.type == "Ready" and .status == "False")) | "\(.spec.nodeName)/\(.metadata.namespace)/\(.metadata.name)"'`;
            2. Find all the Nodes affected: `kubectl get pod -o json --all-namespaces | jq '.items[] | select(.status.phase == "Running") | select(.status.conditions[] | select(.type == "ContainersReady" and .status == "True")) | select(.status.conditions[] | select(.type == "Ready" and .status == "False")) | .spec.nodeName' -r | sort | uniq -c`;
            3. Restart `kubelet` on each Node: `systemctl restart kubelet`.
          summary: The state of the {{ $labels.namespace }}/{{ $labels.pod }} Pod running on the {{ $labels.node }} Node is incorrect. You need to restart `kubelet`.