mirror of
				https://github.com/optim-enterprises-bv/kubernetes.git
				synced 2025-10-31 02:08:13 +00:00 
			
		
		
		
	Introduce a metric to track kubelet admission failure.
This commit is contained in:
		| @@ -29,6 +29,7 @@ import ( | |||||||
| 	sysruntime "runtime" | 	sysruntime "runtime" | ||||||
| 	"slices" | 	"slices" | ||||||
| 	"sort" | 	"sort" | ||||||
|  | 	"strings" | ||||||
| 	"sync" | 	"sync" | ||||||
| 	"sync/atomic" | 	"sync/atomic" | ||||||
| 	"time" | 	"time" | ||||||
| @@ -81,6 +82,7 @@ import ( | |||||||
| 	"k8s.io/kubernetes/pkg/kubelet/cloudresource" | 	"k8s.io/kubernetes/pkg/kubelet/cloudresource" | ||||||
| 	"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle" | 	"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle" | ||||||
| 	"k8s.io/kubernetes/pkg/kubelet/cm" | 	"k8s.io/kubernetes/pkg/kubelet/cm" | ||||||
|  | 	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" | ||||||
| 	"k8s.io/kubernetes/pkg/kubelet/config" | 	"k8s.io/kubernetes/pkg/kubelet/config" | ||||||
| 	"k8s.io/kubernetes/pkg/kubelet/configmap" | 	"k8s.io/kubernetes/pkg/kubelet/configmap" | ||||||
| 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" | 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" | ||||||
| @@ -120,6 +122,7 @@ import ( | |||||||
| 	"k8s.io/kubernetes/pkg/kubelet/volumemanager" | 	"k8s.io/kubernetes/pkg/kubelet/volumemanager" | ||||||
| 	"k8s.io/kubernetes/pkg/kubelet/watchdog" | 	"k8s.io/kubernetes/pkg/kubelet/watchdog" | ||||||
| 	httpprobe "k8s.io/kubernetes/pkg/probe/http" | 	httpprobe "k8s.io/kubernetes/pkg/probe/http" | ||||||
|  | 	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration" | ||||||
| 	"k8s.io/kubernetes/pkg/security/apparmor" | 	"k8s.io/kubernetes/pkg/security/apparmor" | ||||||
| 	"k8s.io/kubernetes/pkg/util/oom" | 	"k8s.io/kubernetes/pkg/util/oom" | ||||||
| 	"k8s.io/kubernetes/pkg/volume" | 	"k8s.io/kubernetes/pkg/volume" | ||||||
| @@ -220,6 +223,26 @@ var ( | |||||||
| 	// ContainerLogsDir can be overwritten for testing usage | 	// ContainerLogsDir can be overwritten for testing usage | ||||||
| 	ContainerLogsDir = DefaultContainerLogsDir | 	ContainerLogsDir = DefaultContainerLogsDir | ||||||
| 	etcHostsPath     = getContainerEtcHostsPath() | 	etcHostsPath     = getContainerEtcHostsPath() | ||||||
|  |  | ||||||
|  | 	admissionRejectionReasons = sets.New[string]( | ||||||
|  | 		lifecycle.AppArmorNotAdmittedReason, | ||||||
|  | 		lifecycle.PodOSSelectorNodeLabelDoesNotMatch, | ||||||
|  | 		lifecycle.PodOSNotSupported, | ||||||
|  | 		lifecycle.InvalidNodeInfo, | ||||||
|  | 		lifecycle.InitContainerRestartPolicyForbidden, | ||||||
|  | 		lifecycle.UnexpectedAdmissionError, | ||||||
|  | 		lifecycle.UnknownReason, | ||||||
|  | 		lifecycle.UnexpectedPredicateFailureType, | ||||||
|  | 		lifecycle.OutOfCPU, | ||||||
|  | 		lifecycle.OutOfMemory, | ||||||
|  | 		lifecycle.OutOfEphemeralStorage, | ||||||
|  | 		lifecycle.OutOfPods, | ||||||
|  | 		tainttoleration.ErrReasonNotMatch, | ||||||
|  | 		eviction.Reason, | ||||||
|  | 		sysctl.ForbiddenReason, | ||||||
|  | 		topologymanager.ErrorTopologyAffinity, | ||||||
|  | 		nodeshutdown.NodeShutdownNotAdmittedReason, | ||||||
|  | 	) | ||||||
| ) | ) | ||||||
|  |  | ||||||
| func getContainerEtcHostsPath() string { | func getContainerEtcHostsPath() string { | ||||||
| @@ -2304,7 +2327,6 @@ func (kl *Kubelet) canAdmitPod(allocatedPods []*v1.Pod, pod *v1.Pod) (bool, stri | |||||||
| 	attrs := &lifecycle.PodAdmitAttributes{Pod: pod, OtherPods: allocatedPods} | 	attrs := &lifecycle.PodAdmitAttributes{Pod: pod, OtherPods: allocatedPods} | ||||||
| 	for _, podAdmitHandler := range kl.admitHandlers { | 	for _, podAdmitHandler := range kl.admitHandlers { | ||||||
| 		if result := podAdmitHandler.Admit(attrs); !result.Admit { | 		if result := podAdmitHandler.Admit(attrs); !result.Admit { | ||||||
|  |  | ||||||
| 			klog.InfoS("Pod admission denied", "podUID", attrs.Pod.UID, "pod", klog.KObj(attrs.Pod), "reason", result.Reason, "message", result.Message) | 			klog.InfoS("Pod admission denied", "podUID", attrs.Pod.UID, "pod", klog.KObj(attrs.Pod), "reason", result.Reason, "message", result.Message) | ||||||
|  |  | ||||||
| 			return false, result.Reason, result.Message | 			return false, result.Reason, result.Message | ||||||
| @@ -2314,6 +2336,22 @@ func (kl *Kubelet) canAdmitPod(allocatedPods []*v1.Pod, pod *v1.Pod) (bool, stri | |||||||
| 	return true, "", "" | 	return true, "", "" | ||||||
| } | } | ||||||
|  |  | ||||||
|  | func recordAdmissionRejection(reason string) { | ||||||
|  | 	// The "reason" label can potentially have high cardinality. | ||||||
|  | 	// To prevent this metric from exploding, we keep an allowlist of known | ||||||
|  | 	// reasons and only record reasons from this list. Extended-resource | ||||||
|  | 	// shortages share one label, and anything else is recorded as "Other". | ||||||
|  | 	if admissionRejectionReasons.Has(reason) { | ||||||
|  | 		metrics.AdmissionRejectionsTotal.WithLabelValues(reason).Inc() | ||||||
|  | 	} else if strings.HasPrefix(reason, lifecycle.InsufficientResourcePrefix) { | ||||||
|  | 		// non-extended resources (like cpu, memory, ephemeral-storage, pods) | ||||||
|  | 		// are already included in admissionRejectionReasons. | ||||||
|  | 		metrics.AdmissionRejectionsTotal.WithLabelValues("OutOfExtendedResources").Inc() | ||||||
|  | 	} else { | ||||||
|  | 		metrics.AdmissionRejectionsTotal.WithLabelValues("Other").Inc() | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
| // syncLoop is the main loop for processing changes. It watches for changes from | // syncLoop is the main loop for processing changes. It watches for changes from | ||||||
| // three channels (file, apiserver, and http) and creates a union of them. For | // three channels (file, apiserver, and http) and creates a union of them. For | ||||||
| // any new change seen, will run a sync against desired state and running state. If | // any new change seen, will run a sync against desired state and running state. If | ||||||
| @@ -2584,6 +2622,11 @@ func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) { | |||||||
| 				// Check if we can admit the pod; if not, reject it. | 				// Check if we can admit the pod; if not, reject it. | ||||||
| 				if ok, reason, message := kl.canAdmitPod(allocatedPods, allocatedPod); !ok { | 				if ok, reason, message := kl.canAdmitPod(allocatedPods, allocatedPod); !ok { | ||||||
| 					kl.rejectPod(pod, reason, message) | 					kl.rejectPod(pod, reason, message) | ||||||
|  | 					// We avoid recording the metric in canAdmitPod because it's called | ||||||
|  | 					// repeatedly during a resize, which would inflate the metric. | ||||||
|  | 					// Instead, we record the metric here in HandlePodAdditions for new pods | ||||||
|  | 					// and capture resize events separately. | ||||||
|  | 					recordAdmissionRejection(reason) | ||||||
| 					continue | 					continue | ||||||
| 				} | 				} | ||||||
| 				// For new pod, checkpoint the resource values at which the Pod has been admitted | 				// For new pod, checkpoint the resource values at which the Pod has been admitted | ||||||
| @@ -2595,6 +2638,11 @@ func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) { | |||||||
| 				// Check if we can admit the pod; if not, reject it. | 				// Check if we can admit the pod; if not, reject it. | ||||||
| 				if ok, reason, message := kl.canAdmitPod(allocatedPods, pod); !ok { | 				if ok, reason, message := kl.canAdmitPod(allocatedPods, pod); !ok { | ||||||
| 					kl.rejectPod(pod, reason, message) | 					kl.rejectPod(pod, reason, message) | ||||||
|  | 					// We avoid recording the metric in canAdmitPod because it's called | ||||||
|  | 					// repeatedly during a resize, which would inflate the metric. | ||||||
|  | 					// Instead, we record the metric here in HandlePodAdditions for new pods | ||||||
|  | 					// and capture resize events separately. | ||||||
|  | 					recordAdmissionRejection(reason) | ||||||
| 					continue | 					continue | ||||||
| 				} | 				} | ||||||
| 			} | 			} | ||||||
|   | |||||||
| @@ -55,6 +55,7 @@ import ( | |||||||
| 	"k8s.io/client-go/tools/record" | 	"k8s.io/client-go/tools/record" | ||||||
| 	"k8s.io/client-go/util/flowcontrol" | 	"k8s.io/client-go/util/flowcontrol" | ||||||
| 	featuregatetesting "k8s.io/component-base/featuregate/testing" | 	featuregatetesting "k8s.io/component-base/featuregate/testing" | ||||||
|  | 	"k8s.io/component-base/metrics/testutil" | ||||||
| 	internalapi "k8s.io/cri-api/pkg/apis" | 	internalapi "k8s.io/cri-api/pkg/apis" | ||||||
| 	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" | 	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" | ||||||
| 	remote "k8s.io/cri-client/pkg" | 	remote "k8s.io/cri-client/pkg" | ||||||
| @@ -66,6 +67,7 @@ import ( | |||||||
| 	cadvisortest "k8s.io/kubernetes/pkg/kubelet/cadvisor/testing" | 	cadvisortest "k8s.io/kubernetes/pkg/kubelet/cadvisor/testing" | ||||||
| 	"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle" | 	"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle" | ||||||
| 	"k8s.io/kubernetes/pkg/kubelet/cm" | 	"k8s.io/kubernetes/pkg/kubelet/cm" | ||||||
|  | 	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" | ||||||
| 	"k8s.io/kubernetes/pkg/kubelet/config" | 	"k8s.io/kubernetes/pkg/kubelet/config" | ||||||
| 	"k8s.io/kubernetes/pkg/kubelet/configmap" | 	"k8s.io/kubernetes/pkg/kubelet/configmap" | ||||||
| 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" | 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" | ||||||
| @@ -75,6 +77,7 @@ import ( | |||||||
| 	"k8s.io/kubernetes/pkg/kubelet/kuberuntime" | 	"k8s.io/kubernetes/pkg/kubelet/kuberuntime" | ||||||
| 	"k8s.io/kubernetes/pkg/kubelet/lifecycle" | 	"k8s.io/kubernetes/pkg/kubelet/lifecycle" | ||||||
| 	"k8s.io/kubernetes/pkg/kubelet/logs" | 	"k8s.io/kubernetes/pkg/kubelet/logs" | ||||||
|  | 	"k8s.io/kubernetes/pkg/kubelet/metrics" | ||||||
| 	"k8s.io/kubernetes/pkg/kubelet/network/dns" | 	"k8s.io/kubernetes/pkg/kubelet/network/dns" | ||||||
| 	"k8s.io/kubernetes/pkg/kubelet/nodeshutdown" | 	"k8s.io/kubernetes/pkg/kubelet/nodeshutdown" | ||||||
| 	"k8s.io/kubernetes/pkg/kubelet/pleg" | 	"k8s.io/kubernetes/pkg/kubelet/pleg" | ||||||
| @@ -90,12 +93,14 @@ import ( | |||||||
| 	"k8s.io/kubernetes/pkg/kubelet/status" | 	"k8s.io/kubernetes/pkg/kubelet/status" | ||||||
| 	"k8s.io/kubernetes/pkg/kubelet/status/state" | 	"k8s.io/kubernetes/pkg/kubelet/status/state" | ||||||
| 	statustest "k8s.io/kubernetes/pkg/kubelet/status/testing" | 	statustest "k8s.io/kubernetes/pkg/kubelet/status/testing" | ||||||
|  | 	"k8s.io/kubernetes/pkg/kubelet/sysctl" | ||||||
| 	"k8s.io/kubernetes/pkg/kubelet/token" | 	"k8s.io/kubernetes/pkg/kubelet/token" | ||||||
| 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types" | 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types" | ||||||
| 	kubeletutil "k8s.io/kubernetes/pkg/kubelet/util" | 	kubeletutil "k8s.io/kubernetes/pkg/kubelet/util" | ||||||
| 	"k8s.io/kubernetes/pkg/kubelet/util/queue" | 	"k8s.io/kubernetes/pkg/kubelet/util/queue" | ||||||
| 	kubeletvolume "k8s.io/kubernetes/pkg/kubelet/volumemanager" | 	kubeletvolume "k8s.io/kubernetes/pkg/kubelet/volumemanager" | ||||||
| 	schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" | 	schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" | ||||||
|  | 	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration" | ||||||
| 	"k8s.io/kubernetes/pkg/util/oom" | 	"k8s.io/kubernetes/pkg/util/oom" | ||||||
| 	"k8s.io/kubernetes/pkg/volume" | 	"k8s.io/kubernetes/pkg/volume" | ||||||
| 	_ "k8s.io/kubernetes/pkg/volume/hostpath" | 	_ "k8s.io/kubernetes/pkg/volume/hostpath" | ||||||
| @@ -3460,3 +3465,200 @@ func TestIsPodResizeInProgress(t *testing.T) { | |||||||
| 		}) | 		}) | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
|  |  | ||||||
|  | func TestRecordAdmissionRejection(t *testing.T) { | ||||||
|  | 	metrics.Register() | ||||||
|  |  | ||||||
|  | 	testCases := []struct { | ||||||
|  | 		name   string | ||||||
|  | 		reason string | ||||||
|  | 		wants  string | ||||||
|  | 	}{ | ||||||
|  | 		{ | ||||||
|  | 			name:   "AppArmor", | ||||||
|  | 			reason: lifecycle.AppArmorNotAdmittedReason, | ||||||
|  | 			wants: ` | ||||||
|  | 				# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||||
|  | 				# TYPE kubelet_admission_rejections_total counter | ||||||
|  | 				kubelet_admission_rejections_total{reason="AppArmor"} 1 | ||||||
|  | 			`, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			name:   "PodOSSelectorNodeLabelDoesNotMatch", | ||||||
|  | 			reason: lifecycle.PodOSSelectorNodeLabelDoesNotMatch, | ||||||
|  | 			wants: ` | ||||||
|  |                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||||
|  |                 # TYPE kubelet_admission_rejections_total counter | ||||||
|  |                 kubelet_admission_rejections_total{reason="PodOSSelectorNodeLabelDoesNotMatch"} 1 | ||||||
|  |             `, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			name:   "PodOSNotSupported", | ||||||
|  | 			reason: lifecycle.PodOSNotSupported, | ||||||
|  | 			wants: ` | ||||||
|  |                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||||
|  |                 # TYPE kubelet_admission_rejections_total counter | ||||||
|  |                 kubelet_admission_rejections_total{reason="PodOSNotSupported"} 1 | ||||||
|  |             `, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			name:   "InvalidNodeInfo", | ||||||
|  | 			reason: lifecycle.InvalidNodeInfo, | ||||||
|  | 			wants: ` | ||||||
|  |                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||||
|  |                 # TYPE kubelet_admission_rejections_total counter | ||||||
|  |                 kubelet_admission_rejections_total{reason="InvalidNodeInfo"} 1 | ||||||
|  |             `, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			name:   "InitContainerRestartPolicyForbidden", | ||||||
|  | 			reason: lifecycle.InitContainerRestartPolicyForbidden, | ||||||
|  | 			wants: ` | ||||||
|  |                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||||
|  |                 # TYPE kubelet_admission_rejections_total counter | ||||||
|  |                 kubelet_admission_rejections_total{reason="InitContainerRestartPolicyForbidden"} 1 | ||||||
|  |             `, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			name:   "UnexpectedAdmissionError", | ||||||
|  | 			reason: lifecycle.UnexpectedAdmissionError, | ||||||
|  | 			wants: ` | ||||||
|  |                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||||
|  |                 # TYPE kubelet_admission_rejections_total counter | ||||||
|  |                 kubelet_admission_rejections_total{reason="UnexpectedAdmissionError"} 1 | ||||||
|  |             `, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			name:   "UnknownReason", | ||||||
|  | 			reason: lifecycle.UnknownReason, | ||||||
|  | 			wants: ` | ||||||
|  |                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||||
|  |                 # TYPE kubelet_admission_rejections_total counter | ||||||
|  |                 kubelet_admission_rejections_total{reason="UnknownReason"} 1 | ||||||
|  |             `, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			name:   "UnexpectedPredicateFailureType", | ||||||
|  | 			reason: lifecycle.UnexpectedPredicateFailureType, | ||||||
|  | 			wants: ` | ||||||
|  |                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||||
|  |                 # TYPE kubelet_admission_rejections_total counter | ||||||
|  |                 kubelet_admission_rejections_total{reason="UnexpectedPredicateFailureType"} 1 | ||||||
|  |             `, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			name:   "node(s) had taints that the pod didn't tolerate", | ||||||
|  | 			reason: tainttoleration.ErrReasonNotMatch, | ||||||
|  | 			wants: ` | ||||||
|  |                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||||
|  |                 # TYPE kubelet_admission_rejections_total counter | ||||||
|  |                 kubelet_admission_rejections_total{reason="node(s) had taints that the pod didn't tolerate"} 1 | ||||||
|  |             `, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			name:   "Evicted", | ||||||
|  | 			reason: eviction.Reason, | ||||||
|  | 			wants: ` | ||||||
|  |                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||||
|  |                 # TYPE kubelet_admission_rejections_total counter | ||||||
|  |                 kubelet_admission_rejections_total{reason="Evicted"} 1 | ||||||
|  |             `, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			name:   "SysctlForbidden", | ||||||
|  | 			reason: sysctl.ForbiddenReason, | ||||||
|  | 			wants: ` | ||||||
|  |                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||||
|  |                 # TYPE kubelet_admission_rejections_total counter | ||||||
|  |                 kubelet_admission_rejections_total{reason="SysctlForbidden"} 1 | ||||||
|  |             `, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			name:   "TopologyAffinityError", | ||||||
|  | 			reason: topologymanager.ErrorTopologyAffinity, | ||||||
|  | 			wants: ` | ||||||
|  |                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||||
|  |                 # TYPE kubelet_admission_rejections_total counter | ||||||
|  |                 kubelet_admission_rejections_total{reason="TopologyAffinityError"} 1 | ||||||
|  |             `, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			name:   "NodeShutdown", | ||||||
|  | 			reason: nodeshutdown.NodeShutdownNotAdmittedReason, | ||||||
|  | 			wants: ` | ||||||
|  |                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||||
|  |                 # TYPE kubelet_admission_rejections_total counter | ||||||
|  |                 kubelet_admission_rejections_total{reason="NodeShutdown"} 1 | ||||||
|  |             `, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			name:   "OutOfcpu", | ||||||
|  | 			reason: "OutOfcpu", | ||||||
|  | 			wants: ` | ||||||
|  |                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||||
|  |                 # TYPE kubelet_admission_rejections_total counter | ||||||
|  |                 kubelet_admission_rejections_total{reason="OutOfcpu"} 1 | ||||||
|  |             `, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			name:   "OutOfmemory", | ||||||
|  | 			reason: "OutOfmemory", | ||||||
|  | 			wants: ` | ||||||
|  |                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||||
|  |                 # TYPE kubelet_admission_rejections_total counter | ||||||
|  |                 kubelet_admission_rejections_total{reason="OutOfmemory"} 1 | ||||||
|  |             `, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			name:   "OutOfephemeral-storage", | ||||||
|  | 			reason: "OutOfephemeral-storage", | ||||||
|  | 			wants: ` | ||||||
|  |                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||||
|  |                 # TYPE kubelet_admission_rejections_total counter | ||||||
|  |                 kubelet_admission_rejections_total{reason="OutOfephemeral-storage"} 1 | ||||||
|  |             `, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			name:   "OutOfpods", | ||||||
|  | 			reason: "OutOfpods", | ||||||
|  | 			wants: ` | ||||||
|  |                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||||
|  |                 # TYPE kubelet_admission_rejections_total counter | ||||||
|  |                 kubelet_admission_rejections_total{reason="OutOfpods"} 1 | ||||||
|  |             `, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			name:   "OutOfgpu", | ||||||
|  | 			reason: "OutOfgpu", | ||||||
|  | 			wants: ` | ||||||
|  |                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||||
|  |                 # TYPE kubelet_admission_rejections_total counter | ||||||
|  |                 kubelet_admission_rejections_total{reason="OutOfExtendedResources"} 1 | ||||||
|  |             `, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			name:   "OtherReason", | ||||||
|  | 			reason: "OtherReason", | ||||||
|  | 			wants: ` | ||||||
|  |                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||||
|  |                 # TYPE kubelet_admission_rejections_total counter | ||||||
|  |                 kubelet_admission_rejections_total{reason="Other"} 1 | ||||||
|  |             `, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	// Run tests. | ||||||
|  | 	for _, tc := range testCases { | ||||||
|  | 		t.Run(tc.name, func(t *testing.T) { | ||||||
|  | 			// Reset the metric before each case so earlier cases don't affect the count. | ||||||
|  | 			metrics.AdmissionRejectionsTotal.Reset() | ||||||
|  |  | ||||||
|  | 			// Call the function. | ||||||
|  | 			recordAdmissionRejection(tc.reason) | ||||||
|  |  | ||||||
|  | 			if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(tc.wants), "kubelet_admission_rejections_total"); err != nil { | ||||||
|  | 				t.Error(err) | ||||||
|  | 			} | ||||||
|  | 		}) | ||||||
|  | 	} | ||||||
|  | } | ||||||
|   | |||||||
| @@ -44,6 +44,8 @@ import ( | |||||||
|  |  | ||||||
| const ( | const ( | ||||||
| 	maxRespBodyLength = 10 * 1 << 10 // 10KB | 	maxRespBodyLength = 10 * 1 << 10 // 10KB | ||||||
|  |  | ||||||
|  | 	AppArmorNotAdmittedReason = "AppArmor" | ||||||
| ) | ) | ||||||
|  |  | ||||||
| type handlerRunner struct { | type handlerRunner struct { | ||||||
| @@ -224,7 +226,7 @@ func (a *appArmorAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult { | |||||||
| 	} | 	} | ||||||
| 	return PodAdmitResult{ | 	return PodAdmitResult{ | ||||||
| 		Admit:   false, | 		Admit:   false, | ||||||
| 		Reason:  "AppArmor", | 		Reason:  AppArmorNotAdmittedReason, | ||||||
| 		Message: fmt.Sprintf("Cannot enforce AppArmor: %v", err), | 		Message: fmt.Sprintf("Cannot enforce AppArmor: %v", err), | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
|   | |||||||
| @@ -32,6 +32,52 @@ import ( | |||||||
| 	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration" | 	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration" | ||||||
| ) | ) | ||||||
|  |  | ||||||
|  | const ( | ||||||
|  | 	// PodOSSelectorNodeLabelDoesNotMatch is used to denote that the pod was | ||||||
|  | 	// rejected admission to the node because the pod's node selector | ||||||
|  | 	// corresponding to kubernetes.io/os label didn't match the node label. | ||||||
|  | 	PodOSSelectorNodeLabelDoesNotMatch = "PodOSSelectorNodeLabelDoesNotMatch" | ||||||
|  |  | ||||||
|  | 	// PodOSNotSupported is used to denote that the pod was rejected admission | ||||||
|  | 	// to the node because the pod's OS field didn't match the node OS. | ||||||
|  | 	PodOSNotSupported = "PodOSNotSupported" | ||||||
|  |  | ||||||
|  | 	// InvalidNodeInfo is used to denote that the pod was rejected admission | ||||||
|  | 	// to the node because the kubelet was unable to retrieve the node info. | ||||||
|  | 	InvalidNodeInfo = "InvalidNodeInfo" | ||||||
|  |  | ||||||
|  | 	// InitContainerRestartPolicyForbidden is used to denote that the pod was | ||||||
|  | 	// rejected admission to the node because it uses a restart policy other | ||||||
|  | 	// than Always for some of its init containers. | ||||||
|  | 	InitContainerRestartPolicyForbidden = "InitContainerRestartPolicyForbidden" | ||||||
|  |  | ||||||
|  | 	// UnexpectedAdmissionError is used to denote that the pod was rejected | ||||||
|  | 	// admission to the node because of an error during admission that could not | ||||||
|  | 	// be categorized. | ||||||
|  | 	UnexpectedAdmissionError = "UnexpectedAdmissionError" | ||||||
|  |  | ||||||
|  | 	// UnknownReason is used to denote that the pod was rejected admission to | ||||||
|  | 	// the node because a predicate failed for a reason that could not be | ||||||
|  | 	// determined. | ||||||
|  | 	UnknownReason = "UnknownReason" | ||||||
|  |  | ||||||
|  | 	// UnexpectedPredicateFailureType is used to denote that the pod was | ||||||
|  | 	// rejected admission to the node because a predicate returned a reason | ||||||
|  | 	// object that was not an InsufficientResourceError or a PredicateFailureError. | ||||||
|  | 	UnexpectedPredicateFailureType = "UnexpectedPredicateFailureType" | ||||||
|  |  | ||||||
|  | 	// InsufficientResourcePrefix is the prefix of the admission reason used | ||||||
|  | 	// when the kubelet rejects a pod due to insufficient available resources. | ||||||
|  | 	InsufficientResourcePrefix = "OutOf" | ||||||
|  |  | ||||||
|  | 	// These reasons are used to denote that the pod was rejected admission | ||||||
|  | 	// to the node because there are not enough resources to run the pod. | ||||||
|  | 	OutOfCPU              = "OutOfcpu" | ||||||
|  | 	OutOfMemory           = "OutOfmemory" | ||||||
|  | 	OutOfEphemeralStorage = "OutOfephemeral-storage" | ||||||
|  | 	OutOfPods             = "OutOfpods" | ||||||
|  | ) | ||||||
|  |  | ||||||
| type getNodeAnyWayFuncType func() (*v1.Node, error) | type getNodeAnyWayFuncType func() (*v1.Node, error) | ||||||
|  |  | ||||||
| type pluginResourceUpdateFuncType func(*schedulerframework.NodeInfo, *PodAdmitAttributes) error | type pluginResourceUpdateFuncType func(*schedulerframework.NodeInfo, *PodAdmitAttributes) error | ||||||
| @@ -66,7 +112,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult | |||||||
| 		klog.ErrorS(err, "Cannot get Node info") | 		klog.ErrorS(err, "Cannot get Node info") | ||||||
| 		return PodAdmitResult{ | 		return PodAdmitResult{ | ||||||
| 			Admit:   false, | 			Admit:   false, | ||||||
| 			Reason:  "InvalidNodeInfo", | 			Reason:  InvalidNodeInfo, | ||||||
| 			Message: "Kubelet cannot get node info.", | 			Message: "Kubelet cannot get node info.", | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| @@ -76,14 +122,14 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult | |||||||
| 	if rejectPodAdmissionBasedOnOSSelector(admitPod, node) { | 	if rejectPodAdmissionBasedOnOSSelector(admitPod, node) { | ||||||
| 		return PodAdmitResult{ | 		return PodAdmitResult{ | ||||||
| 			Admit:   false, | 			Admit:   false, | ||||||
| 			Reason:  "PodOSSelectorNodeLabelDoesNotMatch", | 			Reason:  PodOSSelectorNodeLabelDoesNotMatch, | ||||||
| 			Message: "Failed to admit pod as the `kubernetes.io/os` label doesn't match node label", | 			Message: "Failed to admit pod as the `kubernetes.io/os` label doesn't match node label", | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 	if rejectPodAdmissionBasedOnOSField(admitPod) { | 	if rejectPodAdmissionBasedOnOSField(admitPod) { | ||||||
| 		return PodAdmitResult{ | 		return PodAdmitResult{ | ||||||
| 			Admit:   false, | 			Admit:   false, | ||||||
| 			Reason:  "PodOSNotSupported", | 			Reason:  PodOSNotSupported, | ||||||
| 			Message: "Failed to admit pod as the OS field doesn't match node OS", | 			Message: "Failed to admit pod as the OS field doesn't match node OS", | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| @@ -100,7 +146,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult | |||||||
| 				klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "message", message) | 				klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "message", message) | ||||||
| 				return PodAdmitResult{ | 				return PodAdmitResult{ | ||||||
| 					Admit:   false, | 					Admit:   false, | ||||||
| 					Reason:  "InitContainerRestartPolicyForbidden", | 					Reason:  InitContainerRestartPolicyForbidden, | ||||||
| 					Message: message, | 					Message: message, | ||||||
| 				} | 				} | ||||||
| 			} | 			} | ||||||
| @@ -113,7 +159,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult | |||||||
| 		klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "message", message) | 		klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "message", message) | ||||||
| 		return PodAdmitResult{ | 		return PodAdmitResult{ | ||||||
| 			Admit:   false, | 			Admit:   false, | ||||||
| 			Reason:  "UnexpectedAdmissionError", | 			Reason:  UnexpectedAdmissionError, | ||||||
| 			Message: message, | 			Message: message, | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| @@ -138,7 +184,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult | |||||||
| 			klog.InfoS("Failed to admit pod, unexpected error while attempting to recover from admission failure", "pod", klog.KObj(admitPod), "err", err) | 			klog.InfoS("Failed to admit pod, unexpected error while attempting to recover from admission failure", "pod", klog.KObj(admitPod), "err", err) | ||||||
| 			return PodAdmitResult{ | 			return PodAdmitResult{ | ||||||
| 				Admit:   fit, | 				Admit:   fit, | ||||||
| 				Reason:  "UnexpectedAdmissionError", | 				Reason:  UnexpectedAdmissionError, | ||||||
| 				Message: message, | 				Message: message, | ||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
| @@ -151,7 +197,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult | |||||||
| 			klog.InfoS("Failed to admit pod: GeneralPredicates failed due to unknown reason, which is unexpected", "pod", klog.KObj(admitPod)) | 			klog.InfoS("Failed to admit pod: GeneralPredicates failed due to unknown reason, which is unexpected", "pod", klog.KObj(admitPod)) | ||||||
| 			return PodAdmitResult{ | 			return PodAdmitResult{ | ||||||
| 				Admit:   fit, | 				Admit:   fit, | ||||||
| 				Reason:  "UnknownReason", | 				Reason:  UnknownReason, | ||||||
| 				Message: message, | 				Message: message, | ||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
| @@ -163,11 +209,22 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult | |||||||
| 			message = re.Error() | 			message = re.Error() | ||||||
| 			klog.V(2).InfoS("Predicate failed on Pod", "pod", klog.KObj(admitPod), "err", message) | 			klog.V(2).InfoS("Predicate failed on Pod", "pod", klog.KObj(admitPod), "err", message) | ||||||
| 		case *InsufficientResourceError: | 		case *InsufficientResourceError: | ||||||
| 			reason = fmt.Sprintf("OutOf%s", re.ResourceName) | 			switch re.ResourceName { | ||||||
|  | 			case v1.ResourceCPU: | ||||||
|  | 				reason = OutOfCPU | ||||||
|  | 			case v1.ResourceMemory: | ||||||
|  | 				reason = OutOfMemory | ||||||
|  | 			case v1.ResourceEphemeralStorage: | ||||||
|  | 				reason = OutOfEphemeralStorage | ||||||
|  | 			case v1.ResourcePods: | ||||||
|  | 				reason = OutOfPods | ||||||
|  | 			default: | ||||||
|  | 				reason = fmt.Sprintf("%s%s", InsufficientResourcePrefix, re.ResourceName) | ||||||
|  | 			} | ||||||
| 			message = re.Error() | 			message = re.Error() | ||||||
| 			klog.V(2).InfoS("Predicate failed on Pod", "pod", klog.KObj(admitPod), "err", message) | 			klog.V(2).InfoS("Predicate failed on Pod", "pod", klog.KObj(admitPod), "err", message) | ||||||
| 		default: | 		default: | ||||||
| 			reason = "UnexpectedPredicateFailureType" | 			reason = UnexpectedPredicateFailureType | ||||||
| 			message = fmt.Sprintf("GeneralPredicates failed due to %v, which is unexpected.", r) | 			message = fmt.Sprintf("GeneralPredicates failed due to %v, which is unexpected.", r) | ||||||
| 			klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "err", message) | 			klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "err", message) | ||||||
| 		} | 		} | ||||||
|   | |||||||
| @@ -149,6 +149,9 @@ const ( | |||||||
|  |  | ||||||
| 	AlignedPhysicalCPU = "physical_cpu" | 	AlignedPhysicalCPU = "physical_cpu" | ||||||
| 	AlignedNUMANode    = "numa_node" | 	AlignedNUMANode    = "numa_node" | ||||||
|  |  | ||||||
|  | 	// Metrics to track kubelet admission rejections. | ||||||
|  | 	AdmissionRejectionsTotalKey = "admission_rejections_total" | ||||||
| ) | ) | ||||||
|  |  | ||||||
| type imageSizeBucket struct { | type imageSizeBucket struct { | ||||||
| @@ -994,6 +997,17 @@ var ( | |||||||
| 		}, | 		}, | ||||||
| 		[]string{"driver_name", "method_name", "grpc_status_code"}, | 		[]string{"driver_name", "method_name", "grpc_status_code"}, | ||||||
| 	) | 	) | ||||||
|  |  | ||||||
|  | 	// AdmissionRejectionsTotal counts the pod admissions rejected by the kubelet; currently it is recorded only for pod additions. | ||||||
|  | 	AdmissionRejectionsTotal = metrics.NewCounterVec( | ||||||
|  | 		&metrics.CounterOpts{ | ||||||
|  | 			Subsystem:      KubeletSubsystem, | ||||||
|  | 			Name:           AdmissionRejectionsTotalKey, | ||||||
|  | 			Help:           "Cumulative number pod admission rejections by the Kubelet.", | ||||||
|  | 			StabilityLevel: metrics.ALPHA, | ||||||
|  | 		}, | ||||||
|  | 		[]string{"reason"}, | ||||||
|  | 	) | ||||||
| ) | ) | ||||||
|  |  | ||||||
| var registerMetrics sync.Once | var registerMetrics sync.Once | ||||||
| @@ -1091,6 +1105,8 @@ func Register(collectors ...metrics.StableCollector) { | |||||||
| 			legacyregistry.MustRegister(DRAOperationsDuration) | 			legacyregistry.MustRegister(DRAOperationsDuration) | ||||||
| 			legacyregistry.MustRegister(DRAGRPCOperationsDuration) | 			legacyregistry.MustRegister(DRAGRPCOperationsDuration) | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
|  | 		legacyregistry.MustRegister(AdmissionRejectionsTotal) | ||||||
| 	}) | 	}) | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -83,6 +83,10 @@ func (managerStub) ShutdownStatus() error { | |||||||
| } | } | ||||||
|  |  | ||||||
| const ( | const ( | ||||||
|  | 	NodeShutdownNotAdmittedReason  = "NodeShutdown" | ||||||
|  | 	nodeShutdownNotAdmittedMessage = "Pod was rejected as the node is shutting down." | ||||||
|  | 	localStorageStateFile          = "graceful_node_shutdown_state" | ||||||
|  |  | ||||||
| 	nodeShutdownReason  = "Terminated" | 	nodeShutdownReason  = "Terminated" | ||||||
| 	nodeShutdownMessage = "Pod was terminated in response to imminent node shutdown." | 	nodeShutdownMessage = "Pod was terminated in response to imminent node shutdown." | ||||||
| ) | ) | ||||||
|   | |||||||
| @@ -40,10 +40,7 @@ import ( | |||||||
| ) | ) | ||||||
|  |  | ||||||
| const ( | const ( | ||||||
| 	nodeShutdownNotAdmittedReason  = "NodeShutdown" | 	dbusReconnectPeriod = 1 * time.Second | ||||||
| 	nodeShutdownNotAdmittedMessage = "Pod was rejected as the node is shutting down." |  | ||||||
| 	dbusReconnectPeriod            = 1 * time.Second |  | ||||||
| 	localStorageStateFile          = "graceful_node_shutdown_state" |  | ||||||
| ) | ) | ||||||
|  |  | ||||||
| var systemDbus = func() (dbusInhibiter, error) { | var systemDbus = func() (dbusInhibiter, error) { | ||||||
| @@ -123,7 +120,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd | |||||||
| 	if nodeShuttingDown { | 	if nodeShuttingDown { | ||||||
| 		return lifecycle.PodAdmitResult{ | 		return lifecycle.PodAdmitResult{ | ||||||
| 			Admit:   false, | 			Admit:   false, | ||||||
| 			Reason:  nodeShutdownNotAdmittedReason, | 			Reason:  NodeShutdownNotAdmittedReason, | ||||||
| 			Message: nodeShutdownNotAdmittedMessage, | 			Message: nodeShutdownNotAdmittedMessage, | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
|   | |||||||
| @@ -51,12 +51,6 @@ const ( | |||||||
| 	shutdownOrderStringValue = "PreshutdownOrder" | 	shutdownOrderStringValue = "PreshutdownOrder" | ||||||
| ) | ) | ||||||
|  |  | ||||||
| const ( |  | ||||||
| 	nodeShutdownNotAdmittedReason  = "NodeShutdown" |  | ||||||
| 	nodeShutdownNotAdmittedMessage = "Pod was rejected as the node is shutting down." |  | ||||||
| 	localStorageStateFile          = "graceful_node_shutdown_state" |  | ||||||
| ) |  | ||||||
|  |  | ||||||
| // managerImpl has functions that can be used to interact with the Node Shutdown Manager. | // managerImpl has functions that can be used to interact with the Node Shutdown Manager. | ||||||
| type managerImpl struct { | type managerImpl struct { | ||||||
| 	logger       klog.Logger | 	logger       klog.Logger | ||||||
| @@ -120,7 +114,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd | |||||||
| 	if nodeShuttingDown { | 	if nodeShuttingDown { | ||||||
| 		return lifecycle.PodAdmitResult{ | 		return lifecycle.PodAdmitResult{ | ||||||
| 			Admit:   false, | 			Admit:   false, | ||||||
| 			Reason:  nodeShutdownNotAdmittedReason, | 			Reason:  NodeShutdownNotAdmittedReason, | ||||||
| 			Message: nodeShutdownNotAdmittedMessage, | 			Message: nodeShutdownNotAdmittedMessage, | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Anish Shah
					Anish Shah