mirror of
				https://github.com/optim-enterprises-bv/kubernetes.git
				synced 2025-10-31 10:18:13 +00:00 
			
		
		
		
	Introduce a metric to track kubelet admission failure.
This commit is contained in:
		| @@ -29,6 +29,7 @@ import ( | ||||
| 	sysruntime "runtime" | ||||
| 	"slices" | ||||
| 	"sort" | ||||
| 	"strings" | ||||
| 	"sync" | ||||
| 	"sync/atomic" | ||||
| 	"time" | ||||
| @@ -81,6 +82,7 @@ import ( | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/cloudresource" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/cm" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/config" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/configmap" | ||||
| 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" | ||||
| @@ -120,6 +122,7 @@ import ( | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/volumemanager" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/watchdog" | ||||
| 	httpprobe "k8s.io/kubernetes/pkg/probe/http" | ||||
| 	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration" | ||||
| 	"k8s.io/kubernetes/pkg/security/apparmor" | ||||
| 	"k8s.io/kubernetes/pkg/util/oom" | ||||
| 	"k8s.io/kubernetes/pkg/volume" | ||||
| @@ -220,6 +223,26 @@ var ( | ||||
| 	// ContainerLogsDir can be overwritten for testing usage | ||||
| 	ContainerLogsDir = DefaultContainerLogsDir | ||||
| 	etcHostsPath     = getContainerEtcHostsPath() | ||||
|  | ||||
| 	admissionRejectionReasons = sets.New[string]( | ||||
| 		lifecycle.AppArmorNotAdmittedReason, | ||||
| 		lifecycle.PodOSSelectorNodeLabelDoesNotMatch, | ||||
| 		lifecycle.PodOSNotSupported, | ||||
| 		lifecycle.InvalidNodeInfo, | ||||
| 		lifecycle.InitContainerRestartPolicyForbidden, | ||||
| 		lifecycle.UnexpectedAdmissionError, | ||||
| 		lifecycle.UnknownReason, | ||||
| 		lifecycle.UnexpectedPredicateFailureType, | ||||
| 		lifecycle.OutOfCPU, | ||||
| 		lifecycle.OutOfMemory, | ||||
| 		lifecycle.OutOfEphemeralStorage, | ||||
| 		lifecycle.OutOfPods, | ||||
| 		tainttoleration.ErrReasonNotMatch, | ||||
| 		eviction.Reason, | ||||
| 		sysctl.ForbiddenReason, | ||||
| 		topologymanager.ErrorTopologyAffinity, | ||||
| 		nodeshutdown.NodeShutdownNotAdmittedReason, | ||||
| 	) | ||||
| ) | ||||
|  | ||||
| func getContainerEtcHostsPath() string { | ||||
| @@ -2304,7 +2327,6 @@ func (kl *Kubelet) canAdmitPod(allocatedPods []*v1.Pod, pod *v1.Pod) (bool, stri | ||||
| 	attrs := &lifecycle.PodAdmitAttributes{Pod: pod, OtherPods: allocatedPods} | ||||
| 	for _, podAdmitHandler := range kl.admitHandlers { | ||||
| 		if result := podAdmitHandler.Admit(attrs); !result.Admit { | ||||
|  | ||||
| 			klog.InfoS("Pod admission denied", "podUID", attrs.Pod.UID, "pod", klog.KObj(attrs.Pod), "reason", result.Reason, "message", result.Message) | ||||
|  | ||||
| 			return false, result.Reason, result.Message | ||||
| @@ -2314,6 +2336,22 @@ func (kl *Kubelet) canAdmitPod(allocatedPods []*v1.Pod, pod *v1.Pod) (bool, stri | ||||
| 	return true, "", "" | ||||
| } | ||||
|  | ||||
| func recordAdmissionRejection(reason string) { | ||||
| 	// It is possible that the "reason" label can have high cardinality. | ||||
| 	// To avoid this metric from exploding, we create an allowlist of known | ||||
| 	// reasons, and only record reasons from this list. Use "Other" reason | ||||
| 	// for the rest. | ||||
| 	if admissionRejectionReasons.Has(reason) { | ||||
| 		metrics.AdmissionRejectionsTotal.WithLabelValues(reason).Inc() | ||||
| 	} else if strings.HasPrefix(reason, lifecycle.InsufficientResourcePrefix) { | ||||
| 		// non-extended resources (like cpu, memory, ephemeral-storage, pods) | ||||
| 		// are already included in admissionRejectionReasons. | ||||
| 		metrics.AdmissionRejectionsTotal.WithLabelValues("OutOfExtendedResources").Inc() | ||||
| 	} else { | ||||
| 		metrics.AdmissionRejectionsTotal.WithLabelValues("Other").Inc() | ||||
| 	} | ||||
| } | ||||
|  | ||||
| // syncLoop is the main loop for processing changes. It watches for changes from | ||||
| // three channels (file, apiserver, and http) and creates a union of them. For | ||||
| // any new change seen, will run a sync against desired state and running state. If | ||||
| @@ -2584,6 +2622,11 @@ func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) { | ||||
| 				// Check if we can admit the pod; if not, reject it. | ||||
| 				if ok, reason, message := kl.canAdmitPod(allocatedPods, allocatedPod); !ok { | ||||
| 					kl.rejectPod(pod, reason, message) | ||||
| 					// We avoid recording the metric in canAdmitPod because it's called | ||||
| 					// repeatedly during a resize, which would inflate the metric. | ||||
| 					// Instead, we record the metric here in HandlePodAdditions for new pods | ||||
| 					// and capture resize events separately. | ||||
| 					recordAdmissionRejection(reason) | ||||
| 					continue | ||||
| 				} | ||||
| 				// For new pod, checkpoint the resource values at which the Pod has been admitted | ||||
| @@ -2595,6 +2638,11 @@ func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) { | ||||
| 				// Check if we can admit the pod; if not, reject it. | ||||
| 				if ok, reason, message := kl.canAdmitPod(allocatedPods, pod); !ok { | ||||
| 					kl.rejectPod(pod, reason, message) | ||||
| 					// We avoid recording the metric in canAdmitPod because it's called | ||||
| 					// repeatedly during a resize, which would inflate the metric. | ||||
| 					// Instead, we record the metric here in HandlePodAdditions for new pods | ||||
| 					// and capture resize events separately. | ||||
| 					recordAdmissionRejection(reason) | ||||
| 					continue | ||||
| 				} | ||||
| 			} | ||||
|   | ||||
| @@ -55,6 +55,7 @@ import ( | ||||
| 	"k8s.io/client-go/tools/record" | ||||
| 	"k8s.io/client-go/util/flowcontrol" | ||||
| 	featuregatetesting "k8s.io/component-base/featuregate/testing" | ||||
| 	"k8s.io/component-base/metrics/testutil" | ||||
| 	internalapi "k8s.io/cri-api/pkg/apis" | ||||
| 	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" | ||||
| 	remote "k8s.io/cri-client/pkg" | ||||
| @@ -66,6 +67,7 @@ import ( | ||||
| 	cadvisortest "k8s.io/kubernetes/pkg/kubelet/cadvisor/testing" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/cm" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/config" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/configmap" | ||||
| 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" | ||||
| @@ -75,6 +77,7 @@ import ( | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/kuberuntime" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/lifecycle" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/logs" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/metrics" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/network/dns" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/nodeshutdown" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/pleg" | ||||
| @@ -90,12 +93,14 @@ import ( | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/status" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/status/state" | ||||
| 	statustest "k8s.io/kubernetes/pkg/kubelet/status/testing" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/sysctl" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/token" | ||||
| 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types" | ||||
| 	kubeletutil "k8s.io/kubernetes/pkg/kubelet/util" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/util/queue" | ||||
| 	kubeletvolume "k8s.io/kubernetes/pkg/kubelet/volumemanager" | ||||
| 	schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" | ||||
| 	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration" | ||||
| 	"k8s.io/kubernetes/pkg/util/oom" | ||||
| 	"k8s.io/kubernetes/pkg/volume" | ||||
| 	_ "k8s.io/kubernetes/pkg/volume/hostpath" | ||||
| @@ -3460,3 +3465,200 @@ func TestIsPodResizeInProgress(t *testing.T) { | ||||
| 		}) | ||||
| 	} | ||||
| } | ||||
|  | ||||
| func TestRecordAdmissionRejection(t *testing.T) { | ||||
| 	metrics.Register() | ||||
|  | ||||
| 	testCases := []struct { | ||||
| 		name   string | ||||
| 		reason string | ||||
| 		wants  string | ||||
| 	}{ | ||||
| 		{ | ||||
| 			name:   "AppArmor", | ||||
| 			reason: lifecycle.AppArmorNotAdmittedReason, | ||||
| 			wants: ` | ||||
| 				# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||
| 				# TYPE kubelet_admission_rejections_total counter | ||||
| 				kubelet_admission_rejections_total{reason="AppArmor"} 1 | ||||
| 			`, | ||||
| 		}, | ||||
| 		{ | ||||
| 			name:   "PodOSSelectorNodeLabelDoesNotMatch", | ||||
| 			reason: lifecycle.PodOSSelectorNodeLabelDoesNotMatch, | ||||
| 			wants: ` | ||||
|                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||
|                 # TYPE kubelet_admission_rejections_total counter | ||||
|                 kubelet_admission_rejections_total{reason="PodOSSelectorNodeLabelDoesNotMatch"} 1 | ||||
|             `, | ||||
| 		}, | ||||
| 		{ | ||||
| 			name:   "PodOSNotSupported", | ||||
| 			reason: lifecycle.PodOSNotSupported, | ||||
| 			wants: ` | ||||
|                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||
|                 # TYPE kubelet_admission_rejections_total counter | ||||
|                 kubelet_admission_rejections_total{reason="PodOSNotSupported"} 1 | ||||
|             `, | ||||
| 		}, | ||||
| 		{ | ||||
| 			name:   "InvalidNodeInfo", | ||||
| 			reason: lifecycle.InvalidNodeInfo, | ||||
| 			wants: ` | ||||
|                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||
|                 # TYPE kubelet_admission_rejections_total counter | ||||
|                 kubelet_admission_rejections_total{reason="InvalidNodeInfo"} 1 | ||||
|             `, | ||||
| 		}, | ||||
| 		{ | ||||
| 			name:   "InitContainerRestartPolicyForbidden", | ||||
| 			reason: lifecycle.InitContainerRestartPolicyForbidden, | ||||
| 			wants: ` | ||||
|                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||
|                 # TYPE kubelet_admission_rejections_total counter | ||||
|                 kubelet_admission_rejections_total{reason="InitContainerRestartPolicyForbidden"} 1 | ||||
|             `, | ||||
| 		}, | ||||
| 		{ | ||||
| 			name:   "UnexpectedAdmissionError", | ||||
| 			reason: lifecycle.UnexpectedAdmissionError, | ||||
| 			wants: ` | ||||
|                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||
|                 # TYPE kubelet_admission_rejections_total counter | ||||
|                 kubelet_admission_rejections_total{reason="UnexpectedAdmissionError"} 1 | ||||
|             `, | ||||
| 		}, | ||||
| 		{ | ||||
| 			name:   "UnknownReason", | ||||
| 			reason: lifecycle.UnknownReason, | ||||
| 			wants: ` | ||||
|                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||
|                 # TYPE kubelet_admission_rejections_total counter | ||||
|                 kubelet_admission_rejections_total{reason="UnknownReason"} 1 | ||||
|             `, | ||||
| 		}, | ||||
| 		{ | ||||
| 			name:   "UnexpectedPredicateFailureType", | ||||
| 			reason: lifecycle.UnexpectedPredicateFailureType, | ||||
| 			wants: ` | ||||
|                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||
|                 # TYPE kubelet_admission_rejections_total counter | ||||
|                 kubelet_admission_rejections_total{reason="UnexpectedPredicateFailureType"} 1 | ||||
|             `, | ||||
| 		}, | ||||
| 		{ | ||||
| 			name:   "node(s) had taints that the pod didn't tolerate", | ||||
| 			reason: tainttoleration.ErrReasonNotMatch, | ||||
| 			wants: ` | ||||
|                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||
|                 # TYPE kubelet_admission_rejections_total counter | ||||
|                 kubelet_admission_rejections_total{reason="node(s) had taints that the pod didn't tolerate"} 1 | ||||
|             `, | ||||
| 		}, | ||||
| 		{ | ||||
| 			name:   "Evicted", | ||||
| 			reason: eviction.Reason, | ||||
| 			wants: ` | ||||
|                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||
|                 # TYPE kubelet_admission_rejections_total counter | ||||
|                 kubelet_admission_rejections_total{reason="Evicted"} 1 | ||||
|             `, | ||||
| 		}, | ||||
| 		{ | ||||
| 			name:   "SysctlForbidden", | ||||
| 			reason: sysctl.ForbiddenReason, | ||||
| 			wants: ` | ||||
|                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||
|                 # TYPE kubelet_admission_rejections_total counter | ||||
|                 kubelet_admission_rejections_total{reason="SysctlForbidden"} 1 | ||||
|             `, | ||||
| 		}, | ||||
| 		{ | ||||
| 			name:   "TopologyAffinityError", | ||||
| 			reason: topologymanager.ErrorTopologyAffinity, | ||||
| 			wants: ` | ||||
|                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||
|                 # TYPE kubelet_admission_rejections_total counter | ||||
|                 kubelet_admission_rejections_total{reason="TopologyAffinityError"} 1 | ||||
|             `, | ||||
| 		}, | ||||
| 		{ | ||||
| 			name:   "NodeShutdown", | ||||
| 			reason: nodeshutdown.NodeShutdownNotAdmittedReason, | ||||
| 			wants: ` | ||||
|                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||
|                 # TYPE kubelet_admission_rejections_total counter | ||||
|                 kubelet_admission_rejections_total{reason="NodeShutdown"} 1 | ||||
|             `, | ||||
| 		}, | ||||
| 		{ | ||||
| 			name:   "OutOfcpu", | ||||
| 			reason: "OutOfcpu", | ||||
| 			wants: ` | ||||
|                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||
|                 # TYPE kubelet_admission_rejections_total counter | ||||
|                 kubelet_admission_rejections_total{reason="OutOfcpu"} 1 | ||||
|             `, | ||||
| 		}, | ||||
| 		{ | ||||
| 			name:   "OutOfmemory", | ||||
| 			reason: "OutOfmemory", | ||||
| 			wants: ` | ||||
|                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||
|                 # TYPE kubelet_admission_rejections_total counter | ||||
|                 kubelet_admission_rejections_total{reason="OutOfmemory"} 1 | ||||
|             `, | ||||
| 		}, | ||||
| 		{ | ||||
| 			name:   "OutOfephemeral-storage", | ||||
| 			reason: "OutOfephemeral-storage", | ||||
| 			wants: ` | ||||
|                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||
|                 # TYPE kubelet_admission_rejections_total counter | ||||
|                 kubelet_admission_rejections_total{reason="OutOfephemeral-storage"} 1 | ||||
|             `, | ||||
| 		}, | ||||
| 		{ | ||||
| 			name:   "OutOfpods", | ||||
| 			reason: "OutOfpods", | ||||
| 			wants: ` | ||||
|                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||
|                 # TYPE kubelet_admission_rejections_total counter | ||||
|                 kubelet_admission_rejections_total{reason="OutOfpods"} 1 | ||||
|             `, | ||||
| 		}, | ||||
| 		{ | ||||
| 			name:   "OutOfgpu", | ||||
| 			reason: "OutOfgpu", | ||||
| 			wants: ` | ||||
|                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||
|                 # TYPE kubelet_admission_rejections_total counter | ||||
|                 kubelet_admission_rejections_total{reason="OutOfExtendedResources"} 1 | ||||
|             `, | ||||
| 		}, | ||||
| 		{ | ||||
| 			name:   "OtherReason", | ||||
| 			reason: "OtherReason", | ||||
| 			wants: ` | ||||
|                 # HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet. | ||||
|                 # TYPE kubelet_admission_rejections_total counter | ||||
|                 kubelet_admission_rejections_total{reason="Other"} 1 | ||||
|             `, | ||||
| 		}, | ||||
| 	} | ||||
|  | ||||
| 	// Run tests. | ||||
| 	for _, tc := range testCases { | ||||
| 		t.Run(tc.name, func(t *testing.T) { | ||||
| 			// Clear the metrics after the test. | ||||
| 			metrics.AdmissionRejectionsTotal.Reset() | ||||
|  | ||||
| 			// Call the function. | ||||
| 			recordAdmissionRejection(tc.reason) | ||||
|  | ||||
| 			if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(tc.wants), "kubelet_admission_rejections_total"); err != nil { | ||||
| 				t.Error(err) | ||||
| 			} | ||||
| 		}) | ||||
| 	} | ||||
| } | ||||
|   | ||||
| @@ -44,6 +44,8 @@ import ( | ||||
|  | ||||
| const ( | ||||
| 	maxRespBodyLength = 10 * 1 << 10 // 10KB | ||||
|  | ||||
| 	AppArmorNotAdmittedReason = "AppArmor" | ||||
| ) | ||||
|  | ||||
| type handlerRunner struct { | ||||
| @@ -224,7 +226,7 @@ func (a *appArmorAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult { | ||||
| 	} | ||||
| 	return PodAdmitResult{ | ||||
| 		Admit:   false, | ||||
| 		Reason:  "AppArmor", | ||||
| 		Reason:  AppArmorNotAdmittedReason, | ||||
| 		Message: fmt.Sprintf("Cannot enforce AppArmor: %v", err), | ||||
| 	} | ||||
| } | ||||
|   | ||||
| @@ -32,6 +32,52 @@ import ( | ||||
| 	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration" | ||||
| ) | ||||
|  | ||||
| const ( | ||||
| 	// PodOSSelectorNodeLabelDoesNotMatch is used to denote that the pod was | ||||
| 	// rejected admission to the node because the pod's node selector | ||||
| 	// corresponding to kubernetes.io/os label didn't match the node label. | ||||
| 	PodOSSelectorNodeLabelDoesNotMatch = "PodOSSelectorNodeLabelDoesNotMatch" | ||||
|  | ||||
| 	// PodOSNotSupported is used to denote that the pod was rejected admission | ||||
| 	// to the node because the pod's OS field didn't match the node OS. | ||||
| 	PodOSNotSupported = "PodOSNotSupported" | ||||
|  | ||||
| 	// InvalidNodeInfo is used to denote that the pod was rejected admission | ||||
| 	// to the node because the kubelet was unable to retrieve the node info. | ||||
| 	InvalidNodeInfo = "InvalidNodeInfo" | ||||
|  | ||||
| 	// InitContainerRestartPolicyForbidden is used to denote that the pod was | ||||
| 	// rejected admission to the node because it uses a restart policy other | ||||
| 	// than Always for some of its init containers. | ||||
| 	InitContainerRestartPolicyForbidden = "InitContainerRestartPolicyForbidden" | ||||
|  | ||||
| 	// UnexpectedAdmissionError is used to denote that the pod was rejected | ||||
| 	// admission to the node because of an error during admission that could not | ||||
| 	// be categorized. | ||||
| 	UnexpectedAdmissionError = "UnexpectedAdmissionError" | ||||
|  | ||||
| 	// UnknownReason is used to denote that the pod was rejected admission to | ||||
| 	// the node because a predicate failed for a reason that could not be | ||||
| 	// determined. | ||||
| 	UnknownReason = "UnknownReason" | ||||
|  | ||||
| 	// UnexpectedPredicateFailureType is used to denote that the pod was | ||||
| 	// rejected admission to the node because a predicate returned a reason | ||||
| 	// object that was not an InsufficientResourceError or a PredicateFailureError. | ||||
| 	UnexpectedPredicateFailureType = "UnexpectedPredicateFailureType" | ||||
|  | ||||
| 	// InsufficientResourcePrefix is the prefix of the admission reason used | ||||
| 	// when the kubelet rejects a pod because of insufficient resources. | ||||
| 	InsufficientResourcePrefix = "OutOf" | ||||
|  | ||||
| 	// These reasons are used to denote that the pod was rejected admission | ||||
| 	// to the node because there are not enough resources to run the pod. | ||||
| 	OutOfCPU              = "OutOfcpu" | ||||
| 	OutOfMemory           = "OutOfmemory" | ||||
| 	OutOfEphemeralStorage = "OutOfephemeral-storage" | ||||
| 	OutOfPods             = "OutOfpods" | ||||
| ) | ||||
|  | ||||
| type getNodeAnyWayFuncType func() (*v1.Node, error) | ||||
|  | ||||
| type pluginResourceUpdateFuncType func(*schedulerframework.NodeInfo, *PodAdmitAttributes) error | ||||
| @@ -66,7 +112,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult | ||||
| 		klog.ErrorS(err, "Cannot get Node info") | ||||
| 		return PodAdmitResult{ | ||||
| 			Admit:   false, | ||||
| 			Reason:  "InvalidNodeInfo", | ||||
| 			Reason:  InvalidNodeInfo, | ||||
| 			Message: "Kubelet cannot get node info.", | ||||
| 		} | ||||
| 	} | ||||
| @@ -76,14 +122,14 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult | ||||
| 	if rejectPodAdmissionBasedOnOSSelector(admitPod, node) { | ||||
| 		return PodAdmitResult{ | ||||
| 			Admit:   false, | ||||
| 			Reason:  "PodOSSelectorNodeLabelDoesNotMatch", | ||||
| 			Reason:  PodOSSelectorNodeLabelDoesNotMatch, | ||||
| 			Message: "Failed to admit pod as the `kubernetes.io/os` label doesn't match node label", | ||||
| 		} | ||||
| 	} | ||||
| 	if rejectPodAdmissionBasedOnOSField(admitPod) { | ||||
| 		return PodAdmitResult{ | ||||
| 			Admit:   false, | ||||
| 			Reason:  "PodOSNotSupported", | ||||
| 			Reason:  PodOSNotSupported, | ||||
| 			Message: "Failed to admit pod as the OS field doesn't match node OS", | ||||
| 		} | ||||
| 	} | ||||
| @@ -100,7 +146,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult | ||||
| 				klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "message", message) | ||||
| 				return PodAdmitResult{ | ||||
| 					Admit:   false, | ||||
| 					Reason:  "InitContainerRestartPolicyForbidden", | ||||
| 					Reason:  InitContainerRestartPolicyForbidden, | ||||
| 					Message: message, | ||||
| 				} | ||||
| 			} | ||||
| @@ -113,7 +159,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult | ||||
| 		klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "message", message) | ||||
| 		return PodAdmitResult{ | ||||
| 			Admit:   false, | ||||
| 			Reason:  "UnexpectedAdmissionError", | ||||
| 			Reason:  UnexpectedAdmissionError, | ||||
| 			Message: message, | ||||
| 		} | ||||
| 	} | ||||
| @@ -138,7 +184,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult | ||||
| 			klog.InfoS("Failed to admit pod, unexpected error while attempting to recover from admission failure", "pod", klog.KObj(admitPod), "err", err) | ||||
| 			return PodAdmitResult{ | ||||
| 				Admit:   fit, | ||||
| 				Reason:  "UnexpectedAdmissionError", | ||||
| 				Reason:  UnexpectedAdmissionError, | ||||
| 				Message: message, | ||||
| 			} | ||||
| 		} | ||||
| @@ -151,7 +197,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult | ||||
| 			klog.InfoS("Failed to admit pod: GeneralPredicates failed due to unknown reason, which is unexpected", "pod", klog.KObj(admitPod)) | ||||
| 			return PodAdmitResult{ | ||||
| 				Admit:   fit, | ||||
| 				Reason:  "UnknownReason", | ||||
| 				Reason:  UnknownReason, | ||||
| 				Message: message, | ||||
| 			} | ||||
| 		} | ||||
| @@ -163,11 +209,22 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult | ||||
| 			message = re.Error() | ||||
| 			klog.V(2).InfoS("Predicate failed on Pod", "pod", klog.KObj(admitPod), "err", message) | ||||
| 		case *InsufficientResourceError: | ||||
| 			reason = fmt.Sprintf("OutOf%s", re.ResourceName) | ||||
| 			switch re.ResourceName { | ||||
| 			case v1.ResourceCPU: | ||||
| 				reason = OutOfCPU | ||||
| 			case v1.ResourceMemory: | ||||
| 				reason = OutOfMemory | ||||
| 			case v1.ResourceEphemeralStorage: | ||||
| 				reason = OutOfEphemeralStorage | ||||
| 			case v1.ResourcePods: | ||||
| 				reason = OutOfPods | ||||
| 			default: | ||||
| 				reason = fmt.Sprintf("%s%s", InsufficientResourcePrefix, re.ResourceName) | ||||
| 			} | ||||
| 			message = re.Error() | ||||
| 			klog.V(2).InfoS("Predicate failed on Pod", "pod", klog.KObj(admitPod), "err", message) | ||||
| 		default: | ||||
| 			reason = "UnexpectedPredicateFailureType" | ||||
| 			reason = UnexpectedPredicateFailureType | ||||
| 			message = fmt.Sprintf("GeneralPredicates failed due to %v, which is unexpected.", r) | ||||
| 			klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "err", message) | ||||
| 		} | ||||
|   | ||||
| @@ -149,6 +149,9 @@ const ( | ||||
|  | ||||
| 	AlignedPhysicalCPU = "physical_cpu" | ||||
| 	AlignedNUMANode    = "numa_node" | ||||
|  | ||||
| 	// Metrics to track kubelet admission rejections. | ||||
| 	AdmissionRejectionsTotalKey = "admission_rejections_total" | ||||
| ) | ||||
|  | ||||
| type imageSizeBucket struct { | ||||
| @@ -994,6 +997,17 @@ var ( | ||||
| 		}, | ||||
| 		[]string{"driver_name", "method_name", "grpc_status_code"}, | ||||
| 	) | ||||
|  | ||||
| 	// AdmissionRejectionsTotal counts pod admission rejections by the kubelet. Currently it is only recorded for pod additions. | ||||
| 	AdmissionRejectionsTotal = metrics.NewCounterVec( | ||||
| 		&metrics.CounterOpts{ | ||||
| 			Subsystem:      KubeletSubsystem, | ||||
| 			Name:           AdmissionRejectionsTotalKey, | ||||
| 			Help:           "Cumulative number pod admission rejections by the Kubelet.", | ||||
| 			StabilityLevel: metrics.ALPHA, | ||||
| 		}, | ||||
| 		[]string{"reason"}, | ||||
| 	) | ||||
| ) | ||||
|  | ||||
| var registerMetrics sync.Once | ||||
| @@ -1091,6 +1105,8 @@ func Register(collectors ...metrics.StableCollector) { | ||||
| 			legacyregistry.MustRegister(DRAOperationsDuration) | ||||
| 			legacyregistry.MustRegister(DRAGRPCOperationsDuration) | ||||
| 		} | ||||
|  | ||||
| 		legacyregistry.MustRegister(AdmissionRejectionsTotal) | ||||
| 	}) | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -83,6 +83,10 @@ func (managerStub) ShutdownStatus() error { | ||||
| } | ||||
|  | ||||
| const ( | ||||
| 	NodeShutdownNotAdmittedReason  = "NodeShutdown" | ||||
| 	nodeShutdownNotAdmittedMessage = "Pod was rejected as the node is shutting down." | ||||
| 	localStorageStateFile          = "graceful_node_shutdown_state" | ||||
|  | ||||
| 	nodeShutdownReason  = "Terminated" | ||||
| 	nodeShutdownMessage = "Pod was terminated in response to imminent node shutdown." | ||||
| ) | ||||
|   | ||||
| @@ -40,10 +40,7 @@ import ( | ||||
| ) | ||||
|  | ||||
| const ( | ||||
| 	nodeShutdownNotAdmittedReason  = "NodeShutdown" | ||||
| 	nodeShutdownNotAdmittedMessage = "Pod was rejected as the node is shutting down." | ||||
| 	dbusReconnectPeriod = 1 * time.Second | ||||
| 	localStorageStateFile          = "graceful_node_shutdown_state" | ||||
| ) | ||||
|  | ||||
| var systemDbus = func() (dbusInhibiter, error) { | ||||
| @@ -123,7 +120,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd | ||||
| 	if nodeShuttingDown { | ||||
| 		return lifecycle.PodAdmitResult{ | ||||
| 			Admit:   false, | ||||
| 			Reason:  nodeShutdownNotAdmittedReason, | ||||
| 			Reason:  NodeShutdownNotAdmittedReason, | ||||
| 			Message: nodeShutdownNotAdmittedMessage, | ||||
| 		} | ||||
| 	} | ||||
|   | ||||
| @@ -51,12 +51,6 @@ const ( | ||||
| 	shutdownOrderStringValue = "PreshutdownOrder" | ||||
| ) | ||||
|  | ||||
| const ( | ||||
| 	nodeShutdownNotAdmittedReason  = "NodeShutdown" | ||||
| 	nodeShutdownNotAdmittedMessage = "Pod was rejected as the node is shutting down." | ||||
| 	localStorageStateFile          = "graceful_node_shutdown_state" | ||||
| ) | ||||
|  | ||||
| // managerImpl has functions that can be used to interact with the Node Shutdown Manager. | ||||
| type managerImpl struct { | ||||
| 	logger       klog.Logger | ||||
| @@ -120,7 +114,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd | ||||
| 	if nodeShuttingDown { | ||||
| 		return lifecycle.PodAdmitResult{ | ||||
| 			Admit:   false, | ||||
| 			Reason:  nodeShutdownNotAdmittedReason, | ||||
| 			Reason:  NodeShutdownNotAdmittedReason, | ||||
| 			Message: nodeShutdownNotAdmittedMessage, | ||||
| 		} | ||||
| 	} | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Anish Shah
					Anish Shah