// Mirror of https://github.com/optim-enterprises-bv/kubernetes.git
// Synced 2025-11-04 12:18:16 +00:00
/*
Copyright 2014 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package predicates declares the scheduler fit predicates: functions that
// decide whether a pod can be placed on a given node.
package predicates

import (
	"fmt"

	"k8s.io/klog"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/sets"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
	"k8s.io/kubernetes/pkg/features"
	pluginhelper "k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
	schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo"
	schedutil "k8s.io/kubernetes/pkg/scheduler/util"
)
// Registered predicate names. These string keys identify each predicate in
// scheduler policy configuration and in the ordering list below.
const (
	// MatchInterPodAffinityPred defines the name of predicate MatchInterPodAffinity.
	MatchInterPodAffinityPred = "MatchInterPodAffinity"
	// CheckVolumeBindingPred defines the name of predicate CheckVolumeBinding.
	CheckVolumeBindingPred = "CheckVolumeBinding"
	// GeneralPred defines the name of predicate GeneralPredicates.
	GeneralPred = "GeneralPredicates"
	// HostNamePred defines the name of predicate HostName.
	HostNamePred = "HostName"
	// PodFitsHostPortsPred defines the name of predicate PodFitsHostPorts.
	PodFitsHostPortsPred = "PodFitsHostPorts"
	// MatchNodeSelectorPred defines the name of predicate MatchNodeSelector.
	MatchNodeSelectorPred = "MatchNodeSelector"
	// PodFitsResourcesPred defines the name of predicate PodFitsResources.
	PodFitsResourcesPred = "PodFitsResources"
	// NoDiskConflictPred defines the name of predicate NoDiskConflict.
	NoDiskConflictPred = "NoDiskConflict"
	// PodToleratesNodeTaintsPred defines the name of predicate PodToleratesNodeTaints.
	PodToleratesNodeTaintsPred = "PodToleratesNodeTaints"
	// CheckNodeUnschedulablePred defines the name of predicate CheckNodeUnschedulablePredicate.
	CheckNodeUnschedulablePred = "CheckNodeUnschedulable"
	// CheckNodeLabelPresencePred defines the name of predicate CheckNodeLabelPresence.
	CheckNodeLabelPresencePred = "CheckNodeLabelPresence"
	// CheckServiceAffinityPred defines the name of predicate checkServiceAffinity.
	CheckServiceAffinityPred = "CheckServiceAffinity"
	// MaxEBSVolumeCountPred defines the name of predicate MaxEBSVolumeCount.
	// DEPRECATED
	// All cloudprovider specific predicates are deprecated in favour of MaxCSIVolumeCountPred.
	MaxEBSVolumeCountPred = "MaxEBSVolumeCount"
	// MaxGCEPDVolumeCountPred defines the name of predicate MaxGCEPDVolumeCount.
	// DEPRECATED
	// All cloudprovider specific predicates are deprecated in favour of MaxCSIVolumeCountPred.
	MaxGCEPDVolumeCountPred = "MaxGCEPDVolumeCount"
	// MaxAzureDiskVolumeCountPred defines the name of predicate MaxAzureDiskVolumeCount.
	// DEPRECATED
	// All cloudprovider specific predicates are deprecated in favour of MaxCSIVolumeCountPred.
	MaxAzureDiskVolumeCountPred = "MaxAzureDiskVolumeCount"
	// MaxCinderVolumeCountPred defines the name of predicate MaxCinderDiskVolumeCount.
	// DEPRECATED
	// All cloudprovider specific predicates are deprecated in favour of MaxCSIVolumeCountPred.
	MaxCinderVolumeCountPred = "MaxCinderVolumeCount"
	// MaxCSIVolumeCountPred defines the predicate that decides how many CSI volumes should be attached.
	MaxCSIVolumeCountPred = "MaxCSIVolumeCountPred"
	// NoVolumeZoneConflictPred defines the name of predicate NoVolumeZoneConflict.
	NoVolumeZoneConflictPred = "NoVolumeZoneConflict"
	// EvenPodsSpreadPred defines the name of predicate EvenPodsSpread.
	EvenPodsSpreadPred = "EvenPodsSpread"
)
// IMPORTANT NOTE for predicate developers:
// We are using cached predicate result for pods belonging to the same equivalence class.
// So when updating an existing predicate, you should consider whether your change will introduce new
// dependency to attributes of any API object like Pod, Node, Service etc.
// If yes, you are expected to invalidate the cached predicate result for related API object change.
// For example:
// https://github.com/kubernetes/kubernetes/blob/36a218e/plugin/pkg/scheduler/factory/factory.go#L422

// IMPORTANT NOTE: this list contains the ordering of the predicates, if you develop a new predicate
// it is mandatory to add its name to this list.
// Otherwise it won't be processed, see generic_scheduler#podFitsOnNode().
// The order is based on the restrictiveness & complexity of predicates.
// Design doc: https://github.com/kubernetes/community/blob/master/contributors/design-proposals/scheduling/predicates-ordering.md
var (
	// predicatesOrdering is the canonical evaluation order; cheap/restrictive
	// predicates come first so failures short-circuit expensive checks.
	predicatesOrdering = []string{CheckNodeUnschedulablePred,
		GeneralPred, HostNamePred, PodFitsHostPortsPred,
		MatchNodeSelectorPred, PodFitsResourcesPred, NoDiskConflictPred,
		PodToleratesNodeTaintsPred, CheckNodeLabelPresencePred,
		CheckServiceAffinityPred, MaxEBSVolumeCountPred, MaxGCEPDVolumeCountPred, MaxCSIVolumeCountPred,
		MaxAzureDiskVolumeCountPred, MaxCinderVolumeCountPred, CheckVolumeBindingPred, NoVolumeZoneConflictPred,
		EvenPodsSpreadPred, MatchInterPodAffinityPred}
)
// Ordering returns the ordering of predicates.
 | 
						|
func Ordering() []string {
 | 
						|
	return predicatesOrdering
 | 
						|
}
 | 
						|
 | 
						|
// FitPredicate is a function that indicates if a pod fits into an existing node.
// The failure information is given by the error.
// The bool reports fit/no-fit; the reasons slice explains a no-fit verdict.
type FitPredicate func(pod *v1.Pod, meta Metadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error)
// GetResourceRequest returns a *schedulernodeinfo.Resource that covers the largest
 | 
						|
// width in each resource dimension. Because init-containers run sequentially, we collect
 | 
						|
// the max in each dimension iteratively. In contrast, we sum the resource vectors for
 | 
						|
// regular containers since they run simultaneously.
 | 
						|
//
 | 
						|
// If Pod Overhead is specified and the feature gate is set, the resources defined for Overhead
 | 
						|
// are added to the calculated Resource request sum
 | 
						|
//
 | 
						|
// Example:
 | 
						|
//
 | 
						|
// Pod:
 | 
						|
//   InitContainers
 | 
						|
//     IC1:
 | 
						|
//       CPU: 2
 | 
						|
//       Memory: 1G
 | 
						|
//     IC2:
 | 
						|
//       CPU: 2
 | 
						|
//       Memory: 3G
 | 
						|
//   Containers
 | 
						|
//     C1:
 | 
						|
//       CPU: 2
 | 
						|
//       Memory: 1G
 | 
						|
//     C2:
 | 
						|
//       CPU: 1
 | 
						|
//       Memory: 1G
 | 
						|
//
 | 
						|
// Result: CPU: 3, Memory: 3G
 | 
						|
func GetResourceRequest(pod *v1.Pod) *schedulernodeinfo.Resource {
 | 
						|
	result := &schedulernodeinfo.Resource{}
 | 
						|
	for _, container := range pod.Spec.Containers {
 | 
						|
		result.Add(container.Resources.Requests)
 | 
						|
	}
 | 
						|
 | 
						|
	// take max_resource(sum_pod, any_init_container)
 | 
						|
	for _, container := range pod.Spec.InitContainers {
 | 
						|
		result.SetMaxResource(container.Resources.Requests)
 | 
						|
	}
 | 
						|
 | 
						|
	// If Overhead is being utilized, add to the total requests for the pod
 | 
						|
	if pod.Spec.Overhead != nil && utilfeature.DefaultFeatureGate.Enabled(features.PodOverhead) {
 | 
						|
		result.Add(pod.Spec.Overhead)
 | 
						|
	}
 | 
						|
 | 
						|
	return result
 | 
						|
}
 | 
						|
 | 
						|
func podName(pod *v1.Pod) string {
 | 
						|
	return pod.Namespace + "/" + pod.Name
 | 
						|
}
 | 
						|
 | 
						|
// PodFitsResources is a wrapper around PodFitsResourcesPredicate that implements FitPredicate interface.
// TODO(#85822): remove this function once predicate registration logic is deleted.
func PodFitsResources(pod *v1.Pod, _ Metadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
	// nil podRequest / nil ignoredExtendedResources: the predicate recomputes
	// the pod's request itself and ignores no extended resources.
	return PodFitsResourcesPredicate(pod, nil, nil, nodeInfo)
}
// PodFitsResourcesPredicate checks if a node has sufficient resources, such as cpu, memory, gpu, opaque int resources etc to run a pod.
 | 
						|
// First return value indicates whether a node has sufficient resources to run a pod while the second return value indicates the
 | 
						|
// predicate failure reasons if the node has insufficient resources to run the pod
 | 
						|
func PodFitsResourcesPredicate(pod *v1.Pod, podRequest *schedulernodeinfo.Resource, ignoredExtendedResources sets.String, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
 | 
						|
	node := nodeInfo.Node()
 | 
						|
	if node == nil {
 | 
						|
		return false, nil, fmt.Errorf("node not found")
 | 
						|
	}
 | 
						|
 | 
						|
	var predicateFails []PredicateFailureReason
 | 
						|
	allowedPodNumber := nodeInfo.AllowedPodNumber()
 | 
						|
	if len(nodeInfo.Pods())+1 > allowedPodNumber {
 | 
						|
		predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourcePods, 1, int64(len(nodeInfo.Pods())), int64(allowedPodNumber)))
 | 
						|
	}
 | 
						|
 | 
						|
	if ignoredExtendedResources == nil {
 | 
						|
		ignoredExtendedResources = sets.NewString()
 | 
						|
	}
 | 
						|
 | 
						|
	if podRequest == nil {
 | 
						|
		podRequest = GetResourceRequest(pod)
 | 
						|
	}
 | 
						|
	if podRequest.MilliCPU == 0 &&
 | 
						|
		podRequest.Memory == 0 &&
 | 
						|
		podRequest.EphemeralStorage == 0 &&
 | 
						|
		len(podRequest.ScalarResources) == 0 {
 | 
						|
		return len(predicateFails) == 0, predicateFails, nil
 | 
						|
	}
 | 
						|
 | 
						|
	allocatable := nodeInfo.AllocatableResource()
 | 
						|
	if allocatable.MilliCPU < podRequest.MilliCPU+nodeInfo.RequestedResource().MilliCPU {
 | 
						|
		predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceCPU, podRequest.MilliCPU, nodeInfo.RequestedResource().MilliCPU, allocatable.MilliCPU))
 | 
						|
	}
 | 
						|
	if allocatable.Memory < podRequest.Memory+nodeInfo.RequestedResource().Memory {
 | 
						|
		predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceMemory, podRequest.Memory, nodeInfo.RequestedResource().Memory, allocatable.Memory))
 | 
						|
	}
 | 
						|
	if allocatable.EphemeralStorage < podRequest.EphemeralStorage+nodeInfo.RequestedResource().EphemeralStorage {
 | 
						|
		predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceEphemeralStorage, podRequest.EphemeralStorage, nodeInfo.RequestedResource().EphemeralStorage, allocatable.EphemeralStorage))
 | 
						|
	}
 | 
						|
 | 
						|
	for rName, rQuant := range podRequest.ScalarResources {
 | 
						|
		if v1helper.IsExtendedResourceName(rName) {
 | 
						|
			// If this resource is one of the extended resources that should be
 | 
						|
			// ignored, we will skip checking it.
 | 
						|
			if ignoredExtendedResources.Has(string(rName)) {
 | 
						|
				continue
 | 
						|
			}
 | 
						|
		}
 | 
						|
		if allocatable.ScalarResources[rName] < rQuant+nodeInfo.RequestedResource().ScalarResources[rName] {
 | 
						|
			predicateFails = append(predicateFails, NewInsufficientResourceError(rName, podRequest.ScalarResources[rName], nodeInfo.RequestedResource().ScalarResources[rName], allocatable.ScalarResources[rName]))
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	if klog.V(10) && len(predicateFails) == 0 {
 | 
						|
		// We explicitly don't do klog.V(10).Infof() to avoid computing all the parameters if this is
 | 
						|
		// not logged. There is visible performance gain from it.
 | 
						|
		klog.Infof("Schedule Pod %+v on Node %+v is allowed, Node is running only %v out of %v Pods.",
 | 
						|
			podName(pod), node.Name, len(nodeInfo.Pods()), allowedPodNumber)
 | 
						|
	}
 | 
						|
	return len(predicateFails) == 0, predicateFails, nil
 | 
						|
}
 | 
						|
 | 
						|
// PodMatchNodeSelector checks if a pod node selector matches the node label.
 | 
						|
func PodMatchNodeSelector(pod *v1.Pod, meta Metadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
 | 
						|
	node := nodeInfo.Node()
 | 
						|
	if node == nil {
 | 
						|
		return false, nil, fmt.Errorf("node not found")
 | 
						|
	}
 | 
						|
	if pluginhelper.PodMatchesNodeSelectorAndAffinityTerms(pod, node) {
 | 
						|
		return true, nil, nil
 | 
						|
	}
 | 
						|
	return false, []PredicateFailureReason{ErrNodeSelectorNotMatch}, nil
 | 
						|
}
 | 
						|
 | 
						|
// PodFitsHost checks if a pod spec node name matches the current node.
 | 
						|
func PodFitsHost(pod *v1.Pod, meta Metadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
 | 
						|
	if len(pod.Spec.NodeName) == 0 {
 | 
						|
		return true, nil, nil
 | 
						|
	}
 | 
						|
	node := nodeInfo.Node()
 | 
						|
	if node == nil {
 | 
						|
		return false, nil, fmt.Errorf("node not found")
 | 
						|
	}
 | 
						|
	if pod.Spec.NodeName == node.Name {
 | 
						|
		return true, nil, nil
 | 
						|
	}
 | 
						|
	return false, []PredicateFailureReason{ErrPodNotMatchHostName}, nil
 | 
						|
}
 | 
						|
 | 
						|
// PodFitsHostPorts is a wrapper around PodFitsHostPortsPredicate. This is needed until
// we are able to get rid of the FitPredicate function signature.
// TODO(#85822): remove this function once predicate registration logic is deleted.
func PodFitsHostPorts(pod *v1.Pod, _ Metadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
	// nil meta: the predicate computes the wanted ports from the pod spec.
	return PodFitsHostPortsPredicate(pod, nil, nodeInfo)
}
// PodFitsHostPortsPredicate checks if a node has free ports for the requested pod ports.
 | 
						|
func PodFitsHostPortsPredicate(pod *v1.Pod, meta []*v1.ContainerPort, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
 | 
						|
	wantPorts := meta
 | 
						|
	if wantPorts == nil {
 | 
						|
		// Fallback to computing it.
 | 
						|
		wantPorts = schedutil.GetContainerPorts(pod)
 | 
						|
	}
 | 
						|
	if len(wantPorts) == 0 {
 | 
						|
		return true, nil, nil
 | 
						|
	}
 | 
						|
 | 
						|
	existingPorts := nodeInfo.UsedPorts()
 | 
						|
 | 
						|
	// try to see whether existingPorts and  wantPorts will conflict or not
 | 
						|
	if portsConflict(existingPorts, wantPorts) {
 | 
						|
		return false, []PredicateFailureReason{ErrPodNotFitsHostPorts}, nil
 | 
						|
	}
 | 
						|
 | 
						|
	return true, nil, nil
 | 
						|
}
 | 
						|
 | 
						|
// GeneralPredicates checks a group of predicates that the kubelet cares about.
 | 
						|
// DEPRECATED: this exist only because kubelet uses it. We should change kubelet to execute the individual predicates it requires.
 | 
						|
func GeneralPredicates(pod *v1.Pod, meta Metadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
 | 
						|
	var predicateFails []PredicateFailureReason
 | 
						|
	for _, predicate := range []FitPredicate{PodFitsResources, PodFitsHost, PodFitsHostPorts, PodMatchNodeSelector} {
 | 
						|
		fit, reasons, err := predicate(pod, meta, nodeInfo)
 | 
						|
		if err != nil {
 | 
						|
			return false, predicateFails, err
 | 
						|
		}
 | 
						|
		if !fit {
 | 
						|
			predicateFails = append(predicateFails, reasons...)
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	return len(predicateFails) == 0, predicateFails, nil
 | 
						|
}
 |