implement inter pod topological affinity and anti-affinity

This commit is contained in:
Kevin
2016-05-04 06:50:31 +00:00
parent 28a8a23471
commit 82ba4f077e
46 changed files with 8302 additions and 814 deletions

View File

@@ -1159,11 +1159,109 @@ const (
NodeSelectorOpLt NodeSelectorOperator = "Lt"
)
// Affinity is a group of affinity scheduling rules, currently
// only node affinity, but in the future also inter-pod affinity.
// Affinity is a group of affinity scheduling rules.
type Affinity struct {
// Describes node affinity scheduling rules for the pod.
NodeAffinity *NodeAffinity `json:"nodeAffinity,omitempty"`
// Describes pod affinity scheduling rules (e.g. co-locate this pod in the same node, zone, etc. as some other pod(s)).
PodAffinity *PodAffinity `json:"podAffinity,omitempty"`
// Describes pod anti-affinity scheduling rules (e.g. avoid putting this pod in the same node, zone, etc. as some other pod(s)).
PodAntiAffinity *PodAntiAffinity `json:"podAntiAffinity,omitempty"`
}
// Pod affinity is a group of inter pod affinity scheduling rules.
type PodAffinity struct {
// NOT YET IMPLEMENTED. TODO: Uncomment field once it is implemented.
// If the affinity requirements specified by this field are not met at
// scheduling time, the pod will not be scheduled onto the node.
// If the affinity requirements specified by this field cease to be met
// at some point during pod execution (e.g. due to a pod label update), the
// system will try to eventually evict the pod from its node.
// When there are multiple elements, the lists of nodes corresponding to each
// podAffinityTerm are intersected, i.e. all terms must be satisfied.
// RequiredDuringSchedulingRequiredDuringExecution []PodAffinityTerm `json:"requiredDuringSchedulingRequiredDuringExecution,omitempty"`
// If the affinity requirements specified by this field are not met at
// scheduling time, the pod will not be scheduled onto the node.
// If the affinity requirements specified by this field cease to be met
// at some point during pod execution (e.g. due to a pod label update), the
// system may or may not try to eventually evict the pod from its node.
// When there are multiple elements, the lists of nodes corresponding to each
// podAffinityTerm are intersected, i.e. all terms must be satisfied.
RequiredDuringSchedulingIgnoredDuringExecution []PodAffinityTerm `json:"requiredDuringSchedulingIgnoredDuringExecution,omitempty"`
// The scheduler will prefer to schedule pods to nodes that satisfy
// the affinity expressions specified by this field, but it may choose
// a node that violates one or more of the expressions. The node that is
// most preferred is the one with the greatest sum of weights, i.e.
// for each node that meets all of the scheduling requirements (resource
// request, requiredDuringScheduling affinity expressions, etc.),
// compute a sum by iterating through the elements of this field and adding
// "weight" to the sum if the node has pods which matches the corresponding podAffinityTerm; the
// node(s) with the highest sum are the most preferred.
PreferredDuringSchedulingIgnoredDuringExecution []WeightedPodAffinityTerm `json:"preferredDuringSchedulingIgnoredDuringExecution,omitempty"`
}
// Pod anti affinity is a group of inter pod anti affinity scheduling rules.
type PodAntiAffinity struct {
// NOT YET IMPLEMENTED. TODO: Uncomment field once it is implemented.
// If the anti-affinity requirements specified by this field are not met at
// scheduling time, the pod will not be scheduled onto the node.
// If the anti-affinity requirements specified by this field cease to be met
// at some point during pod execution (e.g. due to a pod label update), the
// system will try to eventually evict the pod from its node.
// When there are multiple elements, the lists of nodes corresponding to each
// podAffinityTerm are intersected, i.e. all terms must be satisfied.
// RequiredDuringSchedulingRequiredDuringExecution []PodAffinityTerm `json:"requiredDuringSchedulingRequiredDuringExecution,omitempty"`
// If the anti-affinity requirements specified by this field are not met at
// scheduling time, the pod will not be scheduled onto the node.
// If the anti-affinity requirements specified by this field cease to be met
// at some point during pod execution (e.g. due to a pod label update), the
// system may or may not try to eventually evict the pod from its node.
// When there are multiple elements, the lists of nodes corresponding to each
// podAffinityTerm are intersected, i.e. all terms must be satisfied.
RequiredDuringSchedulingIgnoredDuringExecution []PodAffinityTerm `json:"requiredDuringSchedulingIgnoredDuringExecution,omitempty"`
// The scheduler will prefer to schedule pods to nodes that satisfy
// the anti-affinity expressions specified by this field, but it may choose
// a node that violates one or more of the expressions. The node that is
// most preferred is the one with the greatest sum of weights, i.e.
// for each node that meets all of the scheduling requirements (resource
// request, requiredDuringScheduling anti-affinity expressions, etc.),
// compute a sum by iterating through the elements of this field and adding
// "weight" to the sum if the node has pods which matches the corresponding podAffinityTerm; the
// node(s) with the highest sum are the most preferred.
PreferredDuringSchedulingIgnoredDuringExecution []WeightedPodAffinityTerm `json:"preferredDuringSchedulingIgnoredDuringExecution,omitempty"`
}
// The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)
type WeightedPodAffinityTerm struct {
// weight associated with matching the corresponding podAffinityTerm,
// in the range 1-100.
Weight int `json:"weight"`
// Required. A pod affinity term, associated with the corresponding weight.
PodAffinityTerm PodAffinityTerm `json:"podAffinityTerm"`
}
// Defines a set of pods (namely those matching the labelSelector
// relative to the given namespace(s)) that this pod should be
// co-located (affinity) or not co-located (anti-affinity) with,
// where co-located is defined as running on a node whose value of
// the label with key <topologyKey> matches that of any node on which
// a pod of the set of pods is running.
type PodAffinityTerm struct {
// A label query over a set of resources, in this case pods.
LabelSelector *unversioned.LabelSelector `json:"labelSelector,omitempty"`
// namespaces specifies which namespaces the labelSelector applies to (matches against);
// nil list means "this pod's namespace," empty list means "all namespaces"
// The json tag here is not "omitempty" since we need to distinguish nil and empty.
// See https://golang.org/pkg/encoding/json/#Marshal for more details.
Namespaces []string `json:"namespaces"`
// This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching
// the labelSelector in the specified namespaces, where co-located is defined as running on a node
// whose value of the label with key topologyKey matches that of any node on which any of the
// selected pods is running.
// For PreferredDuringScheduling pod anti-affinity, empty topologyKey is interpreted as "all topologies"
// ("all topologies" here means all the topologyKeys indicated by scheduler command-line argument --failure-domains);
// for affinity and for RequiredDuringScheduling pod anti-affinity, empty topologyKey is not allowed.
TopologyKey string `json:"topologyKey,omitempty"`
}
// Node affinity is a group of node affinity scheduling rules.
@@ -2604,4 +2702,14 @@ type RangeAllocation struct {
const (
// "default-scheduler" is the name of default scheduler.
DefaultSchedulerName = "default-scheduler"
// RequiredDuringScheduling affinity is not symmetric, but there is an implicit PreferredDuringScheduling affinity rule
// corresponding to every RequiredDuringScheduling affinity rule.
// When the --hard-pod-affinity-weight scheduler flag is not specified,
// DefaultHardPodAffinityWeight defines the weight of the implicit PreferredDuringScheduling affinity rule.
DefaultHardPodAffinitySymmetricWeight int = 1
// When the --failure-domains scheduler flag is not specified,
// DefaultFailureDomains defines the set of label keys used when TopologyKey is empty in PreferredDuringScheduling anti-affinity.
DefaultFailureDomains string = unversioned.LabelHostname + "," + unversioned.LabelZoneFailureDomain + "," + unversioned.LabelZoneRegion
)