Change eviction logic in NodeController and make it Zone-aware

@@ -67,9 +67,10 @@ const (
 type zoneState string
 
 const (
-    stateNormal              = zoneState("Normal")
-    stateFullSegmentation    = zoneState("FullSegmentation")
-    statePartialSegmentation = zoneState("PartialSegmentation")
+    stateInitial           = zoneState("Initial")
+    stateNormal            = zoneState("Normal")
+    stateFullDisruption    = zoneState("FullDisruption")
+    statePartialDisruption = zoneState("PartialDisruption")
 )
 
 type nodeStatusData struct {
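
The hunk above renames the Segmentation zone states to Disruption states and adds stateInitial for zones the controller has not classified yet; a zone's state is derived from the Ready conditions of its nodes by computeZoneStateFunc. The self-contained sketch below is illustration only, not code from this commit: classifyZone and its 50% cutoff are assumptions standing in for the real ComputeZoneState.

package main

import "fmt"

type zoneState string

const (
    stateNormal            = zoneState("Normal")
    stateFullDisruption    = zoneState("FullDisruption")
    statePartialDisruption = zoneState("PartialDisruption")
)

// classifyZone is a hypothetical stand-in for computeZoneStateFunc: it maps the
// counts of Ready and NotReady nodes in one zone onto the states above. The 50%
// cutoff is an assumption for illustration, not what ComputeZoneState uses.
func classifyZone(ready, notReady int) zoneState {
    switch {
    case ready == 0:
        return stateFullDisruption
    case notReady*2 > ready+notReady: // more than half the zone is NotReady (assumed cutoff)
        return statePartialDisruption
    default:
        return stateNormal
    }
}

func main() {
    fmt.Println(classifyZone(5, 0)) // Normal
    fmt.Println(classifyZone(2, 3)) // PartialDisruption
    fmt.Println(classifyZone(0, 5)) // FullDisruption
}
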
@@ -136,9 +137,11 @@ type NodeController struct {
     // allocate/recycle CIDRs for node if allocateNodeCIDRs == true
     cidrAllocator CIDRAllocator
 
-    forcefullyDeletePod       func(*api.Pod) error
-    nodeExistsInCloudProvider func(string) (bool, error)
-    computeZoneStateFunc      func(nodeConditions []*api.NodeCondition) zoneState
+    forcefullyDeletePod        func(*api.Pod) error
+    nodeExistsInCloudProvider  func(string) (bool, error)
+    computeZoneStateFunc       func(nodeConditions []*api.NodeCondition) zoneState
+    enterPartialDisruptionFunc func(nodeNum int, defaultQPS float32) float32
+    enterFullDisruptionFunc    func(nodeNum int, defaultQPS float32) float32
 
     zoneStates map[string]zoneState
 
@@ -192,28 +195,30 @@ func NewNodeController(
     }
 
     nc := &NodeController{
-        cloud:                     cloud,
-        knownNodeSet:              make(map[string]*api.Node),
-        kubeClient:                kubeClient,
-        recorder:                  recorder,
-        podEvictionTimeout:        podEvictionTimeout,
-        maximumGracePeriod:        5 * time.Minute,
-        zonePodEvictor:            make(map[string]*RateLimitedTimedQueue),
-        zoneTerminationEvictor:    make(map[string]*RateLimitedTimedQueue),
-        nodeStatusMap:             make(map[string]nodeStatusData),
-        nodeMonitorGracePeriod:    nodeMonitorGracePeriod,
-        nodeMonitorPeriod:         nodeMonitorPeriod,
-        nodeStartupGracePeriod:    nodeStartupGracePeriod,
-        lookupIP:                  net.LookupIP,
-        now:                       unversioned.Now,
-        clusterCIDR:               clusterCIDR,
-        serviceCIDR:               serviceCIDR,
-        allocateNodeCIDRs:         allocateNodeCIDRs,
-        forcefullyDeletePod:       func(p *api.Pod) error { return forcefullyDeletePod(kubeClient, p) },
-        nodeExistsInCloudProvider: func(nodeName string) (bool, error) { return nodeExistsInCloudProvider(cloud, nodeName) },
-        computeZoneStateFunc:      ComputeZoneState,
-        evictionLimiterQPS:        evictionLimiterQPS,
-        zoneStates:                make(map[string]zoneState),
+        cloud:                      cloud,
+        knownNodeSet:               make(map[string]*api.Node),
+        kubeClient:                 kubeClient,
+        recorder:                   recorder,
+        podEvictionTimeout:         podEvictionTimeout,
+        maximumGracePeriod:         5 * time.Minute,
+        zonePodEvictor:             make(map[string]*RateLimitedTimedQueue),
+        zoneTerminationEvictor:     make(map[string]*RateLimitedTimedQueue),
+        nodeStatusMap:              make(map[string]nodeStatusData),
+        nodeMonitorGracePeriod:     nodeMonitorGracePeriod,
+        nodeMonitorPeriod:          nodeMonitorPeriod,
+        nodeStartupGracePeriod:     nodeStartupGracePeriod,
+        lookupIP:                   net.LookupIP,
+        now:                        unversioned.Now,
+        clusterCIDR:                clusterCIDR,
+        serviceCIDR:                serviceCIDR,
+        allocateNodeCIDRs:          allocateNodeCIDRs,
+        forcefullyDeletePod:        func(p *api.Pod) error { return forcefullyDeletePod(kubeClient, p) },
+        nodeExistsInCloudProvider:  func(nodeName string) (bool, error) { return nodeExistsInCloudProvider(cloud, nodeName) },
+        enterPartialDisruptionFunc: ReducedQPSFunc,
+        enterFullDisruptionFunc:    HealthyQPSFunc,
+        computeZoneStateFunc:       ComputeZoneState,
+        evictionLimiterQPS:         evictionLimiterQPS,
+        zoneStates:                 make(map[string]zoneState),
     }
 
     podInformer.AddEventHandler(framework.ResourceEventHandlerFuncs{
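
The constructor above wires ReducedQPSFunc and HealthyQPSFunc as the defaults for the partial- and full-disruption hooks. Since both hooks are plain struct fields of type func(nodeNum int, defaultQPS float32) float32, alternative policies can be injected, for example in tests. The sketch below shows only that injection pattern; fakeController and its constant rate policies are assumptions and do not reproduce ReducedQPSFunc or HealthyQPSFunc.

package main

import "fmt"

// fakeController carries only the two hook fields added by this commit;
// the rest of NodeController is omitted for brevity.
type fakeController struct {
    enterPartialDisruptionFunc func(nodeNum int, defaultQPS float32) float32
    enterFullDisruptionFunc    func(nodeNum int, defaultQPS float32) float32
}

func main() {
    c := fakeController{
        // Assumed policy: slow evictions to a tenth of the default rate
        // while a zone is only partially disrupted.
        enterPartialDisruptionFunc: func(nodeNum int, defaultQPS float32) float32 {
            return defaultQPS / 10
        },
        // Assumed policy: keep the default rate when a single zone is
        // fully disrupted but other zones are still healthy.
        enterFullDisruptionFunc: func(nodeNum int, defaultQPS float32) float32 {
            return defaultQPS
        },
    }
    fmt.Println(c.enterPartialDisruptionFunc(40, 0.1)) // 0.01
    fmt.Println(c.enterFullDisruptionFunc(40, 0.1))    // 0.1
}
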
@@ -491,7 +496,7 @@ func (nc *NodeController) monitorNodeStatus() error {
                 "Skipping - no pods will be evicted.", node.Name)
             continue
         }
-        // We do not treat a master node as a part of the cluster for network segmentation checking.
+        // We do not treat a master node as a part of the cluster for network disruption checking.
         if !system.IsMasterNode(node) {
             zoneToNodeConditions[utilnode.GetZoneKey(node)] = append(zoneToNodeConditions[utilnode.GetZoneKey(node)], currentReadyCondition)
         }
@@ -550,39 +555,110 @@ func (nc *NodeController) monitorNodeStatus() error {
             }
         }
     }
 
-    for k, v := range zoneToNodeConditions {
-        newState := nc.computeZoneStateFunc(v)
-        if newState == nc.zoneStates[k] {
-            continue
-        }
-        if newState == stateFullSegmentation {
-            glog.V(2).Infof("NodeController is entering network segmentation mode in zone %v.", k)
-        } else if newState == stateNormal {
-            glog.V(2).Infof("NodeController exited network segmentation mode in zone %v.", k)
-        }
-        for i := range nodes.Items {
-            if utilnode.GetZoneKey(&nodes.Items[i]) == k {
-                if newState == stateFullSegmentation {
-                    // When zone is fully segmented we stop the eviction all together.
-                    nc.cancelPodEviction(&nodes.Items[i])
-                }
-                if newState == stateNormal && nc.zoneStates[k] == stateFullSegmentation {
-                    // When exiting segmentation mode update probe timestamps on all Nodes.
-                    now := nc.now()
-                    v := nc.nodeStatusMap[nodes.Items[i].Name]
-                    v.probeTimestamp = now
-                    v.readyTransitionTimestamp = now
-                    nc.nodeStatusMap[nodes.Items[i].Name] = v
-                }
-            }
-        }
-        nc.zoneStates[k] = newState
-    }
+    nc.handleDisruption(zoneToNodeConditions, nodes)
 
     return nil
 }
+
+func (nc *NodeController) handleDisruption(zoneToNodeConditions map[string][]*api.NodeCondition, nodes *api.NodeList) {
+    newZoneStates := map[string]zoneState{}
+    allAreFullyDisrupted := true
+    for k, v := range zoneToNodeConditions {
+        newState := nc.computeZoneStateFunc(v)
+        if newState != stateFullDisruption {
+            allAreFullyDisrupted = false
+        }
+        newZoneStates[k] = newState
+        if _, had := nc.zoneStates[k]; !had {
+            nc.zoneStates[k] = stateInitial
+        }
+    }
+
+    allWasFullyDisrupted := true
+    for k, v := range nc.zoneStates {
+        if _, have := zoneToNodeConditions[k]; !have {
+            delete(nc.zoneStates, k)
+            continue
+        }
+        if v != stateFullDisruption {
+            allWasFullyDisrupted = false
+            break
+        }
+    }
+
+    // At least one node was responding in previous pass or in the current pass. Semantics is as follows:
+    // - if the new state is "partialDisruption" we call a user defined function that returns a new limiter to use,
+    // - if the new state is "normal" we resume normal operation (go back to default limiter settings),
+    // - if new state is "fullDisruption" we restore normal eviction rate,
+    //   - unless all zones in the cluster are in "fullDisruption" - in that case we stop all evictions.
+    if !allAreFullyDisrupted || !allWasFullyDisrupted {
+        // We're switching to full disruption mode
+        if allAreFullyDisrupted {
+            glog.V(0).Info("NodeController detected that all Nodes are not-Ready. Entering master disruption mode.")
+            for i := range nodes.Items {
+                nc.cancelPodEviction(&nodes.Items[i])
+            }
+            // We stop all evictions.
+            for k := range nc.zonePodEvictor {
+                nc.zonePodEvictor[k].SwapLimiter(0)
+                nc.zoneTerminationEvictor[k].SwapLimiter(0)
+            }
+            for k := range nc.zoneStates {
+                nc.zoneStates[k] = stateFullDisruption
+            }
+            // All rate limiters are updated, so we can return early here.
+            return
+        }
+        // We're exiting full disruption mode
+        if allWasFullyDisrupted {
+            glog.V(0).Info("NodeController detected that some Nodes are Ready. Exiting master disruption mode.")
+            // When exiting disruption mode update probe timestamps on all Nodes.
+            now := nc.now()
+            for i := range nodes.Items {
+                v := nc.nodeStatusMap[nodes.Items[i].Name]
+                v.probeTimestamp = now
+                v.readyTransitionTimestamp = now
+                nc.nodeStatusMap[nodes.Items[i].Name] = v
+            }
+            // We reset all rate limiters to settings appropriate for the given state.
+            for k := range nc.zonePodEvictor {
+                nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newZoneStates[k])
+                nc.zoneStates[k] = newZoneStates[k]
+            }
+            return
+        }
+        // We know that there's at least one not-fully disrupted so,
+        // we can use default behavior for rate limiters
+        for k, v := range nc.zoneStates {
+            newState := newZoneStates[k]
+            if v == newState {
+                continue
+            }
+            glog.V(0).Infof("NodeController detected that zone %v is now in state %v.", k, newState)
+            nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newState)
+            nc.zoneStates[k] = newState
+        }
+    }
+}
+
+func (nc *NodeController) setLimiterInZone(zone string, zoneSize int, state zoneState) {
+    switch state {
+    case stateNormal:
+        nc.zonePodEvictor[zone].SwapLimiter(nc.evictionLimiterQPS)
+        nc.zoneTerminationEvictor[zone].SwapLimiter(nc.evictionLimiterQPS)
+    case statePartialDisruption:
+        nc.zonePodEvictor[zone].SwapLimiter(
+            nc.enterPartialDisruptionFunc(zoneSize, nc.evictionLimiterQPS))
+        nc.zoneTerminationEvictor[zone].SwapLimiter(
+            nc.enterPartialDisruptionFunc(zoneSize, nc.evictionLimiterQPS))
+    case stateFullDisruption:
+        nc.zonePodEvictor[zone].SwapLimiter(
+            nc.enterFullDisruptionFunc(zoneSize, nc.evictionLimiterQPS))
+        nc.zoneTerminationEvictor[zone].SwapLimiter(
+            nc.enterFullDisruptionFunc(zoneSize, nc.evictionLimiterQPS))
+    }
+}
 
 // For a given node checks its conditions and tries to update it. Returns grace period to which given node
 // is entitled, state of current and last observed Ready Condition, and an error if it occurred.
 func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, api.NodeCondition, *api.NodeCondition, error) {
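
handleDisruption above implements the semantics described in its comment block: compare the previous per-zone states with the freshly computed ones, then either pause every evictor, recover from a cluster-wide outage, or retune only the zones whose state changed. The sketch below restates that branch structure as a pure function for illustration; decide and the action strings are not code from this commit.

package main

import "fmt"

// action names the branch handleDisruption takes after comparing the previous
// and the newly computed per-zone states.
type action string

const (
    pauseAllEvictions  = action("all zones newly FullDisruption: SwapLimiter(0) on every zone queue")
    resumeFromOutage   = action("cluster recovering: refresh probe timestamps, then set per-zone limiters")
    adjustChangedZones = action("steady state: retune only zones whose state changed")
    noTransition       = action("everything was and still is FullDisruption: leave limiters at 0")
)

// decide mirrors the control flow of handleDisruption without touching any
// controller state; it is an illustration, not code from the commit.
func decide(allWereFullyDisrupted, allAreFullyDisrupted bool) action {
    if allAreFullyDisrupted && allWereFullyDisrupted {
        return noTransition
    }
    if allAreFullyDisrupted {
        return pauseAllEvictions
    }
    if allWereFullyDisrupted {
        return resumeFromOutage
    }
    return adjustChangedZones
}

func main() {
    fmt.Println(decide(false, true))  // entering master disruption mode
    fmt.Println(decide(true, false))  // exiting master disruption mode
    fmt.Println(decide(false, false)) // normal per-zone adjustments
    fmt.Println(decide(true, true))   // no change
}
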
@@ -791,16 +867,5 @@ func (nc *NodeController) cancelPodEviction(node *api.Node) bool {
 func (nc *NodeController) evictPods(node *api.Node) bool {
     nc.evictorLock.Lock()
     defer nc.evictorLock.Unlock()
-    foundHealty := false
-    for _, state := range nc.zoneStates {
-        if state != stateFullSegmentation {
-            foundHealty = true
-            break
-        }
-    }
-    if !foundHealty {
-        return false
-    }
-    zone := utilnode.GetZoneKey(node)
-    return nc.zonePodEvictor[zone].Add(node.Name)
+    return nc.zonePodEvictor[utilnode.GetZoneKey(node)].Add(node.Name)
 }
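
With this change the pause/resume decision lives in handleDisruption, which swaps each zone's rate limiter (QPS 0 pauses a zone), so evictPods above no longer needs its own full-segmentation check and simply enqueues the node into its zone's queue. The toy model below illustrates that interaction; toyQueue is a stand-in for RateLimitedTimedQueue (only Add and SwapLimiter appear in the diff, drain is an invented helper), and the zone key "zone-a" is a placeholder for what utilnode.GetZoneKey returns.

package main

import "fmt"

// toyQueue is a drastically simplified stand-in for RateLimitedTimedQueue.
// Only Add and SwapLimiter are grounded in the diff; drain models a single
// processing pass so the effect of a zero-QPS limiter is visible.
type toyQueue struct {
    qps     float32
    pending []string
}

func (q *toyQueue) Add(name string) bool {
    q.pending = append(q.pending, name)
    return true
}

func (q *toyQueue) SwapLimiter(qps float32) { q.qps = qps }

func (q *toyQueue) drain() []string {
    if q.qps == 0 {
        return nil // paused: entries stay queued, nothing is evicted
    }
    done := q.pending
    q.pending = nil
    return done
}

func main() {
    zonePodEvictor := map[string]*toyQueue{"zone-a": {qps: 0.1}}

    // What the new evictPods does: enqueue into the node's zone queue.
    zonePodEvictor["zone-a"].Add("node-1")

    // What handleDisruption does when every zone is fully disrupted.
    zonePodEvictor["zone-a"].SwapLimiter(0)
    fmt.Println(zonePodEvictor["zone-a"].drain()) // []

    // And again once some nodes report Ready.
    zonePodEvictor["zone-a"].SwapLimiter(0.1)
    fmt.Println(zonePodEvictor["zone-a"].drain()) // [node-1]
}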