Merge pull request #35235 from foxish/node-controller-no-force-deletion

Automatic merge from submit-queue

Node controller to not force delete pods

Fixes https://github.com/kubernetes/kubernetes/issues/35145

- [x] e2e tests covering PetSet, RC, and Job.
- [x] Remove, and cover with tests, the other locations where the NodeController force-deletes pods (a sketch of what such a force deletion looks like follows below).
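
For context, "force delete" here means deleting the pod object from the api-server with a grace period of zero, so the object disappears immediately even though the kubelet on the partitioned node may still be running the container. A minimal sketch of such a deletion, using a present-day client-go API rather than the code touched by this PR (the helper name and its arguments are illustrative only):

```go
package example

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// forceDeletePod removes the pod object immediately (grace period 0).
// This is the kind of deletion the node controller used to issue for pods
// on unresponsive nodes and, after this change, no longer does.
func forceDeletePod(ctx context.Context, client kubernetes.Interface, namespace, name string) error {
	var zero int64
	return client.CoreV1().Pods(namespace).Delete(ctx, name, metav1.DeleteOptions{
		GracePeriodSeconds: &zero,
	})
}
```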

**Release note**:

``` release-note
Node controller no longer force-deletes pods from the api-server.

* For StatefulSet (previously PetSet), this change means creation of replacement pods is blocked until the old pods are definitely not running: either the kubelet recovers from its partitioned state, the Node object is deleted, the instance is deleted in the cloud provider, or the pod is force-deleted from the api-server. This has the desirable outcome of "fencing" to prevent "split brain" scenarios.
* For all other existing controllers, this has no effect on their ability to replace pods, because they do not reuse pod names (they use generate-name); see the naming sketch after this note.
* User-written controllers that reuse names of pod objects should evaluate this change.
```
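
The second bullet hinges on how replacement pods are named. A rough sketch, using current API package paths (the pod names `frontend-` and `web-0` are made up for illustration): a controller that relies on generate-name lets the api-server assign a fresh unique name to every replacement, so a stale pod object left behind by a partitioned node never collides with it, whereas a StatefulSet reuses the same stable name and therefore cannot create the replacement until the old object is gone.

```go
package example

import (
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// RC/ReplicaSet-style replacement: the api-server appends a random suffix to
// GenerateName, so the new pod never shares a name with the one it replaces.
var rcReplacement = v1.Pod{
	ObjectMeta: metav1.ObjectMeta{GenerateName: "frontend-"},
}

// StatefulSet-style replacement: the stable ordinal name is reused, so the
// create call is rejected while a pod object named "web-0" still exists.
var statefulSetReplacement = v1.Pod{
	ObjectMeta: metav1.ObjectMeta{Name: "web-0"},
}
```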
Authored by Kubernetes Submit Queue on 2016-11-01 20:08:57 -07:00, committed by GitHub.
10 changed files with 621 additions and 511 deletions

@@ -125,9 +125,8 @@ type NodeController struct {
// Lock to access evictor workers
evictorLock sync.Mutex
// workers that evicts pods from unresponsive nodes.
zonePodEvictor map[string]*RateLimitedTimedQueue
zoneTerminationEvictor map[string]*RateLimitedTimedQueue
podEvictionTimeout time.Duration
zonePodEvictor map[string]*RateLimitedTimedQueue
podEvictionTimeout time.Duration
// The maximum duration before a pod evicted from a node can be forcefully terminated.
maximumGracePeriod time.Duration
recorder record.EventRecorder
@@ -215,7 +214,6 @@ func NewNodeController(
podEvictionTimeout: podEvictionTimeout,
maximumGracePeriod: 5 * time.Minute,
zonePodEvictor: make(map[string]*RateLimitedTimedQueue),
zoneTerminationEvictor: make(map[string]*RateLimitedTimedQueue),
nodeStatusMap: make(map[string]nodeStatusData),
nodeMonitorGracePeriod: nodeMonitorGracePeriod,
nodeMonitorPeriod: nodeMonitorPeriod,
@@ -370,17 +368,8 @@ func (nc *NodeController) Run() {
}, nc.nodeMonitorPeriod, wait.NeverStop)
// Managing eviction of nodes:
// 1. when we delete pods off a node, if the node was not empty at the time we then
// queue a termination watcher
// a. If we hit an error, retry deletion
// 2. The terminator loop ensures that pods are eventually cleaned and we never
// terminate a pod in a time period less than nc.maximumGracePeriod. AddedAt
// is the time from which we measure "has this pod been terminating too long",
// after which we will delete the pod with grace period 0 (force delete).
// a. If we hit errors, retry instantly
// b. If there are no pods left terminating, exit
// c. If there are pods still terminating, wait for their estimated completion
// before retrying
// When we delete pods off a node, if the node was not empty at the time we then
// queue an eviction watcher. If we hit an error, retry deletion.
go wait.Until(func() {
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
@@ -405,42 +394,12 @@ func (nc *NodeController) Run() {
}
if remaining {
nc.zoneTerminationEvictor[k].Add(value.Value, value.UID)
glog.Infof("Pods awaiting deletion due to NodeController eviction")
}
return true, 0
})
}
}, nodeEvictionPeriod, wait.NeverStop)
// TODO: replace with a controller that ensures pods that are terminating complete
// in a particular time period
go wait.Until(func() {
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
for k := range nc.zoneTerminationEvictor {
nc.zoneTerminationEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
nodeUid, _ := value.UID.(string)
completed, remaining, err := terminatePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, value.AddedAt, nc.maximumGracePeriod)
if err != nil {
utilruntime.HandleError(fmt.Errorf("unable to terminate pods on node %q: %v", value.Value, err))
return false, 0
}
if completed {
glog.V(2).Infof("All pods terminated on %s", value.Value)
recordNodeEvent(nc.recorder, value.Value, nodeUid, api.EventTypeNormal, "TerminatedAllPods", fmt.Sprintf("Terminated all Pods on Node %s.", value.Value))
return true, 0
}
glog.V(2).Infof("Pods terminating since %s on %q, estimated completion %s", value.AddedAt, value.Value, remaining)
// clamp very short intervals
if remaining < nodeEvictionPeriod {
remaining = nodeEvictionPeriod
}
return false, remaining
})
}
}, nodeEvictionPeriod, wait.NeverStop)
}()
}
@@ -470,10 +429,6 @@ func (nc *NodeController) monitorNodeStatus() error {
glog.Infof("Initializing eviction metric for zone: %v", zone)
EvictionsNumber.WithLabelValues(zone).Add(0)
}
if _, found := nc.zoneTerminationEvictor[zone]; !found {
nc.zoneTerminationEvictor[zone] = NewRateLimitedTimedQueue(
flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, evictionRateLimiterBurst))
}
nc.cancelPodEviction(added[i])
}
@@ -557,7 +512,7 @@ func (nc *NodeController) monitorNodeStatus() error {
// Kubelet is not reporting and Cloud Provider says node
// is gone. Delete it without worrying about grace
// periods.
if err := forcefullyDeleteNode(nc.kubeClient, nodeName, nc.forcefullyDeletePod); err != nil {
if err := forcefullyDeleteNode(nc.kubeClient, nodeName); err != nil {
glog.Errorf("Unable to forcefully delete node %q: %v", nodeName, err)
}
}(node.Name)
@@ -618,7 +573,6 @@ func (nc *NodeController) handleDisruption(zoneToNodeConditions map[string][]*ap
// We stop all evictions.
for k := range nc.zonePodEvictor {
nc.zonePodEvictor[k].SwapLimiter(0)
nc.zoneTerminationEvictor[k].SwapLimiter(0)
}
for k := range nc.zoneStates {
nc.zoneStates[k] = stateFullDisruption
@@ -662,17 +616,12 @@ func (nc *NodeController) setLimiterInZone(zone string, zoneSize int, state zone
switch state {
case stateNormal:
nc.zonePodEvictor[zone].SwapLimiter(nc.evictionLimiterQPS)
nc.zoneTerminationEvictor[zone].SwapLimiter(nc.evictionLimiterQPS)
case statePartialDisruption:
nc.zonePodEvictor[zone].SwapLimiter(
nc.enterPartialDisruptionFunc(zoneSize))
nc.zoneTerminationEvictor[zone].SwapLimiter(
nc.enterPartialDisruptionFunc(zoneSize))
case stateFullDisruption:
nc.zonePodEvictor[zone].SwapLimiter(
nc.enterFullDisruptionFunc(zoneSize))
nc.zoneTerminationEvictor[zone].SwapLimiter(
nc.enterFullDisruptionFunc(zoneSize))
}
}
@@ -871,8 +820,7 @@ func (nc *NodeController) cancelPodEviction(node *api.Node) bool {
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
wasDeleting := nc.zonePodEvictor[zone].Remove(node.Name)
wasTerminating := nc.zoneTerminationEvictor[zone].Remove(node.Name)
if wasDeleting || wasTerminating {
if wasDeleting {
glog.V(2).Infof("Cancelling pod Eviction on Node: %v", node.Name)
return true
}