Wire contexts to Core controllers

Mike Dame
2021-04-22 14:27:59 -04:00
parent 657412713b
commit 4960d0976a
61 changed files with 842 additions and 780 deletions

@@ -349,6 +349,7 @@ type Controller struct {
// NewNodeLifecycleController returns a new taint controller.
func NewNodeLifecycleController(
+ ctx context.Context,
leaseInformer coordinformers.LeaseInformer,
podInformer coreinformers.PodInformer,
nodeInformer coreinformers.NodeInformer,
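The constructor gains a context parameter so it can hand one to NewNoExecuteTaintManager below. A minimal, self-contained sketch of the calling shape this commit moves toward, one root context shared by construction and Run; runController is a stand-in and the signal-based root context is illustrative, not part of this commit (the real constructor also takes informers, a clientset and several tuning options):

```go
package main

import (
	"context"
	"os/signal"
	"syscall"
)

// runController stands in for NewNodeLifecycleController followed by Run(ctx);
// the real constructor takes many more arguments that are not shown here.
func runController(ctx context.Context) {
	// Everything the controller starts derives from ctx and exits when it is
	// cancelled, which is the pattern this commit introduces.
	<-ctx.Done()
}

func main() {
	// One root context, cancelled on SIGINT/SIGTERM, shared by construction
	// and Run. Deriving it from process signals is just one possible choice.
	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer stop()

	runController(ctx)
}
```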
@@ -484,7 +485,7 @@ func NewNodeLifecycleController(
podGetter := func(name, namespace string) (*v1.Pod, error) { return nc.podLister.Pods(namespace).Get(name) }
nodeLister := nodeInformer.Lister()
nodeGetter := func(name string) (*v1.Node, error) { return nodeLister.Get(name) }
- nc.taintManager = scheduler.NewNoExecuteTaintManager(kubeClient, podGetter, nodeGetter, nc.getPodsAssignedToNode)
+ nc.taintManager = scheduler.NewNoExecuteTaintManager(ctx, kubeClient, podGetter, nodeGetter, nc.getPodsAssignedToNode)
nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: nodeutil.CreateAddNodeHandler(func(node *v1.Node) error {
nc.taintManager.NodeUpdated(nil, node)
@@ -532,18 +533,18 @@ func NewNodeLifecycleController(
}
// Run starts an asynchronous loop that monitors the status of cluster nodes.
- func (nc *Controller) Run(stopCh <-chan struct{}) {
+ func (nc *Controller) Run(ctx context.Context) {
defer utilruntime.HandleCrash()
klog.Infof("Starting node controller")
defer klog.Infof("Shutting down node controller")
if !cache.WaitForNamedCacheSync("taint", stopCh, nc.leaseInformerSynced, nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced) {
if !cache.WaitForNamedCacheSync("taint", ctx.Done(), nc.leaseInformerSynced, nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced) {
return
}
if nc.runTaintManager {
- go nc.taintManager.Run(stopCh)
+ go nc.taintManager.Run(ctx)
}
// Close node update queue to cleanup go routine.
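Run now takes a context instead of a stop channel, and the same context feeds WaitForNamedCacheSync (via ctx.Done()) and the taint manager. A caller that still owns only a legacy stopCh can bridge it; the helper below is a sketch, not part of the commit (apimachinery's wait package offers a comparable ContextForChannel helper):

```go
package nodelifecycle // illustrative package name, not from this commit

import "context"

// contextFromStopCh bridges a legacy stop channel to a context so existing
// callers that still own a stopCh can call the new Run(ctx context.Context).
func contextFromStopCh(stopCh <-chan struct{}) (context.Context, context.CancelFunc) {
	ctx, cancel := context.WithCancel(context.Background())
	go func() {
		defer cancel()
		select {
		case <-stopCh: // legacy shutdown signal
		case <-ctx.Done(): // already cancelled elsewhere
		}
	}()
	return ctx, cancel
}
```

A legacy caller would then do something like: ctx, cancel := contextFromStopCh(stopCh); defer cancel(); nc.Run(ctx).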
@@ -556,35 +557,35 @@ func (nc *Controller) Run(stopCh <-chan struct{}) {
// the item is flagged when got from queue: if new event come, the new item will
// be re-queued until "Done", so no more than one worker handle the same item and
// no event missed.
- go wait.Until(nc.doNodeProcessingPassWorker, time.Second, stopCh)
+ go wait.UntilWithContext(ctx, nc.doNodeProcessingPassWorker, time.Second)
}
for i := 0; i < podUpdateWorkerSize; i++ {
- go wait.Until(nc.doPodProcessingWorker, time.Second, stopCh)
+ go wait.UntilWithContext(ctx, nc.doPodProcessingWorker, time.Second)
}
if nc.runTaintManager {
// Handling taint based evictions. Because we don't want a dedicated logic in TaintManager for NC-originated
// taints and we normally don't rate limit evictions caused by taints, we need to rate limit adding taints.
- go wait.Until(nc.doNoExecuteTaintingPass, scheduler.NodeEvictionPeriod, stopCh)
+ go wait.UntilWithContext(ctx, nc.doNoExecuteTaintingPass, scheduler.NodeEvictionPeriod)
} else {
// Managing eviction of nodes:
// When we delete pods off a node, if the node was not empty at the time we then
// queue an eviction watcher. If we hit an error, retry deletion.
- go wait.Until(nc.doEvictionPass, scheduler.NodeEvictionPeriod, stopCh)
+ go wait.UntilWithContext(ctx, nc.doEvictionPass, scheduler.NodeEvictionPeriod)
}
// Incorporate the results of node health signal pushed from kubelet to master.
- go wait.Until(func() {
- if err := nc.monitorNodeHealth(); err != nil {
+ go wait.UntilWithContext(ctx, func(ctx context.Context) {
+ if err := nc.monitorNodeHealth(ctx); err != nil {
klog.Errorf("Error monitoring node health: %v", err)
}
- }, nc.nodeMonitorPeriod, stopCh)
+ }, nc.nodeMonitorPeriod)
- <-stopCh
+ <-ctx.Done()
}
- func (nc *Controller) doNodeProcessingPassWorker() {
+ func (nc *Controller) doNodeProcessingPassWorker(ctx context.Context) {
for {
obj, shutdown := nc.nodeUpdateQueue.Get()
// "nodeUpdateQueue" will be shutdown when "stopCh" closed;
@@ -593,7 +594,7 @@ func (nc *Controller) doNodeProcessingPassWorker() {
return
}
nodeName := obj.(string)
- if err := nc.doNoScheduleTaintingPass(nodeName); err != nil {
+ if err := nc.doNoScheduleTaintingPass(ctx, nodeName); err != nil {
klog.Errorf("Failed to taint NoSchedule on node <%s>, requeue it: %v", nodeName, err)
// TODO(k82cn): Add nodeName back to the queue
}
@@ -607,7 +608,7 @@ func (nc *Controller) doNodeProcessingPassWorker() {
}
}
- func (nc *Controller) doNoScheduleTaintingPass(nodeName string) error {
+ func (nc *Controller) doNoScheduleTaintingPass(ctx context.Context, nodeName string) error {
node, err := nc.nodeLister.Get(nodeName)
if err != nil {
// If node not found, just ignore it.
@@ -656,13 +657,13 @@ func (nc *Controller) doNoScheduleTaintingPass(nodeName string) error {
if len(taintsToAdd) == 0 && len(taintsToDel) == 0 {
return nil
}
- if !nodeutil.SwapNodeControllerTaint(nc.kubeClient, taintsToAdd, taintsToDel, node) {
+ if !nodeutil.SwapNodeControllerTaint(ctx, nc.kubeClient, taintsToAdd, taintsToDel, node) {
return fmt.Errorf("failed to swap taints of node %+v", node)
}
return nil
}
- func (nc *Controller) doNoExecuteTaintingPass() {
+ func (nc *Controller) doNoExecuteTaintingPass(ctx context.Context) {
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
for k := range nc.zoneNoExecuteTainter {
@@ -694,7 +695,7 @@ func (nc *Controller) doNoExecuteTaintingPass() {
return true, 0
}
- result := nodeutil.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{&oppositeTaint}, node)
+ result := nodeutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{&oppositeTaint}, node)
if result {
//count the evictionsNumber
zone := utilnode.GetZoneKey(node)
@@ -706,7 +707,7 @@ func (nc *Controller) doNoExecuteTaintingPass() {
}
}
- func (nc *Controller) doEvictionPass() {
+ func (nc *Controller) doEvictionPass(ctx context.Context) {
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
for k := range nc.zonePodEvictor {
@@ -724,7 +725,7 @@ func (nc *Controller) doEvictionPass() {
utilruntime.HandleError(fmt.Errorf("unable to list pods from node %q: %v", value.Value, err))
return false, 0
}
- remaining, err := nodeutil.DeletePods(nc.kubeClient, pods, nc.recorder, value.Value, nodeUID, nc.daemonSetStore)
+ remaining, err := nodeutil.DeletePods(ctx, nc.kubeClient, pods, nc.recorder, value.Value, nodeUID, nc.daemonSetStore)
if err != nil {
// We are not setting eviction status here.
// New pods will be handled by zonePodEvictor retry
@@ -752,7 +753,7 @@ func (nc *Controller) doEvictionPass() {
// monitorNodeHealth verifies node health are constantly updated by kubelet, and
// if not, post "NodeReady==ConditionUnknown".
// This function will taint nodes who are not ready or not reachable for a long period of time.
- func (nc *Controller) monitorNodeHealth() error {
+ func (nc *Controller) monitorNodeHealth(ctx context.Context) error {
// We are listing nodes from local cache as we can tolerate some small delays
// comparing to state from etcd and there is eventual consistency anyway.
nodes, err := nc.nodeLister.List(labels.Everything())
@@ -771,7 +772,7 @@ func (nc *Controller) monitorNodeHealth() error {
nc.knownNodeSet[added[i].Name] = added[i]
nc.addPodEvictorForNewZone(added[i])
if nc.runTaintManager {
- nc.markNodeAsReachable(added[i])
+ nc.markNodeAsReachable(ctx, added[i])
} else {
nc.cancelPodEviction(added[i])
}
@@ -790,12 +791,12 @@ func (nc *Controller) monitorNodeHealth() error {
var currentReadyCondition *v1.NodeCondition
node := nodes[i].DeepCopy()
if err := wait.PollImmediate(retrySleepTime, retrySleepTime*scheduler.NodeHealthUpdateRetry, func() (bool, error) {
- gracePeriod, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeHealth(node)
+ gracePeriod, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeHealth(ctx, node)
if err == nil {
return true, nil
}
name := node.Name
- node, err = nc.kubeClient.CoreV1().Nodes().Get(context.TODO(), name, metav1.GetOptions{})
+ node, err = nc.kubeClient.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{})
if err != nil {
klog.Errorf("Failed while getting a Node to retry updating node health. Probably Node %s was deleted.", name)
return false, err
@@ -825,9 +826,9 @@ func (nc *Controller) monitorNodeHealth() error {
continue
}
if nc.runTaintManager {
- nc.processTaintBaseEviction(node, &observedReadyCondition)
+ nc.processTaintBaseEviction(ctx, node, &observedReadyCondition)
} else {
- if err := nc.processNoTaintBaseEviction(node, &observedReadyCondition, gracePeriod, pods); err != nil {
+ if err := nc.processNoTaintBaseEviction(ctx, node, &observedReadyCondition, gracePeriod, pods); err != nil {
utilruntime.HandleError(fmt.Errorf("unable to evict all pods from node %v: %v; queuing for retry", node.Name, err))
}
}
@@ -839,7 +840,7 @@ func (nc *Controller) monitorNodeHealth() error {
nodeutil.RecordNodeStatusChange(nc.recorder, node, "NodeNotReady")
fallthrough
case needsRetry && observedReadyCondition.Status != v1.ConditionTrue:
- if err = nodeutil.MarkPodsNotReady(nc.kubeClient, nc.recorder, pods, node.Name); err != nil {
+ if err = nodeutil.MarkPodsNotReady(ctx, nc.kubeClient, nc.recorder, pods, node.Name); err != nil {
utilruntime.HandleError(fmt.Errorf("unable to mark all pods NotReady on node %v: %v; queuing for retry", node.Name, err))
nc.nodesToRetry.Store(node.Name, struct{}{})
continue
@@ -848,12 +849,12 @@ func (nc *Controller) monitorNodeHealth() error {
}
nc.nodesToRetry.Delete(node.Name)
}
- nc.handleDisruption(zoneToNodeConditions, nodes)
+ nc.handleDisruption(ctx, zoneToNodeConditions, nodes)
return nil
}
- func (nc *Controller) processTaintBaseEviction(node *v1.Node, observedReadyCondition *v1.NodeCondition) {
+ func (nc *Controller) processTaintBaseEviction(ctx context.Context, node *v1.Node, observedReadyCondition *v1.NodeCondition) {
decisionTimestamp := nc.now()
// Check eviction timeout against decisionTimestamp
switch observedReadyCondition.Status {
@@ -861,7 +862,7 @@ func (nc *Controller) processTaintBaseEviction(node *v1.Node, observedReadyCondi
// We want to update the taint straight away if Node is already tainted with the UnreachableTaint
if taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) {
taintToAdd := *NotReadyTaintTemplate
- if !nodeutil.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{UnreachableTaintTemplate}, node) {
+ if !nodeutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{UnreachableTaintTemplate}, node) {
klog.Errorf("Failed to instantly swap UnreachableTaint to NotReadyTaint. Will try again in the next cycle.")
}
} else if nc.markNodeForTainting(node, v1.ConditionFalse) {
@@ -874,7 +875,7 @@ func (nc *Controller) processTaintBaseEviction(node *v1.Node, observedReadyCondi
// We want to update the taint straight away if Node is already tainted with the UnreachableTaint
if taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) {
taintToAdd := *UnreachableTaintTemplate
- if !nodeutil.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{NotReadyTaintTemplate}, node) {
+ if !nodeutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{NotReadyTaintTemplate}, node) {
klog.Errorf("Failed to instantly swap NotReadyTaint to UnreachableTaint. Will try again in the next cycle.")
}
} else if nc.markNodeForTainting(node, v1.ConditionUnknown) {
@@ -884,7 +885,7 @@ func (nc *Controller) processTaintBaseEviction(node *v1.Node, observedReadyCondi
)
}
case v1.ConditionTrue:
- removed, err := nc.markNodeAsReachable(node)
+ removed, err := nc.markNodeAsReachable(ctx, node)
if err != nil {
klog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name)
}
@@ -894,7 +895,7 @@ func (nc *Controller) processTaintBaseEviction(node *v1.Node, observedReadyCondi
}
}
- func (nc *Controller) processNoTaintBaseEviction(node *v1.Node, observedReadyCondition *v1.NodeCondition, gracePeriod time.Duration, pods []*v1.Pod) error {
+ func (nc *Controller) processNoTaintBaseEviction(ctx context.Context, node *v1.Node, observedReadyCondition *v1.NodeCondition, gracePeriod time.Duration, pods []*v1.Pod) error {
decisionTimestamp := nc.now()
nodeHealthData := nc.nodeHealthMap.getDeepCopy(node.Name)
if nodeHealthData == nil {
@@ -904,7 +905,7 @@ func (nc *Controller) processNoTaintBaseEviction(node *v1.Node, observedReadyCon
switch observedReadyCondition.Status {
case v1.ConditionFalse:
if decisionTimestamp.After(nodeHealthData.readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
- enqueued, err := nc.evictPods(node, pods)
+ enqueued, err := nc.evictPods(ctx, node, pods)
if err != nil {
return err
}
@@ -919,7 +920,7 @@ func (nc *Controller) processNoTaintBaseEviction(node *v1.Node, observedReadyCon
}
case v1.ConditionUnknown:
if decisionTimestamp.After(nodeHealthData.probeTimestamp.Add(nc.podEvictionTimeout)) {
- enqueued, err := nc.evictPods(node, pods)
+ enqueued, err := nc.evictPods(ctx, node, pods)
if err != nil {
return err
}
@@ -953,7 +954,7 @@ func isNodeExcludedFromDisruptionChecks(node *v1.Node) bool {
// tryUpdateNodeHealth checks a given node's conditions and tries to update it. Returns grace period to
// which given node is entitled, state of current and last observed Ready Condition, and an error if it occurred.
- func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.NodeCondition, *v1.NodeCondition, error) {
+ func (nc *Controller) tryUpdateNodeHealth(ctx context.Context, node *v1.Node) (time.Duration, v1.NodeCondition, *v1.NodeCondition, error) {
nodeHealth := nc.nodeHealthMap.getDeepCopy(node.Name)
defer func() {
nc.nodeHealthMap.set(node.Name, nodeHealth)
@@ -1102,7 +1103,7 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node
_, currentReadyCondition = nodeutil.GetNodeCondition(&node.Status, v1.NodeReady)
if !apiequality.Semantic.DeepEqual(currentReadyCondition, &observedReadyCondition) {
- if _, err := nc.kubeClient.CoreV1().Nodes().UpdateStatus(context.TODO(), node, metav1.UpdateOptions{}); err != nil {
+ if _, err := nc.kubeClient.CoreV1().Nodes().UpdateStatus(ctx, node, metav1.UpdateOptions{}); err != nil {
klog.Errorf("Error updating node %s: %v", node.Name, err)
return gracePeriod, observedReadyCondition, currentReadyCondition, err
}
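Inside tryUpdateNodeHealth, the UpdateStatus call drops context.TODO() in favor of the context threaded in from Run, so cancelling the controller also cancels in-flight requests to the API server; the same substitution recurs throughout the diff. A stripped-down sketch of the pattern, assuming only a client-go kubernetes.Interface (the function name and package are hypothetical):

```go
package nodeutil // hypothetical package, for illustration only

import (
	"context"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// updateNodeStatus shows the recurring pattern in this commit: thread the
// caller's ctx into the client call instead of context.TODO(), so cancelling
// the controller aborts the request as well.
func updateNodeStatus(ctx context.Context, client kubernetes.Interface, node *v1.Node) (*v1.Node, error) {
	return client.CoreV1().Nodes().UpdateStatus(ctx, node, metav1.UpdateOptions{})
}
```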
@@ -1119,7 +1120,7 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node
return gracePeriod, observedReadyCondition, currentReadyCondition, nil
}
- func (nc *Controller) handleDisruption(zoneToNodeConditions map[string][]*v1.NodeCondition, nodes []*v1.Node) {
+ func (nc *Controller) handleDisruption(ctx context.Context, zoneToNodeConditions map[string][]*v1.NodeCondition, nodes []*v1.Node) {
newZoneStates := map[string]ZoneState{}
allAreFullyDisrupted := true
for k, v := range zoneToNodeConditions {
@@ -1163,7 +1164,7 @@ func (nc *Controller) handleDisruption(zoneToNodeConditions map[string][]*v1.Nod
klog.V(0).Info("Controller detected that all Nodes are not-Ready. Entering master disruption mode.")
for i := range nodes {
if nc.runTaintManager {
- _, err := nc.markNodeAsReachable(nodes[i])
+ _, err := nc.markNodeAsReachable(ctx, nodes[i])
if err != nil {
klog.Errorf("Failed to remove taints from Node %v", nodes[i].Name)
}
@@ -1227,7 +1228,7 @@ func (nc *Controller) podUpdated(oldPod, newPod *v1.Pod) {
}
}
- func (nc *Controller) doPodProcessingWorker() {
+ func (nc *Controller) doPodProcessingWorker(ctx context.Context) {
for {
obj, shutdown := nc.podUpdateQueue.Get()
// "podUpdateQueue" will be shutdown when "stopCh" closed;
@@ -1237,7 +1238,7 @@ func (nc *Controller) doPodProcessingWorker() {
}
podItem := obj.(podUpdateItem)
- nc.processPod(podItem)
+ nc.processPod(ctx, podItem)
}
}
@@ -1245,7 +1246,7 @@ func (nc *Controller) doPodProcessingWorker() {
// 1. for NodeReady=true node, taint eviction for this pod will be cancelled
// 2. for NodeReady=false or unknown node, taint eviction of pod will happen and pod will be marked as not ready
// 3. if node doesn't exist in cache, it will be skipped and handled later by doEvictionPass
- func (nc *Controller) processPod(podItem podUpdateItem) {
+ func (nc *Controller) processPod(ctx context.Context, podItem podUpdateItem) {
defer nc.podUpdateQueue.Done(podItem)
pod, err := nc.podLister.Pods(podItem.namespace).Get(podItem.name)
if err != nil {
@@ -1286,7 +1287,7 @@ func (nc *Controller) processPod(podItem podUpdateItem) {
// In taint-based eviction mode, only node updates are processed by NodeLifecycleController.
// Pods are processed by TaintManager.
if !nc.runTaintManager {
- if err := nc.processNoTaintBaseEviction(node, currentReadyCondition, nc.nodeMonitorGracePeriod, pods); err != nil {
+ if err := nc.processNoTaintBaseEviction(ctx, node, currentReadyCondition, nc.nodeMonitorGracePeriod, pods); err != nil {
klog.Warningf("Unable to process pod %+v eviction from node %v: %v.", podItem, nodeName, err)
nc.podUpdateQueue.AddRateLimited(podItem)
return
@@ -1294,7 +1295,7 @@ func (nc *Controller) processPod(podItem podUpdateItem) {
}
if currentReadyCondition.Status != v1.ConditionTrue {
- if err := nodeutil.MarkPodsNotReady(nc.kubeClient, nc.recorder, pods, nodeName); err != nil {
+ if err := nodeutil.MarkPodsNotReady(ctx, nc.kubeClient, nc.recorder, pods, nodeName); err != nil {
klog.Warningf("Unable to mark pod %+v NotReady on node %v: %v.", podItem, nodeName, err)
nc.podUpdateQueue.AddRateLimited(podItem)
}
@@ -1421,14 +1422,14 @@ func (nc *Controller) cancelPodEviction(node *v1.Node) bool {
// Returns false if the node name was already enqueued.
// - deletes pods immediately if node is already marked as evicted.
// Returns false, because the node wasn't added to the queue.
- func (nc *Controller) evictPods(node *v1.Node, pods []*v1.Pod) (bool, error) {
+ func (nc *Controller) evictPods(ctx context.Context, node *v1.Node, pods []*v1.Pod) (bool, error) {
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
status, ok := nc.nodeEvictionMap.getStatus(node.Name)
if ok && status == evicted {
// Node eviction already happened for this node.
// Handling immediate pod deletion.
- _, err := nodeutil.DeletePods(nc.kubeClient, pods, nc.recorder, node.Name, string(node.UID), nc.daemonSetStore)
+ _, err := nodeutil.DeletePods(ctx, nc.kubeClient, pods, nc.recorder, node.Name, string(node.UID), nc.daemonSetStore)
if err != nil {
return false, fmt.Errorf("unable to delete pods from node %q: %v", node.Name, err)
}
@@ -1458,15 +1459,15 @@ func (nc *Controller) markNodeForTainting(node *v1.Node, status v1.ConditionStat
return nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].Add(node.Name, string(node.UID))
}
- func (nc *Controller) markNodeAsReachable(node *v1.Node) (bool, error) {
+ func (nc *Controller) markNodeAsReachable(ctx context.Context, node *v1.Node) (bool, error) {
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
- err := controller.RemoveTaintOffNode(nc.kubeClient, node.Name, node, UnreachableTaintTemplate)
+ err := controller.RemoveTaintOffNode(ctx, nc.kubeClient, node.Name, node, UnreachableTaintTemplate)
if err != nil {
klog.Errorf("Failed to remove taint from node %v: %v", node.Name, err)
return false, err
}
- err = controller.RemoveTaintOffNode(nc.kubeClient, node.Name, node, NotReadyTaintTemplate)
+ err = controller.RemoveTaintOffNode(ctx, nc.kubeClient, node.Name, node, NotReadyTaintTemplate)
if err != nil {
klog.Errorf("Failed to remove taint from node %v: %v", node.Name, err)
return false, err