Mirror of https://github.com/optim-enterprises-bv/kubernetes.git, synced 2025-11-03 19:58:17 +00:00
	Merge pull request #53353 from jiayingz/node-status-fix
Automatic merge from submit-queue (batch tested with PRs 53228, 53232, 53353). If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

**What this PR does / why we need it**: Fixes a regression introduced by PR 52290 in which extended resource capacity could temporarily drop to zero after a kubelet restart, so pods restarted during that window could fail to be scheduled.

**Which issue this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close that issue when PR gets merged)*: fixes https://github.com/kubernetes/kubernetes/issues/53342

**Special notes for your reviewer**:

**Release note**:
```release-note
```
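To illustrate the regression being fixed, here is a minimal standalone sketch (not the kubelet's actual code: plain `map[string]int64` capacity maps and a local `isExtendedResourceName` helper stand in for `node.Status.Capacity`, `v1helper.IsExtendedResourceName`, and the container manager's capacity snapshot). Right after a kubelet restart, the freshly computed snapshot may not yet include device-plugin extended resources, so the "remove stale extended resources" step deletes capacity that is still valid:

```go
// Simplified sketch of the regression, not the kubelet's actual code.
package main

import (
	"fmt"
	"strings"
)

// isExtendedResourceName: stand-in check for names like "nvidia.com/gpu".
func isExtendedResourceName(name string) bool {
	return strings.Contains(name, "/")
}

// syncCapacity mirrors the pattern in the removed hunk: copy extended
// resources from the freshly computed snapshot, then delete any extended
// resource that is not present in the snapshot.
func syncCapacity(nodeCapacity, currentCapacity map[string]int64) {
	for k, v := range currentCapacity {
		if isExtendedResourceName(k) {
			nodeCapacity[k] = v
		}
	}
	// "Remove stale extended resources" -- the step the PR drops. Right
	// after a restart the snapshot may not contain extended resources yet,
	// so still-valid capacity gets wiped out.
	for k := range nodeCapacity {
		if isExtendedResourceName(k) {
			if _, ok := currentCapacity[k]; !ok {
				delete(nodeCapacity, k)
			}
		}
	}
}

func main() {
	node := map[string]int64{"cpu": 8, "nvidia.com/gpu": 2}
	// Simulate the window right after a kubelet restart, before the device
	// plugin has re-registered: the snapshot reports no extended resources.
	syncCapacity(node, map[string]int64{"cpu": 8})
	fmt.Println(node) // map[cpu:8] -- GPU capacity temporarily dropped to zero
}
```

The hunks below drop exactly that removal step from `setNodeStatusMachineInfo` and delete the e2e test code that relied on capacity dropping to zero when the device plugin DaemonSet is removed.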
```diff
@@ -608,15 +608,6 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
 					node.Status.Capacity[k] = v
 				}
 			}
-			// Remove stale extended resources.
-			for k := range node.Status.Capacity {
-				if v1helper.IsExtendedResourceName(k) {
-					if _, ok := currentCapacity[k]; !ok {
-						glog.V(2).Infof("delete capacity for %s", k)
-						delete(node.Status.Capacity, k)
-					}
-				}
-			}
 		}
 	}
 
```
```diff
@@ -139,24 +139,6 @@ func areGPUsAvailableOnAllSchedulableNodes(f *framework.Framework) bool {
 	return true
 }
 
-func areGPUsAvailableOnAnySchedulableNodes(f *framework.Framework) bool {
-	framework.Logf("Getting list of Nodes from API server")
-	nodeList, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{})
-	framework.ExpectNoError(err, "getting node list")
-	for _, node := range nodeList.Items {
-		if node.Spec.Unschedulable {
-			continue
-		}
-		framework.Logf("gpuResourceName %s", gpuResourceName)
-		if val, ok := node.Status.Capacity[gpuResourceName]; ok && val.Value() > 0 {
-			framework.Logf("Nvidia GPUs available on Node: %q", node.Name)
-			return true
-		}
-	}
-	framework.Logf("Nvidia GPUs don't exist on all schedulable nodes")
-	return false
-}
-
 func getGPUsAvailable(f *framework.Framework) int64 {
 	nodeList, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{})
 	framework.ExpectNoError(err, "getting node list")
```
```diff
@@ -225,27 +207,6 @@ var _ = SIGDescribe("[Feature:GPU]", func() {
 var _ = SIGDescribe("[Feature:GPUDevicePlugin]", func() {
 	f := framework.NewDefaultFramework("device-plugin-gpus")
 	It("run Nvidia GPU Device Plugin tests on Container Optimized OS only", func() {
-		// 1. Verifies GPU resource is successfully advertised on the nodes
-		// and we can run pods using GPUs.
-		By("Starting device plugin daemonset and running GPU pods")
-		testNvidiaGPUsOnCOS(f)
-
-		// 2. Verifies that when the device plugin DaemonSet is removed, resource capacity drops to zero.
-		By("Deleting device plugin daemonset")
-		ds, err := framework.DsFromManifest(dsYamlUrl)
-		Expect(err).NotTo(HaveOccurred())
-		falseVar := false
-		err = f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Delete(ds.Name, &metav1.DeleteOptions{OrphanDependents: &falseVar})
-		framework.ExpectNoError(err, "failed to delete daemonset")
-		framework.Logf("Successfully deleted device plugin daemonset. Wait for resource to be removed.")
-		// Wait for Nvidia GPUs to be unavailable on all nodes.
-		Eventually(func() bool {
-			return !areGPUsAvailableOnAnySchedulableNodes(f)
-		}, 10*time.Minute, time.Second).Should(BeTrue())
-
-		// 3. Restarts the device plugin DaemonSet. Verifies GPU resource is successfully advertised
-		// on the nodes and we can run pods using GPUs.
-		By("Restarting device plugin daemonset and running GPU pods")
 		testNvidiaGPUsOnCOS(f)
 	})
 })
```