Mirror of https://github.com/optim-enterprises-bv/kubernetes.git (synced 2025-11-04 04:08:16 +00:00)
Fix preemption race conditions on heavily utilized nodes
committed by Mike Dame
parent 50f9ea7999
commit 99b301d7e1
@@ -34,6 +34,7 @@ import (
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/watch"
 	clientset "k8s.io/client-go/kubernetes"
+	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
 	"k8s.io/kubernetes/pkg/apis/scheduling"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
@@ -103,57 +104,77 @@ var _ = SIGDescribe("SchedulerPreemption [Serial]", func() {
 		var podRes v1.ResourceList
 		// Create one pod per node that uses a lot of the node's resources.
 		ginkgo.By("Create pods that use 60% of node resources.")
-		pods := make([]*v1.Pod, len(nodeList.Items))
+		pods := make([]*v1.Pod, 0, len(nodeList.Items))
+		allPods, err := cs.CoreV1().Pods(metav1.NamespaceAll).List(metav1.ListOptions{})
+		framework.ExpectNoError(err)
 		for i, node := range nodeList.Items {
+			currentCPUUsage, currentMemUsage := getCurrentPodUsageOnTheNode(node.Name, allPods.Items, podRequestedResource)
+			framework.Logf("Current cpu and memory usage %v, %v", currentCPUUsage, currentMemUsage)
 			cpuAllocatable, found := node.Status.Allocatable["cpu"]
 			framework.ExpectEqual(found, true)
-			milliCPU := cpuAllocatable.MilliValue() * 40 / 100
+			milliCPU := cpuAllocatable.MilliValue()
+			milliCPU = int64(float64(milliCPU-currentCPUUsage) * float64(0.6))
 			memAllocatable, found := node.Status.Allocatable["memory"]
 			framework.ExpectEqual(found, true)
-			memory := memAllocatable.Value() * 60 / 100
+			memory := memAllocatable.Value()
+			memory = int64(float64(memory-currentMemUsage) * float64(0.6))
+			// If a node is already heavily utilized let not's create a pod there.
+			if milliCPU <= 0 || memory <= 0 {
+				framework.Logf("Node is heavily utilized, let's not create a pod here")
+				continue
+			}
 			podRes = v1.ResourceList{}
 			podRes[v1.ResourceCPU] = *resource.NewMilliQuantity(int64(milliCPU), resource.DecimalSI)
 			podRes[v1.ResourceMemory] = *resource.NewQuantity(int64(memory), resource.BinarySI)
 
 			// make the first pod low priority and the rest medium priority.
 			priorityName := mediumPriorityClassName
-			if i == 0 {
+			if len(pods) == 0 {
 				priorityName = lowPriorityClassName
 			}
-			pods[i] = createPausePod(f, pausePodConfig{
+			pods = append(pods, createPausePod(f, pausePodConfig{
 				Name:              fmt.Sprintf("pod%d-%v", i, priorityName),
 				PriorityClassName: priorityName,
 				Resources: &v1.ResourceRequirements{
 					Requests: podRes,
 				},
-			})
+				NodeName: node.Name,
+			}))
 			framework.Logf("Created pod: %v", pods[i].Name)
 		}
+		if len(pods) < 2 {
+			framework.Failf("We need at least two pods to be created but" +
+				"all nodes are already heavily utilized, so preemption tests cannot be run")
+		}
 		ginkgo.By("Wait for pods to be scheduled.")
 		for _, pod := range pods {
 			framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(cs, pod))
 		}
 
-		ginkgo.By("Run a high priority pod that use 60% of a node resources.")
-		// Create a high priority pod and make sure it is scheduled.
+		// Set the pod request to the first pod's resources (should be low priority pod)
+		podRes = pods[0].Spec.Containers[0].Resources.Requests
+
+		ginkgo.By("Run a high priority pod that has same requirements as that of lower priority pod")
+		// Create a high priority pod and make sure it is scheduled on the same node as the low priority pod.
 		runPausePod(f, pausePodConfig{
 			Name:              "preemptor-pod",
 			PriorityClassName: highPriorityClassName,
 			Resources: &v1.ResourceRequirements{
 				Requests: podRes,
 			},
+			NodeName: pods[0].Spec.NodeName,
 		})
-		// Make sure that the lowest priority pod is deleted.
+
 		preemptedPod, err := cs.CoreV1().Pods(pods[0].Namespace).Get(pods[0].Name, metav1.GetOptions{})
-		podDeleted := (err != nil && apierrors.IsNotFound(err)) ||
+		podPreempted := (err != nil && apierrors.IsNotFound(err)) ||
 			(err == nil && preemptedPod.DeletionTimestamp != nil)
-		framework.ExpectEqual(podDeleted, true)
-		// Other pods (mid priority ones) should be present.
 		for i := 1; i < len(pods); i++ {
 			livePod, err := cs.CoreV1().Pods(pods[i].Namespace).Get(pods[i].Name, metav1.GetOptions{})
 			framework.ExpectNoError(err)
 			gomega.Expect(livePod.DeletionTimestamp).To(gomega.BeNil())
 		}
+
+		framework.ExpectEqual(podPreempted, true)
 	})
 
 	// This test verifies that when a critical pod is created and no node with
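The sizing change above is the core of the fix: instead of requesting a fixed 40% of CPU and 60% of memory of each node's allocatable, the test now subtracts what the pods already bound to the node request and takes 60% of the remainder, skipping any node that has nothing left. A minimal standalone sketch of that arithmetic (the helper name here is illustrative, not part of the test):

	// remainingRequest returns 60% of the allocatable capacity that is still
	// unrequested on the node; a non-positive result means the node is already
	// heavily utilized and the test skips it.
	func remainingRequest(allocatable, currentUsage int64) int64 {
		return int64(float64(allocatable-currentUsage) * 0.6)
	}

Applied once to milli-CPU and once to bytes of memory, this is what the milliCPU = int64(float64(milliCPU-currentCPUUsage) * float64(0.6)) and memory = int64(float64(memory-currentMemUsage) * float64(0.6)) lines above compute.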
@@ -163,21 +184,32 @@ var _ = SIGDescribe("SchedulerPreemption [Serial]", func() {
 		var podRes v1.ResourceList
 		// Create one pod per node that uses a lot of the node's resources.
 		ginkgo.By("Create pods that use 60% of node resources.")
-		pods := make([]*v1.Pod, len(nodeList.Items))
+		pods := make([]*v1.Pod, 0, len(nodeList.Items))
+		allPods, err := cs.CoreV1().Pods(metav1.NamespaceAll).List(metav1.ListOptions{})
+		framework.ExpectNoError(err)
 		for i, node := range nodeList.Items {
+			currentCPUUsage, currentMemUsage := getCurrentPodUsageOnTheNode(node.Name, allPods.Items, podRequestedResource)
+			framework.Logf("Current cpu usage and memory usage is %v, %v", currentCPUUsage, currentMemUsage)
 			cpuAllocatable, found := node.Status.Allocatable["cpu"]
 			framework.ExpectEqual(found, true)
-			milliCPU := cpuAllocatable.MilliValue() * 40 / 100
+			milliCPU := cpuAllocatable.MilliValue()
+			milliCPU = int64(float64(milliCPU-currentCPUUsage) * float64(0.6))
 			memAllocatable, found := node.Status.Allocatable["memory"]
 			framework.ExpectEqual(found, true)
-			memory := memAllocatable.Value() * 60 / 100
+			memory := memAllocatable.Value()
+			memory = int64(float64(memory-currentMemUsage) * float64(0.6))
 			podRes = v1.ResourceList{}
+			// If a node is already heavily utilized let not's create a pod there.
+			if milliCPU <= 0 || memory <= 0 {
+				framework.Logf("Node is heavily utilized, let's not create a pod there")
+				continue
+			}
 			podRes[v1.ResourceCPU] = *resource.NewMilliQuantity(int64(milliCPU), resource.DecimalSI)
 			podRes[v1.ResourceMemory] = *resource.NewQuantity(int64(memory), resource.BinarySI)
 
 			// make the first pod low priority and the rest medium priority.
 			priorityName := mediumPriorityClassName
-			if i == 0 {
+			if len(pods) == 0 {
 				priorityName = lowPriorityClassName
 			}
 			pods[i] = createPausePod(f, pausePodConfig{
@@ -186,15 +218,22 @@ var _ = SIGDescribe("SchedulerPreemption [Serial]", func() {
 				Resources: &v1.ResourceRequirements{
 					Requests: podRes,
 				},
+				NodeName: node.Name,
 			})
 			framework.Logf("Created pod: %v", pods[i].Name)
 		}
+		if len(pods) < 2 {
+			framework.Skipf("We need at least two pods to be created but" +
+				"all nodes are already heavily utilized, so preemption tests cannot be run")
+		}
 		ginkgo.By("Wait for pods to be scheduled.")
 		for _, pod := range pods {
 			framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(cs, pod))
 		}
 
-		ginkgo.By("Run a critical pod that use 60% of a node resources.")
+		// We want this pod to be preempted
+		podRes = pods[0].Spec.Containers[0].Resources.Requests
+		ginkgo.By("Run a critical pod that use same resources as that of a lower priority pod")
 		// Create a critical pod and make sure it is scheduled.
 		defer func() {
 			// Clean-up the critical pod
@@ -211,18 +250,25 @@ var _ = SIGDescribe("SchedulerPreemption [Serial]", func() {
 			Resources: &v1.ResourceRequirements{
 				Requests: podRes,
 			},
+			NodeName: pods[0].Spec.NodeName,
 		})
+
+		defer func() {
+			// Clean-up the critical pod
+			err := f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Delete("critical-pod", metav1.NewDeleteOptions(0))
+			framework.ExpectNoError(err)
+		}()
 		// Make sure that the lowest priority pod is deleted.
 		preemptedPod, err := cs.CoreV1().Pods(pods[0].Namespace).Get(pods[0].Name, metav1.GetOptions{})
-		podDeleted := (err != nil && apierrors.IsNotFound(err)) ||
+		podPreempted := (err != nil && apierrors.IsNotFound(err)) ||
 			(err == nil && preemptedPod.DeletionTimestamp != nil)
-		framework.ExpectEqual(podDeleted, true)
-		// Other pods (mid priority ones) should be present.
 		for i := 1; i < len(pods); i++ {
 			livePod, err := cs.CoreV1().Pods(pods[i].Namespace).Get(pods[i].Name, metav1.GetOptions{})
 			framework.ExpectNoError(err)
 			gomega.Expect(livePod.DeletionTimestamp).To(gomega.BeNil())
 		}
+
+		framework.ExpectEqual(podPreempted, true)
 	})
 })
 
@@ -516,3 +562,17 @@ func waitForPreemptingWithTimeout(f *framework.Framework, pod *v1.Pod, timeout t
 	})
 	framework.ExpectNoError(err, "pod %v/%v failed to preempt other pods", pod.Namespace, pod.Name)
 }
+
+func getCurrentPodUsageOnTheNode(nodeName string, pods []v1.Pod, resource *v1.ResourceRequirements) (int64, int64) {
+	totalRequestedCPUResource := resource.Requests.Cpu().MilliValue()
+	totalRequestedMemResource := resource.Requests.Memory().Value()
+	for _, pod := range pods {
+		if pod.Spec.NodeName != nodeName || v1qos.GetPodQOS(&pod) == v1.PodQOSBestEffort {
+			continue
+		}
+		result := getNonZeroRequests(&pod)
+		totalRequestedCPUResource += result.MilliCPU
+		totalRequestedMemResource += result.Memory
+	}
+	return totalRequestedCPUResource, totalRequestedMemResource
+}
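The new getCurrentPodUsageOnTheNode helper in the final hunk seeds its totals with the test's own per-pod request and then adds the non-zero requests of every non-BestEffort pod already bound to the node. A rough, self-contained sketch of that per-node accounting using only core/v1 types (the function name is illustrative; the real helper relies on the test's getNonZeroRequests, which also handles containers that declare no explicit requests):

	package scheduling

	import v1 "k8s.io/api/core/v1"

	// requestedOnNode sums the CPU (in millicores) and memory (in bytes)
	// requested by all pods currently bound to the given node.
	func requestedOnNode(nodeName string, pods []v1.Pod) (milliCPU, memBytes int64) {
		for _, pod := range pods {
			if pod.Spec.NodeName != nodeName {
				continue
			}
			for _, c := range pod.Spec.Containers {
				milliCPU += c.Resources.Requests.Cpu().MilliValue()
				memBytes += c.Resources.Requests.Memory().Value()
			}
		}
		return milliCPU, memBytes
	}

The result feeds directly into the 60%-of-remaining calculation shown in the earlier hunks.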