Merge pull request #44092 from MaciekPytel/ca_overscaleup_e2e

Automatic merge from submit-queue

Cluster-Autoscaler e2e to catch unnecessary scale-ups

This e2e catches the bug in 0.5.0 CA with reasonable probability.

```release-note
```
@@ -48,12 +48,13 @@ import (
 )
 
 const (
-	defaultTimeout      = 3 * time.Minute
-	resizeTimeout       = 5 * time.Minute
-	scaleUpTimeout      = 5 * time.Minute
-	scaleDownTimeout    = 15 * time.Minute
-	podTimeout          = 2 * time.Minute
-	nodesRecoverTimeout = 5 * time.Minute
+	defaultTimeout        = 3 * time.Minute
+	resizeTimeout         = 5 * time.Minute
+	scaleUpTimeout        = 5 * time.Minute
+	scaleUpTriggerTimeout = 2 * time.Minute
+	scaleDownTimeout      = 15 * time.Minute
+	podTimeout            = 2 * time.Minute
+	nodesRecoverTimeout   = 5 * time.Minute
 
 	gkeEndpoint      = "https://test-container.sandbox.googleapis.com"
 	gkeUpdateTimeout = 15 * time.Minute
@@ -61,6 +62,9 @@ const (
 	disabledTaint             = "DisabledForAutoscalingTest"
 	newNodesForScaledownTests = 2
 	unhealthyClusterThreshold = 4
+
+	caNoScaleUpStatus      = "NoActivity"
+	caOngoingScaleUpStatus = "InProgress"
 )
 
 var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
@@ -117,7 +121,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
 
 	It("shouldn't increase cluster size if pending pod is too large [Feature:ClusterSizeAutoscalingScaleUp]", func() {
 		By("Creating unschedulable pod")
-		ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memCapacityMb)), false)
+		ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memCapacityMb)), false, defaultTimeout)
 		defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
 
 		By("Waiting for scale up hoping it won't happen")
@@ -144,7 +148,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
 	})
 
 	It("should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]", func() {
-		ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false)
+		ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false, defaultTimeout)
 		defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
 
 		// Verify, that cluster size is increased
@@ -153,6 +157,31 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
 		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
 	})
 
+	It("shouldn't trigger additional scale-ups during processing scale-up [Feature:ClusterSizeAutoscalingScaleUp]", func() {
+		status, err := getScaleUpStatus(c)
+		framework.ExpectNoError(err)
+		unmanagedNodes := nodeCount - status.ready
+
+		By("Schedule more pods than can fit and wait for cluster to scale-up")
+		ReserveMemory(f, "memory-reservation", 100, (nodeCount+2)*memCapacityMb, false, 1*time.Second)
+		defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
+
+		status, err = waitForScaleUpStatus(c, caOngoingScaleUpStatus, scaleUpTriggerTimeout)
+		framework.ExpectNoError(err)
+		target := status.target
+		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
+
+		By("Expect no more scale-up to be happening after all pods are scheduled")
+		status, err = getScaleUpStatus(c)
+		framework.ExpectNoError(err)
+		if status.target != target {
+			glog.Warningf("Final number of nodes (%v) does not match initial scale-up target (%v).", status.target, target)
+		}
+		Expect(status.status).Should(Equal(caNoScaleUpStatus))
+		Expect(status.ready).Should(Equal(status.target))
+		Expect(len(framework.GetReadySchedulableNodesOrDie(f.ClientSet).Items)).Should(Equal(status.target + unmanagedNodes))
+	})
+
 	It("should increase cluster size if pending pods are small and there is another node pool that is not autoscaled [Feature:ClusterSizeAutoscalingScaleUp]", func() {
 		framework.SkipUnlessProviderIs("gke")
 
@@ -163,7 +192,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
 		framework.ExpectNoError(framework.WaitForClusterSize(c, nodeCount+1, resizeTimeout))
 		glog.Infof("Not enabling cluster autoscaler for the node pool (on purpose).")
 
-		ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false)
+		ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false, defaultTimeout)
 		defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
 
 		// Verify, that cluster size is increased
@@ -295,7 +324,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
 		framework.ExpectNoError(enableAutoscaler(extraPoolName, 1, 2))
 
 		By("Creating rc with 2 pods too big to fit default-pool but fitting extra-pool")
-		ReserveMemory(f, "memory-reservation", 2, int(2.1*float64(memCapacityMb)), false)
+		ReserveMemory(f, "memory-reservation", 2, int(2.1*float64(memCapacityMb)), false, defaultTimeout)
 		defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
 
 		// Apparently GKE master is restarted couple minutes after the node pool is added
@@ -383,7 +412,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
 				nodesToBreak = nodesToBreak[1:]
 				framework.TestUnderTemporaryNetworkFailure(c, "default", ntb, testFunction)
 			} else {
-				ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false)
+				ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false, defaultTimeout)
 				defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
 				time.Sleep(scaleUpTimeout)
 				currentNodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
@@ -622,7 +651,7 @@ func CreateNodeSelectorPods(f *framework.Framework, id string, replicas int, nod
 	}
 }
 
-func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, expectRunning bool) {
+func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration) {
 	By(fmt.Sprintf("Running RC which reserves %v MB of memory", megabytes))
 	request := int64(1024 * 1024 * megabytes / replicas)
 	config := &testutils.RCConfig{
@@ -630,7 +659,7 @@ func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, e
 		InternalClient: f.InternalClientset,
 		Name:           id,
 		Namespace:      f.Namespace.Name,
-		Timeout:        defaultTimeout,
+		Timeout:        timeout,
 		Image:          framework.GetPauseImageName(f.ClientSet),
 		Replicas:       replicas,
 		MemRequest:     request,
@@ -896,3 +925,65 @@ func getClusterwideStatus(c clientset.Interface) (string, error) {
 	}
 	return result[1], nil
 }
+
+type scaleUpStatus struct {
+	status string
+	ready  int
+	target int
+}
+
+// Try to get scaleup statuses of all node groups.
+// Status configmap is not parsing-friendly, so evil regexpery follows.
+func getScaleUpStatus(c clientset.Interface) (*scaleUpStatus, error) {
+	configMap, err := c.CoreV1().ConfigMaps("kube-system").Get("cluster-autoscaler-status", metav1.GetOptions{})
+	if err != nil {
+		return nil, err
+	}
+	status, ok := configMap.Data["status"]
+	if !ok {
+		return nil, fmt.Errorf("Status information not found in configmap")
+	}
+	matcher, err := regexp.Compile("s*ScaleUp:\\s*([A-Za-z]+)\\s*\\(ready=([0-9]+)\\s*cloudProviderTarget=([0-9]+)\\s*\\)")
+	if err != nil {
+		return nil, err
+	}
+	matches := matcher.FindAllStringSubmatch(status, -1)
+	if len(matches) < 1 {
+		return nil, fmt.Errorf("Failed to parse CA status configmap")
+	}
+	result := scaleUpStatus{
+		status: caNoScaleUpStatus,
+		ready:  0,
+		target: 0,
+	}
+	for _, match := range matches {
+		if match[1] == caOngoingScaleUpStatus {
+			result.status = caOngoingScaleUpStatus
+		}
+		newReady, err := strconv.Atoi(match[2])
+		if err != nil {
+			return nil, err
+		}
+		result.ready += newReady
+		newTarget, err := strconv.Atoi(match[3])
+		if err != nil {
+			return nil, err
+		}
+		result.target += newTarget
+	}
+	glog.Infof("Cluster-Autoscaler scale-up status: %v (%v, %v)", result.status, result.ready, result.target)
+	return &result, nil
+}
+
+func waitForScaleUpStatus(c clientset.Interface, expected string, timeout time.Duration) (*scaleUpStatus, error) {
+	for start := time.Now(); time.Since(start) < timeout; time.Sleep(5 * time.Second) {
+		status, err := getScaleUpStatus(c)
+		if err != nil {
+			return nil, err
+		}
+		if status.status == expected {
+			return status, nil
+		}
+	}
+	return nil, fmt.Errorf("ScaleUp status did not reach expected value: %v", expected)
+}
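For context on what `getScaleUpStatus` is matching, here is a minimal standalone sketch of the same aggregation applied to a hypothetical excerpt of the `cluster-autoscaler-status` ConfigMap. The sample text and group names are assumptions shaped only by the regexp in the diff above, not a verbatim copy of the real ConfigMap, which carries additional fields (health, timestamps, per-group details).

```go
// Standalone sketch: mirrors the per-node-group summation done by getScaleUpStatus.
// The status excerpt below is an assumed example, not real cluster-autoscaler output.
package main

import (
	"fmt"
	"regexp"
	"strconv"
)

func main() {
	// Hypothetical excerpt of configMap.Data["status"] covering two node groups.
	status := `
  Name: group-a
  ScaleUp: InProgress (ready=3 cloudProviderTarget=5)
  Name: group-b
  ScaleUp: NoActivity (ready=2 cloudProviderTarget=2)
`
	matcher := regexp.MustCompile(`s*ScaleUp:\s*([A-Za-z]+)\s*\(ready=([0-9]+)\s*cloudProviderTarget=([0-9]+)\s*\)`)

	overall, ready, target := "NoActivity", 0, 0
	for _, m := range matcher.FindAllStringSubmatch(status, -1) {
		// Any group reporting InProgress marks the whole cluster as scaling up.
		if m[1] == "InProgress" {
			overall = "InProgress"
		}
		r, _ := strconv.Atoi(m[2])
		t, _ := strconv.Atoi(m[3])
		ready += r
		target += t
	}
	fmt.Printf("status=%s ready=%d target=%d\n", overall, ready, target)
	// Prints: status=InProgress ready=5 target=7
}
```

Under these assumptions the summed ready/target counts are what the new test compares against the number of ready schedulable nodes once all pods are running.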