mirror of https://github.com/optim-enterprises-bv/kubernetes.git (synced 2025-11-04 04:08:16 +00:00)
Remove remnants of broken stuff - nvidia/autoscaling
Signed-off-by: Davanum Srinivas <davanum@gmail.com>
@@ -1,57 +0,0 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-gpu-device-plugin
  namespace: kube-system
  labels:
    k8s-app: nvidia-gpu-device-plugin
    addonmanager.kubernetes.io/mode: EnsureExists
spec:
  selector:
    matchLabels:
      k8s-app: nvidia-gpu-device-plugin
  template:
    metadata:
      labels:
        k8s-app: nvidia-gpu-device-plugin
    spec:
      priorityClassName: system-node-critical
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: cloud.google.com/gke-accelerator
                operator: Exists
      tolerations:
      - operator: "Exists"
        effect: "NoExecute"
      - operator: "Exists"
        effect: "NoSchedule"
      volumes:
      - name: device-plugin
        hostPath:
          path: /var/lib/kubelet/device-plugins
      - name: dev
        hostPath:
          path: /dev
      containers:
      - image: "registry.k8s.io/nvidia-gpu-device-plugin@sha256:4b036e8844920336fa48f36edeb7d4398f426d6a934ba022848deed2edbf09aa"
        command: ["/usr/bin/nvidia-gpu-device-plugin", "-logtostderr"]
        name: nvidia-gpu-device-plugin
        resources:
          requests:
            cpu: 50m
            memory: 10Mi
          limits:
            cpu: 50m
            memory: 10Mi
        securityContext:
          privileged: true
        volumeMounts:
        - name: device-plugin
          mountPath: /device-plugin
        - name: dev
          mountPath: /dev
  updateStrategy:
    type: RollingUpdate
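The DaemonSet above only lands on nodes carrying the cloud.google.com/gke-accelerator label (its nodeAffinity term uses operator: Exists). A minimal client-go sketch that lists the nodes such a term would select; the helper, its kubeconfig handling, and the output format are illustrative assumptions, not part of the removed addon:

package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Assumes a local kubeconfig at the default path; adjust as needed.
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	client, err := kubernetes.NewForConfig(config)
	if err != nil {
		panic(err)
	}
	// A bare label key in a selector means "label exists", mirroring the
	// DaemonSet's requiredDuringSchedulingIgnoredDuringExecution term.
	nodes, err := client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{
		LabelSelector: "cloud.google.com/gke-accelerator",
	})
	if err != nil {
		panic(err)
	}
	for _, n := range nodes.Items {
		fmt.Println(n.Name)
	}
}
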
@@ -2943,9 +2943,6 @@ EOF
    sed -i -e "s@{{ metrics_server_memory_per_node }}@${metrics_server_memory_per_node}@g" "${metrics_server_yaml}"
    sed -i -e "s@{{ metrics_server_min_cluster_size }}@${metrics_server_min_cluster_size}@g" "${metrics_server_yaml}"
  fi
  if [[ "${ENABLE_NVIDIA_GPU_DEVICE_PLUGIN:-}" == "true" ]]; then
    setup-addon-manifests "addons" "device-plugins/nvidia-gpu"
  fi
  # Setting up the konnectivity-agent daemonset
  if [[ "${RUN_KONNECTIVITY_PODS:-false}" == "true" ]]; then
    setup-addon-manifests "addons" "konnectivity-agent"
@@ -1512,11 +1512,6 @@ EOF
    if [ -n "${CLUSTER_SIGNING_DURATION:-}" ]; then
      cat >>"$file" <<EOF
CLUSTER_SIGNING_DURATION: $(yaml-quote "${CLUSTER_SIGNING_DURATION}")
EOF
    fi
    if [[ "${NODE_ACCELERATORS:-}" == *"type=nvidia"* ]]; then
      cat >>"$file" <<EOF
ENABLE_NVIDIA_GPU_DEVICE_PLUGIN: $(yaml-quote "true")
EOF
    fi
    if [ -n "${ADDON_MANAGER_LEADER_ELECTION:-}" ]; then
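The second hunk shows the only trigger that ever set the flag: the GCE scripts wrote ENABLE_NVIDIA_GPU_DEVICE_PLUGIN into kube-env when NODE_ACCELERATORS contained the substring type=nvidia, and the first hunk consumed that flag to install the addon manifest. A small Go sketch of the same substring check, with a hypothetical accelerator spec:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical value; in the removed scripts this came from the
	// cluster configuration's NODE_ACCELERATORS setting.
	nodeAccelerators := "type=nvidia-tesla-t4,count=1"
	// Mirrors the bash test [[ "${NODE_ACCELERATORS}" == *"type=nvidia"* ]].
	enableNvidiaGPUDevicePlugin := strings.Contains(nodeAccelerators, "type=nvidia")
	fmt.Println("ENABLE_NVIDIA_GPU_DEVICE_PLUGIN:", enableNvidiaGPUDevicePlugin)
}
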
@@ -1,125 +0,0 @@
/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package autoscaling

import (
	"context"
	"strings"
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/kubernetes/test/e2e/feature"
	"k8s.io/kubernetes/test/e2e/framework"
	e2eautoscaling "k8s.io/kubernetes/test/e2e/framework/autoscaling"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	admissionapi "k8s.io/pod-security-admission/api"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	"github.com/onsi/gomega/gmeasure"
)

var _ = SIGDescribe(feature.ClusterSizeAutoscalingScaleUp, framework.WithSlow(), "Autoscaling", func() {
	f := framework.NewDefaultFramework("autoscaling")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
	var experiment *gmeasure.Experiment

	ginkgo.Describe("Autoscaling a service", func() {
		ginkgo.BeforeEach(func(ctx context.Context) {
			// Check if Cloud Autoscaler is enabled by trying to get its ConfigMap.
			_, err := f.ClientSet.CoreV1().ConfigMaps("kube-system").Get(ctx, "cluster-autoscaler-status", metav1.GetOptions{})
			if err != nil {
				e2eskipper.Skipf("test expects Cluster Autoscaler to be enabled")
			}
			experiment = gmeasure.NewExperiment("Autoscaling a service")
			ginkgo.AddReportEntry(experiment.Name, experiment)
		})

		ginkgo.Context("from 1 pod and 3 nodes to 8 pods and >=4 nodes", func() {
			const nodesNum = 3       // Expect there to be 3 nodes before and after the test.
			var nodeGroupName string // Set by BeforeEach, used by AfterEach to scale this node group down after the test.
			var nodes *v1.NodeList   // Set by BeforeEach, used by Measure to calculate CPU request based on node's sizes.

			ginkgo.BeforeEach(func(ctx context.Context) {
				// Make sure there is only 1 node group, otherwise this test becomes useless.
				nodeGroups := strings.Split(framework.TestContext.CloudConfig.NodeInstanceGroup, ",")
				if len(nodeGroups) != 1 {
					e2eskipper.Skipf("test expects 1 node group, found %d", len(nodeGroups))
				}
				nodeGroupName = nodeGroups[0]

				// Make sure the node group has exactly 'nodesNum' nodes, otherwise this test becomes useless.
				nodeGroupSize, err := framework.GroupSize(nodeGroupName)
				framework.ExpectNoError(err)
				if nodeGroupSize != nodesNum {
					e2eskipper.Skipf("test expects %d nodes, found %d", nodesNum, nodeGroupSize)
				}

				// Make sure all nodes are schedulable, otherwise we are in some kind of a problem state.
				nodes, err = e2enode.GetReadySchedulableNodes(ctx, f.ClientSet)
				framework.ExpectNoError(err)
				gomega.Expect(nodes.Items).To(gomega.HaveLen(nodeGroupSize), "not all nodes are schedulable")
			})

			ginkgo.AfterEach(func(ctx context.Context) {
				// Attempt cleanup only if a node group was targeted for scale up.
				// Otherwise the test was probably skipped and we'll get a gcloud error due to invalid parameters.
				if len(nodeGroupName) > 0 {
					// Scale down back to only 'nodesNum' nodes, as expected at the start of the test.
					framework.ExpectNoError(framework.ResizeGroup(nodeGroupName, nodesNum))
					framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, f.ClientSet, nodesNum, 15*time.Minute))
				}
			})

			ginkgo.It("takes less than 15 minutes", func(ctx context.Context) {
				// Measured over multiple samples, scaling takes 10 +/- 2 minutes, so 15 minutes should be fully sufficient.
				const timeToWait = 15 * time.Minute

				// Calculate the CPU request of the service.
				// This test expects that 8 pods will not fit in 'nodesNum' nodes, but will fit in >='nodesNum'+1 nodes.
				// Make it so that 'nodesNum' pods fit perfectly per node.
				nodeCpus := nodes.Items[0].Status.Allocatable[v1.ResourceCPU]
				nodeCPUMillis := (&nodeCpus).MilliValue()
				cpuRequestMillis := int64(nodeCPUMillis / nodesNum)

				// Start the service we want to scale and wait for it to be up and running.
				nodeMemoryBytes := nodes.Items[0].Status.Allocatable[v1.ResourceMemory]
				nodeMemoryMB := (&nodeMemoryBytes).Value() / 1024 / 1024
				memRequestMB := nodeMemoryMB / 10 // Ensure each pod takes not more than 10% of node's allocatable memory.
				replicas := 1
				resourceConsumer := e2eautoscaling.NewDynamicResourceConsumer(ctx, "resource-consumer", f.Namespace.Name, e2eautoscaling.KindDeployment, replicas, 0, 0, 0, cpuRequestMillis, memRequestMB, f.ClientSet, f.ScalesGetter, e2eautoscaling.Disable, e2eautoscaling.Idle)
				ginkgo.DeferCleanup(resourceConsumer.CleanUp)
				resourceConsumer.WaitForReplicas(ctx, replicas, 1*time.Minute) // Should finish ~immediately, so 1 minute is more than enough.

				// Enable Horizontal Pod Autoscaler with 50% target utilization and
				// scale up the CPU usage to trigger autoscaling to 8 pods for target to be satisfied.
				targetCPUUtilizationPercent := int32(50)
				hpa := e2eautoscaling.CreateCPUResourceHorizontalPodAutoscaler(ctx, resourceConsumer, targetCPUUtilizationPercent, 1, 10)
				ginkgo.DeferCleanup(e2eautoscaling.DeleteHorizontalPodAutoscaler, resourceConsumer, hpa.Name)
				cpuLoad := 8 * cpuRequestMillis * int64(targetCPUUtilizationPercent) / 100 // 8 pods utilized to the target level
				resourceConsumer.ConsumeCPU(int(cpuLoad))

				// Measure the time it takes for the service to scale to 8 pods with 50% CPU utilization each.
				experiment.SampleDuration("total scale-up time", func(idx int) {
					resourceConsumer.WaitForReplicas(ctx, 8, timeToWait)
				}, gmeasure.SamplingConfig{N: 1}) // Increase to run the test more than once.
			})
		})
	})
})
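For context on the arithmetic in the deleted test: each consumer pod requests one third of a node's allocatable CPU, and the load fed to the consumer is sized so that 8 replicas each sit at the 50% HPA target. A minimal sketch with a hypothetical 4000-millicore node (the real test reads the value from the first node's allocatable CPU):

package main

import "fmt"

func main() {
	const (
		nodesNum       = 3    // nodes expected before the test
		nodeCPUMillis  = 4000 // hypothetical allocatable millicores per node
		targetPercent  = 50   // HPA target utilization
		targetReplicas = 8    // replicas the test waits for
	)
	// Per-pod CPU request: a third of a node, as in the deleted test.
	cpuRequestMillis := nodeCPUMillis / nodesNum
	// Total CPU the consumer must burn so that 8 replicas each run at ~50%
	// of their request, which is what makes the HPA settle at 8 replicas.
	cpuLoad := targetReplicas * cpuRequestMillis * targetPercent / 100
	fmt.Printf("per-pod request: %dm, load to consume: %dm\n", cpuRequestMillis, cpuLoad)
	// Because kube-system pods already take a slice of each node's allocatable
	// CPU, fewer than three such pods typically fit per node, which is why the
	// test expects 8 replicas to force a fourth node.
}
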
@@ -1,531 +0,0 @@
/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package autoscaling

import (
	"context"
	"encoding/json"
	"fmt"
	"math"
	"strings"
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/strategicpatch"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/test/e2e/feature"
	"k8s.io/kubernetes/test/e2e/framework"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2erc "k8s.io/kubernetes/test/e2e/framework/rc"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	testutils "k8s.io/kubernetes/test/utils"
	imageutils "k8s.io/kubernetes/test/utils/image"
	admissionapi "k8s.io/pod-security-admission/api"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
)

const (
	memoryReservationTimeout = 5 * time.Minute
	largeResizeTimeout       = 8 * time.Minute
	largeScaleUpTimeout      = 10 * time.Minute
	maxNodes                 = 1000
)

type clusterPredicates struct {
	nodes int
}

type scaleUpTestConfig struct {
	initialNodes   int
	initialPods    int
	extraPods      *testutils.RCConfig
	expectedResult *clusterPredicates
}

var _ = SIGDescribe("Cluster size autoscaler scalability", framework.WithSlow(), func() {
	f := framework.NewDefaultFramework("autoscaling")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
	var c clientset.Interface
	var nodeCount int
	var coresPerNode int
	var memCapacityMb int
	var originalSizes map[string]int
	var sum int

	ginkgo.BeforeEach(func(ctx context.Context) {
		e2eskipper.SkipUnlessProviderIs("gce", "gke", "kubemark")

		// Check if Cloud Autoscaler is enabled by trying to get its ConfigMap.
		_, err := f.ClientSet.CoreV1().ConfigMaps("kube-system").Get(ctx, "cluster-autoscaler-status", metav1.GetOptions{})
		if err != nil {
			e2eskipper.Skipf("test expects Cluster Autoscaler to be enabled")
		}

		c = f.ClientSet
		if originalSizes == nil {
			originalSizes = make(map[string]int)
			sum = 0
			for _, mig := range strings.Split(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") {
				size, err := framework.GroupSize(mig)
				framework.ExpectNoError(err)
				ginkgo.By(fmt.Sprintf("Initial size of %s: %d", mig, size))
				originalSizes[mig] = size
				sum += size
			}
		}

		framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, c, sum, scaleUpTimeout))

		nodes, err := e2enode.GetReadySchedulableNodes(ctx, f.ClientSet)
		framework.ExpectNoError(err)
		nodeCount = len(nodes.Items)
		cpu := nodes.Items[0].Status.Capacity[v1.ResourceCPU]
		mem := nodes.Items[0].Status.Capacity[v1.ResourceMemory]
		coresPerNode = int((&cpu).MilliValue() / 1000)
		memCapacityMb = int((&mem).Value() / 1024 / 1024)

		gomega.Expect(nodeCount).To(gomega.Equal(sum))

		if framework.ProviderIs("gke") {
			val, err := isAutoscalerEnabled(3)
			framework.ExpectNoError(err)
			if !val {
				err = enableAutoscaler("default-pool", 3, 5)
				framework.ExpectNoError(err)
			}
		}
	})

	ginkgo.AfterEach(func(ctx context.Context) {
		ginkgo.By(fmt.Sprintf("Restoring initial size of the cluster"))
		setMigSizes(originalSizes)
		framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, c, nodeCount, scaleDownTimeout))
		nodes, err := c.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		framework.ExpectNoError(err)
		s := time.Now()
	makeSchedulableLoop:
		for start := time.Now(); time.Since(start) < makeSchedulableTimeout; time.Sleep(makeSchedulableDelay) {
			for _, n := range nodes.Items {
				err = makeNodeSchedulable(ctx, c, &n, true)
				switch err.(type) {
				case CriticalAddonsOnlyError:
					continue makeSchedulableLoop
				default:
					framework.ExpectNoError(err)
				}
			}
			break
		}
		klog.Infof("Made nodes schedulable again in %v", time.Since(s).String())
	})

	f.It("should scale up at all", feature.ClusterAutoscalerScalability1, func(ctx context.Context) {
		perNodeReservation := int(float64(memCapacityMb) * 0.95)
		replicasPerNode := 10

		additionalNodes := maxNodes - nodeCount
		replicas := additionalNodes * replicasPerNode
		additionalReservation := additionalNodes * perNodeReservation

		// saturate cluster
		reservationCleanup := ReserveMemory(ctx, f, "some-pod", nodeCount*2, nodeCount*perNodeReservation, true, memoryReservationTimeout)
		defer reservationCleanup()
		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c))

		// configure pending pods & expected scale up
		rcConfig := reserveMemoryRCConfig(f, "extra-pod-1", replicas, additionalReservation, largeScaleUpTimeout)
		expectedResult := createClusterPredicates(nodeCount + additionalNodes)
		config := createScaleUpTestConfig(nodeCount, nodeCount, rcConfig, expectedResult)

		// run test
		testCleanup := simpleScaleUpTest(ctx, f, config)
		defer testCleanup()
	})

	f.It("should scale up twice", feature.ClusterAutoscalerScalability2, func(ctx context.Context) {
		perNodeReservation := int(float64(memCapacityMb) * 0.95)
		replicasPerNode := 10
		additionalNodes1 := int(math.Ceil(0.7 * maxNodes))
		additionalNodes2 := int(math.Ceil(0.25 * maxNodes))
		if additionalNodes1+additionalNodes2 > maxNodes {
			additionalNodes2 = maxNodes - additionalNodes1
		}

		replicas1 := additionalNodes1 * replicasPerNode
		replicas2 := additionalNodes2 * replicasPerNode

		klog.Infof("cores per node: %v", coresPerNode)

		// saturate cluster
		initialReplicas := nodeCount
		reservationCleanup := ReserveMemory(ctx, f, "some-pod", initialReplicas, nodeCount*perNodeReservation, true, memoryReservationTimeout)
		defer reservationCleanup()
		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c))

		klog.Infof("Reserved successfully")

		// configure pending pods & expected scale up #1
		rcConfig := reserveMemoryRCConfig(f, "extra-pod-1", replicas1, additionalNodes1*perNodeReservation, largeScaleUpTimeout)
		expectedResult := createClusterPredicates(nodeCount + additionalNodes1)
		config := createScaleUpTestConfig(nodeCount, nodeCount, rcConfig, expectedResult)

		// run test #1
		tolerateUnreadyNodes := additionalNodes1 / 20
		tolerateUnreadyPods := (initialReplicas + replicas1) / 20
		testCleanup1 := simpleScaleUpTestWithTolerance(ctx, f, config, tolerateUnreadyNodes, tolerateUnreadyPods)
		defer testCleanup1()

		klog.Infof("Scaled up once")

		// configure pending pods & expected scale up #2
		rcConfig2 := reserveMemoryRCConfig(f, "extra-pod-2", replicas2, additionalNodes2*perNodeReservation, largeScaleUpTimeout)
		expectedResult2 := createClusterPredicates(nodeCount + additionalNodes1 + additionalNodes2)
		config2 := createScaleUpTestConfig(nodeCount+additionalNodes1, nodeCount+additionalNodes2, rcConfig2, expectedResult2)

		// run test #2
		tolerateUnreadyNodes = maxNodes / 20
		tolerateUnreadyPods = (initialReplicas + replicas1 + replicas2) / 20
		testCleanup2 := simpleScaleUpTestWithTolerance(ctx, f, config2, tolerateUnreadyNodes, tolerateUnreadyPods)
		defer testCleanup2()

		klog.Infof("Scaled up twice")
	})

	f.It("should scale down empty nodes", feature.ClusterAutoscalerScalability3, func(ctx context.Context) {
		perNodeReservation := int(float64(memCapacityMb) * 0.7)
		replicas := int(math.Ceil(maxNodes * 0.7))
		totalNodes := maxNodes

		// resize cluster to totalNodes
		newSizes := map[string]int{
			anyKey(originalSizes): totalNodes,
		}
		setMigSizes(newSizes)
		framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, f.ClientSet, totalNodes, largeResizeTimeout))

		// run replicas
		rcConfig := reserveMemoryRCConfig(f, "some-pod", replicas, replicas*perNodeReservation, largeScaleUpTimeout)
		expectedResult := createClusterPredicates(totalNodes)
		config := createScaleUpTestConfig(totalNodes, totalNodes, rcConfig, expectedResult)
		tolerateUnreadyNodes := totalNodes / 10
		tolerateUnreadyPods := replicas / 10
		testCleanup := simpleScaleUpTestWithTolerance(ctx, f, config, tolerateUnreadyNodes, tolerateUnreadyPods)
		defer testCleanup()

		// check if empty nodes are scaled down
		framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet,
			func(size int) bool {
				return size <= replicas+3 // leaving space for non-evictable kube-system pods
			}, scaleDownTimeout))
	})

	f.It("should scale down underutilized nodes", feature.ClusterAutoscalerScalability4, func(ctx context.Context) {
		perPodReservation := int(float64(memCapacityMb) * 0.01)
		// underutilizedNodes are 10% full
		underutilizedPerNodeReplicas := 10
		// fullNodes are 70% full
		fullPerNodeReplicas := 70
		totalNodes := maxNodes
		underutilizedRatio := 0.3
		maxDelta := 30

		// resize cluster to totalNodes
		newSizes := map[string]int{
			anyKey(originalSizes): totalNodes,
		}
		setMigSizes(newSizes)

		framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, f.ClientSet, totalNodes, largeResizeTimeout))

		// annotate all nodes with no-scale-down
		ScaleDownDisabledKey := "cluster-autoscaler.kubernetes.io/scale-down-disabled"

		nodes, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{
			FieldSelector: fields.Set{
				"spec.unschedulable": "false",
			}.AsSelector().String(),
		})

		framework.ExpectNoError(err)
		framework.ExpectNoError(addAnnotation(ctx, f, nodes.Items, ScaleDownDisabledKey, "true"))

		// distribute pods using replication controllers taking up space that should
		// be empty after pods are distributed
		underutilizedNodesNum := int(float64(maxNodes) * underutilizedRatio)
		fullNodesNum := totalNodes - underutilizedNodesNum

		podDistribution := []podBatch{
			{numNodes: fullNodesNum, podsPerNode: fullPerNodeReplicas},
			{numNodes: underutilizedNodesNum, podsPerNode: underutilizedPerNodeReplicas}}

		distributeLoad(ctx, f, f.Namespace.Name, "10-70", podDistribution, perPodReservation,
			int(0.95*float64(memCapacityMb)), map[string]string{}, largeScaleUpTimeout)

		// enable scale down again
		framework.ExpectNoError(addAnnotation(ctx, f, nodes.Items, ScaleDownDisabledKey, "false"))

		// wait for scale down to start. Node deletion takes a long time, so we just
		// wait for maximum of 30 nodes deleted
		nodesToScaleDownCount := int(float64(totalNodes) * 0.1)
		if nodesToScaleDownCount > maxDelta {
			nodesToScaleDownCount = maxDelta
		}
		expectedSize := totalNodes - nodesToScaleDownCount
		timeout := time.Duration(nodesToScaleDownCount)*time.Minute + scaleDownTimeout
		framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, func(size int) bool {
			return size <= expectedSize
		}, timeout))
	})

	f.It("shouldn't scale down with underutilized nodes due to host port conflicts", feature.ClusterAutoscalerScalability5, func(ctx context.Context) {
		fullReservation := int(float64(memCapacityMb) * 0.9)
		hostPortPodReservation := int(float64(memCapacityMb) * 0.3)
		totalNodes := maxNodes
		reservedPort := 4321

		// resize cluster to totalNodes
		newSizes := map[string]int{
			anyKey(originalSizes): totalNodes,
		}
		setMigSizes(newSizes)
		framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, f.ClientSet, totalNodes, largeResizeTimeout))
		divider := int(float64(totalNodes) * 0.7)
		fullNodesCount := divider
		underutilizedNodesCount := totalNodes - fullNodesCount

		ginkgo.By("Reserving full nodes")
		// run RC1 w/o host port
		cleanup := ReserveMemory(ctx, f, "filling-pod", fullNodesCount, fullNodesCount*fullReservation, true, largeScaleUpTimeout*2)
		defer cleanup()

		ginkgo.By("Reserving host ports on remaining nodes")
		// run RC2 w/ host port
		ginkgo.DeferCleanup(createHostPortPodsWithMemory, f, "underutilizing-host-port-pod", underutilizedNodesCount, reservedPort, underutilizedNodesCount*hostPortPodReservation, largeScaleUpTimeout)

		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c))
		// wait and check scale down doesn't occur
		ginkgo.By(fmt.Sprintf("Sleeping %v minutes...", scaleDownTimeout.Minutes()))
		time.Sleep(scaleDownTimeout)

		ginkgo.By("Checking if the number of nodes is as expected")
		nodes, err := e2enode.GetReadySchedulableNodes(ctx, f.ClientSet)
		framework.ExpectNoError(err)
		klog.Infof("Nodes: %v, expected: %v", len(nodes.Items), totalNodes)
		gomega.Expect(nodes.Items).To(gomega.HaveLen(totalNodes))
	})

	f.It("CA ignores unschedulable pods while scheduling schedulable pods", feature.ClusterAutoscalerScalability6, func(ctx context.Context) {
		// Start a number of pods saturating existing nodes.
		perNodeReservation := int(float64(memCapacityMb) * 0.80)
		replicasPerNode := 10
		initialPodReplicas := nodeCount * replicasPerNode
		initialPodsTotalMemory := nodeCount * perNodeReservation
		reservationCleanup := ReserveMemory(ctx, f, "initial-pod", initialPodReplicas, initialPodsTotalMemory, true /* wait for pods to run */, memoryReservationTimeout)
		ginkgo.DeferCleanup(reservationCleanup)
		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c))

		// Configure a number of unschedulable pods.
		unschedulableMemReservation := memCapacityMb * 2
		unschedulablePodReplicas := 1000
		totalMemReservation := unschedulableMemReservation * unschedulablePodReplicas
		timeToWait := 5 * time.Minute
		podsConfig := reserveMemoryRCConfig(f, "unschedulable-pod", unschedulablePodReplicas, totalMemReservation, timeToWait)
		_ = e2erc.RunRC(ctx, *podsConfig) // Ignore error (it will occur because pods are unschedulable)
		ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, podsConfig.Name)

		// Ensure that no new nodes have been added so far.
		readyNodeCount, _ := e2enode.TotalReady(ctx, f.ClientSet)
		gomega.Expect(readyNodeCount).To(gomega.Equal(nodeCount))

		// Start a number of schedulable pods to ensure CA reacts.
		additionalNodes := maxNodes - nodeCount
		replicas := additionalNodes * replicasPerNode
		totalMemory := additionalNodes * perNodeReservation
		rcConfig := reserveMemoryRCConfig(f, "extra-pod", replicas, totalMemory, largeScaleUpTimeout)
		expectedResult := createClusterPredicates(nodeCount + additionalNodes)
		config := createScaleUpTestConfig(nodeCount, initialPodReplicas, rcConfig, expectedResult)

		// Test that scale up happens, allowing 1000 unschedulable pods not to be scheduled.
		testCleanup := simpleScaleUpTestWithTolerance(ctx, f, config, 0, unschedulablePodReplicas)
		ginkgo.DeferCleanup(testCleanup)
	})

})

func anyKey(input map[string]int) string {
	for k := range input {
		return k
	}
	return ""
}

func simpleScaleUpTestWithTolerance(ctx context.Context, f *framework.Framework, config *scaleUpTestConfig, tolerateMissingNodeCount int, tolerateMissingPodCount int) func() error {
	// resize cluster to start size
	// run rc based on config
	ginkgo.By(fmt.Sprintf("Running RC %v from config", config.extraPods.Name))
	start := time.Now()
	framework.ExpectNoError(e2erc.RunRC(ctx, *config.extraPods))
	// check results
	if tolerateMissingNodeCount > 0 {
		// Tolerate some number of nodes not to be created.
		minExpectedNodeCount := config.expectedResult.nodes - tolerateMissingNodeCount
		framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet,
			func(size int) bool { return size >= minExpectedNodeCount }, scaleUpTimeout))
	} else {
		framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, f.ClientSet, config.expectedResult.nodes, scaleUpTimeout))
	}
	klog.Infof("cluster is increased")
	if tolerateMissingPodCount > 0 {
		framework.ExpectNoError(waitForCaPodsReadyInNamespace(ctx, f, f.ClientSet, tolerateMissingPodCount))
	} else {
		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, f.ClientSet))
	}
	timeTrack(start, fmt.Sprintf("Scale up to %v", config.expectedResult.nodes))
	return func() error {
		return e2erc.DeleteRCAndWaitForGC(ctx, f.ClientSet, f.Namespace.Name, config.extraPods.Name)
	}
}

func simpleScaleUpTest(ctx context.Context, f *framework.Framework, config *scaleUpTestConfig) func() error {
	return simpleScaleUpTestWithTolerance(ctx, f, config, 0, 0)
}

func reserveMemoryRCConfig(f *framework.Framework, id string, replicas, megabytes int, timeout time.Duration) *testutils.RCConfig {
	return &testutils.RCConfig{
		Client:     f.ClientSet,
		Name:       id,
		Namespace:  f.Namespace.Name,
		Timeout:    timeout,
		Image:      imageutils.GetPauseImageName(),
		Replicas:   replicas,
		MemRequest: int64(1024 * 1024 * megabytes / replicas),
	}
}

func createScaleUpTestConfig(nodes, pods int, extraPods *testutils.RCConfig, expectedResult *clusterPredicates) *scaleUpTestConfig {
	return &scaleUpTestConfig{
		initialNodes:   nodes,
		initialPods:    pods,
		extraPods:      extraPods,
		expectedResult: expectedResult,
	}
}

func createClusterPredicates(nodes int) *clusterPredicates {
	return &clusterPredicates{
		nodes: nodes,
	}
}

func addAnnotation(ctx context.Context, f *framework.Framework, nodes []v1.Node, key, value string) error {
	for _, node := range nodes {
		oldData, err := json.Marshal(node)
		if err != nil {
			return err
		}

		if node.Annotations == nil {
			node.Annotations = make(map[string]string)
		}
		node.Annotations[key] = value

		newData, err := json.Marshal(node)
		if err != nil {
			return err
		}

		patchBytes, err := strategicpatch.CreateTwoWayMergePatch(oldData, newData, v1.Node{})
		if err != nil {
			return err
		}

		_, err = f.ClientSet.CoreV1().Nodes().Patch(ctx, string(node.Name), types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{})
		if err != nil {
			return err
		}
	}
	return nil
}

func createHostPortPodsWithMemory(ctx context.Context, f *framework.Framework, id string, replicas, port, megabytes int, timeout time.Duration) func() error {
	ginkgo.By(fmt.Sprintf("Running RC which reserves host port and memory"))
	request := int64(1024 * 1024 * megabytes / replicas)
	config := &testutils.RCConfig{
		Client:     f.ClientSet,
		Name:       id,
		Namespace:  f.Namespace.Name,
		Timeout:    timeout,
		Image:      imageutils.GetPauseImageName(),
		Replicas:   replicas,
		HostPorts:  map[string]int{"port1": port},
		MemRequest: request,
	}
	err := e2erc.RunRC(ctx, *config)
	framework.ExpectNoError(err)
	return func() error {
		return e2erc.DeleteRCAndWaitForGC(ctx, f.ClientSet, f.Namespace.Name, id)
	}
}

type podBatch struct {
	numNodes    int
	podsPerNode int
}

// distributeLoad distributes the pods in the way described by podDistribution,
// assuming all pods will have the same memory reservation and all nodes the same
// memory capacity. This allows us to generate the load on the cluster in the exact
// way that we want.
//
// To achieve this we do the following:
// 1. Create replication controllers that eat up all the space that should be
// empty after setup, making sure they end up on different nodes by specifying
// a conflicting host port
// 2. Create target RC that will generate the load on the cluster
// 3. Remove the rcs created in 1.
func distributeLoad(ctx context.Context, f *framework.Framework, namespace string, id string, podDistribution []podBatch,
	podMemRequestMegabytes int, nodeMemCapacity int, labels map[string]string, timeout time.Duration) {
	port := 8013
	// Create load-distribution RCs with one pod per node, reserving all remaining
	// memory to force the distribution of pods for the target RCs.
	// The load-distribution RCs will be deleted on function return.
	totalPods := 0
	for i, podBatch := range podDistribution {
		totalPods += podBatch.numNodes * podBatch.podsPerNode
		remainingMem := nodeMemCapacity - podBatch.podsPerNode*podMemRequestMegabytes
		replicas := podBatch.numNodes
		cleanup := createHostPortPodsWithMemory(ctx, f, fmt.Sprintf("load-distribution%d", i), replicas, port, remainingMem*replicas, timeout)
		defer cleanup()
	}
	framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, f.ClientSet))
	// Create the target RC
	rcConfig := reserveMemoryRCConfig(f, id, totalPods, totalPods*podMemRequestMegabytes, timeout)
	framework.ExpectNoError(e2erc.RunRC(ctx, *rcConfig))
	framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, f.ClientSet))
	ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, id)
}

func timeTrack(start time.Time, name string) {
	elapsed := time.Since(start)
	klog.Infof("%s took %s", name, elapsed)
}
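The distributeLoad helper above is the core of the scale-down tests: one filler pod per node, pinned there by a conflicting host port, decides how much room the target pods get on that node. A minimal standalone sketch of that arithmetic, with hypothetical numbers (7500 MB usable per node, 75 MB target pods, a batch of 300 nodes meant to end up 10% full):

package main

import "fmt"

func main() {
	const (
		nodeMemCapacityMB = 7500 // hypothetical per-node memory budget
		podMemRequestMB   = 75   // hypothetical target-pod memory request
		podsPerNode       = 10   // this batch should end up 10% full
		numNodes          = 300  // nodes in this batch
	)
	// One filler pod per node (kept apart by a conflicting host port) reserves
	// everything the target pods are not supposed to use on that node.
	remainingMemMB := nodeMemCapacityMB - podsPerNode*podMemRequestMB
	fmt.Printf("filler RC: %d replicas of %d MB each\n", numNodes, remainingMemMB)
	// The target RC then gets exactly podsPerNode pods' worth of room per node.
	fmt.Printf("target pods on this batch: %d (%d MB total)\n",
		numNodes*podsPerNode, numNodes*podsPerNode*podMemRequestMB)
}
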
										
											
File diff suppressed because it is too large
@@ -1,425 +0,0 @@
/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package autoscaling

import (
	"context"
	"fmt"
	"math"
	"strings"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/util/wait"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/kubernetes/test/e2e/framework"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	admissionapi "k8s.io/pod-security-admission/api"

	"github.com/onsi/ginkgo/v2"
)

// This test requires coredns to be installed on the cluster with autoscaling enabled.
// Compare your coredns manifest against the command below
// helm template coredns -n kube-system coredns/coredns --set k8sAppLabelOverride=kube-dns --set fullnameOverride=coredns --set autoscaler.enabled=true

// Constants used in dns-autoscaling test.
const (
	DNSdefaultTimeout    = 5 * time.Minute
	ClusterAddonLabelKey = "k8s-app"
	DNSLabelName         = "kube-dns"
)

var _ = SIGDescribe("DNS horizontal autoscaling", func() {
	f := framework.NewDefaultFramework("dns-autoscaling")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
	var c clientset.Interface
	var previousParams map[string]string
	var configMapNames map[string]string
	var originDNSReplicasCount int
	var DNSParams1 DNSParamsLinear
	var DNSParams2 DNSParamsLinear
	var DNSParams3 DNSParamsLinear

	ginkgo.BeforeEach(func(ctx context.Context) {
		e2eskipper.SkipUnlessProviderIs("gce", "gke")
		c = f.ClientSet

		nodes, err := e2enode.GetReadySchedulableNodes(ctx, c)
		framework.ExpectNoError(err)
		nodeCount := len(nodes.Items)

		ginkgo.By("Collecting original replicas count and DNS scaling params")

		// Check if we are running coredns or kube-dns; the only difference is the name of the autoscaling CM.
		// The test should behave identically on both DNS providers.
		provider, err := detectDNSProvider(ctx, c)
		framework.ExpectNoError(err)

		originDNSReplicasCount, err = getDNSReplicas(ctx, c)
		framework.ExpectNoError(err)
		configMapNames = map[string]string{
			"kube-dns": "kube-dns-autoscaler",
			"coredns":  "coredns-autoscaler",
		}

		pcm, err := fetchDNSScalingConfigMap(ctx, c, configMapNames[provider])
		framework.Logf("original DNS scaling params: %v", pcm)
		framework.ExpectNoError(err)
		previousParams = pcm.Data

		if nodeCount <= 500 {
			DNSParams1 = DNSParamsLinear{
				nodesPerReplica: 1,
			}
			DNSParams2 = DNSParamsLinear{
				nodesPerReplica: 2,
			}
			DNSParams3 = DNSParamsLinear{
				nodesPerReplica: 3,
				coresPerReplica: 3,
			}
		} else {
			// In large clusters, avoid creating/deleting too many DNS pods;
			// this is supposed to be a correctness test, not a performance one.
			// The default setup is: 256 cores/replica, 16 nodes/replica.
			// With nodeCount > 500, nodes/13, nodes/14, nodes/15 and nodes/16
			// are different numbers.
			DNSParams1 = DNSParamsLinear{
				nodesPerReplica: 13,
			}
			DNSParams2 = DNSParamsLinear{
				nodesPerReplica: 14,
			}
			DNSParams3 = DNSParamsLinear{
				nodesPerReplica: 15,
				coresPerReplica: 15,
			}
		}
	})

	// This test is separated because it is slow and needs to run serially.
	// It takes around 5 minutes to run on a 4-node cluster.
	// TODO(upodroid) This test will be removed in 1.33 when kubeup is removed
	// TODO: make it cloud provider agnostic or move it to cloud-provider-gcp repository
	f.It(f.WithSerial(), f.WithSlow(), f.WithLabel("KubeUp"), f.WithLabel("sig-cloud-provider-gcp"), "kube-dns-autoscaler should scale kube-dns pods when cluster size changed", func(ctx context.Context) {
		numNodes, err := e2enode.TotalRegistered(ctx, c)
		framework.ExpectNoError(err)

		configMapNames = map[string]string{
			"kube-dns": "kube-dns-autoscaler",
			"coredns":  "coredns-autoscaler",
		}
		provider, err := detectDNSProvider(ctx, c)
		framework.ExpectNoError(err)

		ginkgo.By("Replace the dns autoscaling parameters with testing parameters")
		err = updateDNSScalingConfigMap(ctx, c, packDNSScalingConfigMap(configMapNames[provider], packLinearParams(&DNSParams1)))
		framework.ExpectNoError(err)
		defer func() {
			ginkgo.By("Restoring initial dns autoscaling parameters")
			err = updateDNSScalingConfigMap(ctx, c, packDNSScalingConfigMap(configMapNames[provider], previousParams))
			framework.ExpectNoError(err)

			ginkgo.By("Wait for number of running and ready kube-dns pods recover")
			label := labels.SelectorFromSet(labels.Set(map[string]string{ClusterAddonLabelKey: DNSLabelName}))
			_, err := e2epod.WaitForPodsWithLabelRunningReady(ctx, c, metav1.NamespaceSystem, label, originDNSReplicasCount, DNSdefaultTimeout)
			framework.ExpectNoError(err)
		}()
		ginkgo.By("Wait for kube-dns scaled to expected number")
		getExpectReplicasLinear := getExpectReplicasFuncLinear(ctx, c, &DNSParams1)
		err = waitForDNSReplicasSatisfied(ctx, c, getExpectReplicasLinear, DNSdefaultTimeout)
		framework.ExpectNoError(err)

		originalSizes := make(map[string]int)
		for _, mig := range strings.Split(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") {
			size, err := framework.GroupSize(mig)
			framework.ExpectNoError(err)
			ginkgo.By(fmt.Sprintf("Initial size of %s: %d", mig, size))
			originalSizes[mig] = size
		}

		ginkgo.By("Manually increase cluster size")
		increasedSizes := make(map[string]int)
		for key, val := range originalSizes {
			increasedSizes[key] = val + 1
		}
		setMigSizes(increasedSizes)
		err = WaitForClusterSizeFunc(ctx, c,
			func(size int) bool { return size == numNodes+len(originalSizes) }, scaleUpTimeout)
		framework.ExpectNoError(err)

		ginkgo.By("Wait for kube-dns scaled to expected number")
		getExpectReplicasLinear = getExpectReplicasFuncLinear(ctx, c, &DNSParams1)
		err = waitForDNSReplicasSatisfied(ctx, c, getExpectReplicasLinear, DNSdefaultTimeout)
		framework.ExpectNoError(err)

		ginkgo.By("Replace the dns autoscaling parameters with another testing parameters")
		err = updateDNSScalingConfigMap(ctx, c, packDNSScalingConfigMap(configMapNames[provider], packLinearParams(&DNSParams3)))
		framework.ExpectNoError(err)

		ginkgo.By("Wait for kube-dns scaled to expected number")
		getExpectReplicasLinear = getExpectReplicasFuncLinear(ctx, c, &DNSParams3)
		err = waitForDNSReplicasSatisfied(ctx, c, getExpectReplicasLinear, DNSdefaultTimeout)
		framework.ExpectNoError(err)

		ginkgo.By("Restoring cluster size")
		setMigSizes(originalSizes)
		err = e2enode.WaitForReadyNodes(ctx, c, numNodes, scaleDownTimeout)
		framework.ExpectNoError(err)

		ginkgo.By("Wait for kube-dns scaled to expected number")
		err = waitForDNSReplicasSatisfied(ctx, c, getExpectReplicasLinear, DNSdefaultTimeout)
		framework.ExpectNoError(err)
	})

	ginkgo.It("kube-dns-autoscaler should scale kube-dns pods in both nonfaulty and faulty scenarios", func(ctx context.Context) {

		configMapNames = map[string]string{
			"kube-dns": "kube-dns-autoscaler",
			"coredns":  "coredns-autoscaler",
		}
		provider, err := detectDNSProvider(ctx, c)
		framework.ExpectNoError(err)

		ginkgo.By("Replace the dns autoscaling parameters with testing parameters")
		cm := packDNSScalingConfigMap(configMapNames[provider], packLinearParams(&DNSParams1))
		framework.Logf("Updating the following cm: %v", cm)
		err = updateDNSScalingConfigMap(ctx, c, cm)
		framework.ExpectNoError(err)
		defer func() {
			ginkgo.By("Restoring initial dns autoscaling parameters")
			err = updateDNSScalingConfigMap(ctx, c, packDNSScalingConfigMap(configMapNames[provider], previousParams))
			framework.ExpectNoError(err)
		}()
		ginkgo.By("Wait for kube-dns scaled to expected number")
		getExpectReplicasLinear := getExpectReplicasFuncLinear(ctx, c, &DNSParams1)
		err = waitForDNSReplicasSatisfied(ctx, c, getExpectReplicasLinear, DNSdefaultTimeout)
		framework.ExpectNoError(err)

		ginkgo.By("--- Scenario: should scale kube-dns based on changed parameters ---")
		ginkgo.By("Replace the dns autoscaling parameters with another testing parameters")
		err = updateDNSScalingConfigMap(ctx, c, packDNSScalingConfigMap(configMapNames[provider], packLinearParams(&DNSParams3)))
		framework.ExpectNoError(err)
		ginkgo.By("Wait for kube-dns scaled to expected number")
		getExpectReplicasLinear = getExpectReplicasFuncLinear(ctx, c, &DNSParams3)
		err = waitForDNSReplicasSatisfied(ctx, c, getExpectReplicasLinear, DNSdefaultTimeout)
		framework.ExpectNoError(err)

		ginkgo.By("--- Scenario: should re-create scaling parameters with default value when parameters got deleted ---")
		ginkgo.By("Delete the ConfigMap for autoscaler")
		err = deleteDNSScalingConfigMap(ctx, c, configMapNames[provider])
		framework.ExpectNoError(err)

		ginkgo.By("Wait for the ConfigMap got re-created")
		_, err = waitForDNSConfigMapCreated(ctx, c, DNSdefaultTimeout, configMapNames[provider])
		framework.ExpectNoError(err)

		ginkgo.By("Replace the dns autoscaling parameters with another testing parameters")
		err = updateDNSScalingConfigMap(ctx, c, packDNSScalingConfigMap(configMapNames[provider], packLinearParams(&DNSParams2)))
		framework.ExpectNoError(err)
		ginkgo.By("Wait for kube-dns/coredns scaled to expected number")
		getExpectReplicasLinear = getExpectReplicasFuncLinear(ctx, c, &DNSParams2)
		err = waitForDNSReplicasSatisfied(ctx, c, getExpectReplicasLinear, DNSdefaultTimeout)
		framework.ExpectNoError(err)

		ginkgo.By("--- Scenario: should recover after autoscaler pod got deleted ---")
		ginkgo.By("Delete the autoscaler pod for kube-dns/coredns")
		err = deleteDNSAutoscalerPod(ctx, c)
		framework.ExpectNoError(err)

		ginkgo.By("Replace the dns autoscaling parameters with another testing parameters")
		err = updateDNSScalingConfigMap(ctx, c, packDNSScalingConfigMap(configMapNames[provider], packLinearParams(&DNSParams1)))
		framework.ExpectNoError(err)
		ginkgo.By("Wait for kube-dns/coredns scaled to expected number")
		getExpectReplicasLinear = getExpectReplicasFuncLinear(ctx, c, &DNSParams1)
		err = waitForDNSReplicasSatisfied(ctx, c, getExpectReplicasLinear, DNSdefaultTimeout)
		framework.ExpectNoError(err)
	})
})

// DNSParamsLinear is a struct for number of DNS pods.
type DNSParamsLinear struct {
	nodesPerReplica float64
	coresPerReplica float64
	min             int
	max             int
}

type getExpectReplicasFunc func(c clientset.Interface) int

func getExpectReplicasFuncLinear(ctx context.Context, c clientset.Interface, params *DNSParamsLinear) getExpectReplicasFunc {
	return func(c clientset.Interface) int {
		var replicasFromNodes float64
		var replicasFromCores float64
		nodes, err := e2enode.GetReadyNodesIncludingTainted(ctx, c)
		framework.ExpectNoError(err)
		if params.nodesPerReplica > 0 {
			replicasFromNodes = math.Ceil(float64(len(nodes.Items)) / params.nodesPerReplica)
		}
		if params.coresPerReplica > 0 {
			replicasFromCores = math.Ceil(float64(getSchedulableCores(nodes.Items)) / params.coresPerReplica)
		}
		return int(math.Max(1.0, math.Max(replicasFromNodes, replicasFromCores)))
	}
}
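getExpectReplicasFuncLinear above mirrors the "linear" mode of the DNS horizontal autoscaler: take the larger of ceil(nodes/nodesPerReplica) and ceil(cores/coresPerReplica), with a floor of one replica. A worked example with hypothetical cluster numbers (100 schedulable nodes and 400 schedulable cores are assumptions, as is reusing the 3/3 pair from DNSParams3):

package main

import (
	"fmt"
	"math"
)

func main() {
	const (
		nodes           = 100.0 // hypothetical schedulable nodes
		cores           = 400.0 // hypothetical schedulable cores
		nodesPerReplica = 3.0
		coresPerReplica = 3.0
	)
	replicasFromNodes := math.Ceil(nodes / nodesPerReplica) // ceil(100/3) = 34
	replicasFromCores := math.Ceil(cores / coresPerReplica) // ceil(400/3) = 134
	// The autoscaler takes the larger of the two, with a floor of one replica.
	replicas := int(math.Max(1.0, math.Max(replicasFromNodes, replicasFromCores)))
	fmt.Println("expected DNS replicas:", replicas) // 134
}
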
 | 
			
		||||
func getSchedulableCores(nodes []v1.Node) int64 {
	var sc resource.Quantity
	for _, node := range nodes {
		if !node.Spec.Unschedulable {
			sc.Add(node.Status.Allocatable[v1.ResourceCPU])
		}
	}
	return sc.Value()
}

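// detectDNSProvider reports which DNS autoscaler is configured by checking
// which autoscaler ConfigMap exists in kube-system.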
func detectDNSProvider(ctx context.Context, c clientset.Interface) (string, error) {
	cm, err := c.CoreV1().ConfigMaps(metav1.NamespaceSystem).Get(ctx, "coredns-autoscaler", metav1.GetOptions{})
	if cm != nil && err == nil {
		return "coredns", nil
	}

	cm, err = c.CoreV1().ConfigMaps(metav1.NamespaceSystem).Get(ctx, "kube-dns-autoscaler", metav1.GetOptions{})
	if cm != nil && err == nil {
		return "kube-dns", nil
	}

	return "", fmt.Errorf("the cluster doesn't have kube-dns or coredns autoscaling configured")
}

func fetchDNSScalingConfigMap(ctx context.Context, c clientset.Interface, configMapName string) (*v1.ConfigMap, error) {
	cm, err := c.CoreV1().ConfigMaps(metav1.NamespaceSystem).Get(ctx, configMapName, metav1.GetOptions{})
	if err != nil {
		return nil, err
	}
	return cm, nil
}

func deleteDNSScalingConfigMap(ctx context.Context, c clientset.Interface, configMapName string) error {
	if err := c.CoreV1().ConfigMaps(metav1.NamespaceSystem).Delete(ctx, configMapName, metav1.DeleteOptions{}); err != nil {
		return err
	}
	framework.Logf("DNS autoscaling ConfigMap deleted.")
	return nil
}

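// packLinearParams encodes the parameters as the JSON value of the "linear"
// key, the format consumed by the DNS horizontal autoscaler, for example:
//   linear: {"nodesPerReplica": 1,"coresPerReplica": 2,"min": 1,"max": 5}
// (the values above are only illustrative).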
func packLinearParams(params *DNSParamsLinear) map[string]string {
	paramsMap := make(map[string]string)
	paramsMap["linear"] = fmt.Sprintf("{\"nodesPerReplica\": %v,\"coresPerReplica\": %v,\"min\": %v,\"max\": %v}",
		params.nodesPerReplica,
		params.coresPerReplica,
		params.min,
		params.max)
	return paramsMap
}

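// packDNSScalingConfigMap wraps the parameter map in a ConfigMap object
// targeting the kube-system namespace.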
func packDNSScalingConfigMap(configMapName string, params map[string]string) *v1.ConfigMap {
	configMap := v1.ConfigMap{}
	configMap.ObjectMeta.Name = configMapName
	configMap.ObjectMeta.Namespace = metav1.NamespaceSystem
	configMap.Data = params
	return &configMap
}

func updateDNSScalingConfigMap(ctx context.Context, c clientset.Interface, configMap *v1.ConfigMap) error {
	_, err := c.CoreV1().ConfigMaps(metav1.NamespaceSystem).Update(ctx, configMap, metav1.UpdateOptions{})
	if err != nil {
		return err
	}
	framework.Logf("DNS autoscaling ConfigMap updated.")
	return nil
}

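// getDNSReplicas returns the desired replica count of the DNS Deployment in
// kube-system; it expects exactly one Deployment to match the addon label.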
func getDNSReplicas(ctx context.Context, c clientset.Interface) (int, error) {
	label := labels.SelectorFromSet(labels.Set(map[string]string{ClusterAddonLabelKey: DNSLabelName}))
	listOpts := metav1.ListOptions{LabelSelector: label.String()}
	deployments, err := c.AppsV1().Deployments(metav1.NamespaceSystem).List(ctx, listOpts)
	if err != nil {
		return 0, err
	}
	if len(deployments.Items) != 1 {
		return 0, fmt.Errorf("expected 1 DNS deployment, got %v", len(deployments.Items))
	}

	deployment := deployments.Items[0]
	return int(*(deployment.Spec.Replicas)), nil
}

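// deleteDNSAutoscalerPod deletes the single kube-dns-autoscaler or
// coredns-autoscaler pod in kube-system.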
func deleteDNSAutoscalerPod(ctx context.Context, c clientset.Interface) error {
	selector, _ := labels.Parse(fmt.Sprintf("%s in (kube-dns-autoscaler, coredns-autoscaler)", ClusterAddonLabelKey))
	listOpts := metav1.ListOptions{LabelSelector: selector.String()}
	pods, err := c.CoreV1().Pods(metav1.NamespaceSystem).List(ctx, listOpts)
	if err != nil {
		return err
	}
	if len(pods.Items) != 1 {
		return fmt.Errorf("expected 1 autoscaler pod, got %v", len(pods.Items))
	}

	podName := pods.Items[0].Name
	if err := c.CoreV1().Pods(metav1.NamespaceSystem).Delete(ctx, podName, metav1.DeleteOptions{}); err != nil {
		return err
	}
	framework.Logf("DNS autoscaling pod %v deleted.", podName)
	return nil
}

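// waitForDNSReplicasSatisfied polls until the DNS Deployment's replica count
// matches the value returned by getExpected, or the timeout expires.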
func waitForDNSReplicasSatisfied(ctx context.Context, c clientset.Interface, getExpected getExpectReplicasFunc, timeout time.Duration) (err error) {
	var current int
	var expected int
	framework.Logf("Waiting up to %v for kube-dns to reach expected replicas", timeout)
	condition := func(ctx context.Context) (bool, error) {
		current, err = getDNSReplicas(ctx, c)
		if err != nil {
			return false, err
		}
		expected = getExpected(c)
		if current != expected {
			framework.Logf("Replicas not as expected: got %v, expected %v", current, expected)
			return false, nil
		}
		return true, nil
	}

	if err = wait.PollUntilContextTimeout(ctx, 2*time.Second, timeout, false, condition); err != nil {
		return fmt.Errorf("error waiting for DNS replicas to reach %v (currently %v): %w", expected, current, err)
	}
	framework.Logf("kube-dns reached the expected number of replicas: %v", expected)
	return nil
}

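// waitForDNSConfigMapCreated polls until the DNS autoscaling ConfigMap exists
// again, or the timeout expires.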
func waitForDNSConfigMapCreated(ctx context.Context, c clientset.Interface, timeout time.Duration, configMapName string) (configMap *v1.ConfigMap, err error) {
	framework.Logf("Waiting up to %v for DNS autoscaling ConfigMap to be re-created", timeout)
	condition := func(ctx context.Context) (bool, error) {
		configMap, err = fetchDNSScalingConfigMap(ctx, c, configMapName)
		if err != nil {
			return false, nil
		}
		return true, nil
	}

	if err = wait.PollUntilContextTimeout(ctx, time.Second, timeout, false, condition); err != nil {
		return nil, fmt.Errorf("error waiting for DNS autoscaling ConfigMap to be re-created: %w", err)
	}
	return configMap, nil
}
@@ -20,9 +20,4 @@ const (
	// NVIDIAGPUResourceName is the extended name of the GPU resource since v1.8
	// this uses the device plugin mechanism
	NVIDIAGPUResourceName = "nvidia.com/gpu"

	// GPUDevicePluginDSYAML is the official Google Device Plugin Daemonset NVIDIA GPU manifest for GKE
	// TODO: Parametrize it by making it a feature in TestFramework.
	// so we can override the daemonset in other setups (non COS).
	GPUDevicePluginDSYAML = "https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml"
)
 
@@ -32,7 +32,6 @@ import (
	internalapi "k8s.io/cri-api/pkg/apis"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
	commontest "k8s.io/kubernetes/test/e2e/common"
	e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
	e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles"
@@ -83,11 +82,6 @@ func updateImageAllowList(ctx context.Context) {
	} else {
		e2epod.ImagePrePullList.Insert(sriovDevicePluginImage)
	}
	if gpuDevicePluginImage, err := getGPUDevicePluginImage(ctx); err != nil {
		klog.Errorln(err)
	} else {
		e2epod.ImagePrePullList.Insert(gpuDevicePluginImage)
	}
	if samplePluginImage, err := getContainerImageFromE2ETestDaemonset(SampleDevicePluginDSYAML); err != nil {
		klog.Errorln(err)
	} else {
@@ -217,21 +211,6 @@ func PrePullAllImages() error {
	return utilerrors.NewAggregate(pullErrs)
}

// getGPUDevicePluginImage returns the image of GPU device plugin.
func getGPUDevicePluginImage(ctx context.Context) (string, error) {
	ds, err := e2emanifest.DaemonSetFromURL(ctx, e2egpu.GPUDevicePluginDSYAML)
	if err != nil {
		return "", fmt.Errorf("failed to parse the device plugin image: %w", err)
	}
	if ds == nil {
		return "", fmt.Errorf("failed to parse the device plugin image: the extracted DaemonSet is nil")
	}
	if len(ds.Spec.Template.Spec.Containers) < 1 {
		return "", fmt.Errorf("failed to parse the device plugin image: cannot extract the container from YAML")
	}
	return ds.Spec.Template.Spec.Containers[0].Image, nil
}

func getContainerImageFromE2ETestDaemonset(dsYamlPath string) (string, error) {
	data, err := e2etestfiles.Read(dsYamlPath)
	if err != nil {
 
@@ -1,27 +0,0 @@
#cloud-config

runcmd:
  - modprobe configs
  # Install GPU drivers - https://cloud.google.com/container-optimized-os/docs/how-to/run-gpus
  - cos-extensions install gpu
  - mount --bind /var/lib/nvidia /var/lib/nvidia
  - mount -o remount,exec /var/lib/nvidia /var/lib/nvidia
  # Run nvidia-smi to verify installation
  - /var/lib/nvidia/bin/nvidia-smi
  # Remove build containers. They're very large.
  - docker rm -f $(docker ps -aq)
  # Standard installation proceeds
  - mount /tmp /tmp -o remount,exec,suid
  - usermod -a -G docker jenkins
  - mkdir -p /var/lib/kubelet
  - mkdir -p /home/kubernetes/containerized_mounter/rootfs
  - mount --bind /home/kubernetes/containerized_mounter/ /home/kubernetes/containerized_mounter/
  - mount -o remount, exec /home/kubernetes/containerized_mounter/
  - wget https://storage.googleapis.com/kubernetes-release/gci-mounter/mounter.tar -O /tmp/mounter.tar
  - tar xvf /tmp/mounter.tar -C /home/kubernetes/containerized_mounter/rootfs
  - mkdir -p /home/kubernetes/containerized_mounter/rootfs/var/lib/kubelet
  - mount --rbind /var/lib/kubelet /home/kubernetes/containerized_mounter/rootfs/var/lib/kubelet
  - mount --make-rshared /home/kubernetes/containerized_mounter/rootfs/var/lib/kubelet
  - mount --bind /proc /home/kubernetes/containerized_mounter/rootfs/proc
  - mount --bind /dev /home/kubernetes/containerized_mounter/rootfs/dev
  - rm /tmp/mounter.tar