	feature: support Memory QoS for cgroups v2
		| @@ -522,6 +522,11 @@ func run(ctx context.Context, s *options.KubeletServer, kubeDeps *kubelet.Depend | ||||
| 		return err | ||||
| 	} | ||||
|  | ||||
| 	// Warn if MemoryQoS enabled with cgroups v1 | ||||
| 	if utilfeature.DefaultFeatureGate.Enabled(features.MemoryQoS) && | ||||
| 		!isCgroup2UnifiedMode() { | ||||
| 		klog.InfoS("Warning: MemoryQoS feature only works with cgroups v2 on Linux, but enabled with cgroups v1") | ||||
| 	} | ||||
| 	// Obtain Kubelet Lock File | ||||
| 	if s.ExitOnLockContention && s.LockFilePath == "" { | ||||
| 		return errors.New("cannot exit on lock file contention: no lock file specified") | ||||
|   | ||||
| @@ -19,6 +19,8 @@ package app | ||||
| import ( | ||||
| 	"k8s.io/klog/v2" | ||||
| 	"k8s.io/utils/inotify" | ||||
|  | ||||
| 	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" | ||||
| ) | ||||
|  | ||||
| func watchForLockfileContention(path string, done chan struct{}) error { | ||||
| @@ -44,3 +46,7 @@ func watchForLockfileContention(path string, done chan struct{}) error { | ||||
| 	}() | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
| func isCgroup2UnifiedMode() bool { | ||||
| 	return libcontainercgroups.IsCgroup2UnifiedMode() | ||||
| } | ||||
|   | ||||
| @@ -23,3 +23,7 @@ import "errors" | ||||
| func watchForLockfileContention(path string, done chan struct{}) error { | ||||
| 	return errors.New("kubelet unsupported in this build") | ||||
| } | ||||
|  | ||||
| func isCgroup2UnifiedMode() bool { | ||||
| 	return false | ||||
| } | ||||
|   | ||||
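For context on the new isCgroup2UnifiedMode helper above: on Linux it delegates to runc's libcontainer cgroups package, which reports whether /sys/fs/cgroup is mounted as the cgroup v2 unified hierarchy. Below is a minimal standalone sketch of that kind of check, assuming the usual statfs-against-CGROUP2_SUPER_MAGIC approach; it is illustrative only and not code from this diff.

    package main

    import (
        "fmt"

        "golang.org/x/sys/unix"
    )

    // cgroup2SuperMagic is the filesystem magic number for cgroup2fs
    // (CGROUP2_SUPER_MAGIC in the kernel headers).
    const cgroup2SuperMagic = 0x63677270

    // isCgroup2UnifiedMode reports whether /sys/fs/cgroup is a cgroup v2 mount.
    func isCgroup2UnifiedMode() (bool, error) {
        var st unix.Statfs_t
        if err := unix.Statfs("/sys/fs/cgroup", &st); err != nil {
            return false, err
        }
        return st.Type == cgroup2SuperMagic, nil
    }

    func main() {
        ok, err := isCgroup2UnifiedMode()
        fmt.Println("cgroup v2 unified mode:", ok, err)
    }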
| @@ -773,6 +773,13 @@ const ( | ||||
| 	// The user namespace has to be created before running kubelet. | ||||
| 	// All the node components such as CRI need to be running in the same user namespace. | ||||
| 	KubeletInUserNamespace featuregate.Feature = "KubeletInUserNamespace" | ||||
|  | ||||
| 	// owner: @xiaoxubeii | ||||
| 	// kep: http://kep.k8s.io/2570 | ||||
| 	// alpha: v1.22 | ||||
| 	// | ||||
| 	// Enables kubelet to support memory QoS with cgroups v2. | ||||
| 	MemoryQoS featuregate.Feature = "MemoryQoS" | ||||
| ) | ||||
|  | ||||
| func init() { | ||||
| @@ -888,6 +895,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS | ||||
| 	CSRDuration:                                    {Default: true, PreRelease: featuregate.Beta}, | ||||
| 	DelegateFSGroupToCSIDriver:                     {Default: false, PreRelease: featuregate.Alpha}, | ||||
| 	KubeletInUserNamespace:                         {Default: false, PreRelease: featuregate.Alpha}, | ||||
| 	MemoryQoS:                                      {Default: false, PreRelease: featuregate.Alpha}, | ||||
|  | ||||
| 	// inherited features from generic apiserver, relisted here to get a conflict if it is changed | ||||
| 	// unintentionally on either side: | ||||
|   | ||||
| @@ -17,6 +17,7 @@ limitations under the License. | ||||
| package fuzzer | ||||
|  | ||||
| import ( | ||||
| 	"math/rand" | ||||
| 	"time" | ||||
|  | ||||
| 	"github.com/google/gofuzz" | ||||
| @@ -29,6 +30,7 @@ import ( | ||||
| 	kubeletconfigv1beta1 "k8s.io/kubernetes/pkg/kubelet/apis/config/v1beta1" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/qos" | ||||
| 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types" | ||||
| 	utilpointer "k8s.io/utils/pointer" | ||||
| ) | ||||
|  | ||||
| // Funcs returns the fuzzer functions for the kubeletconfig apis. | ||||
| @@ -106,6 +108,7 @@ func Funcs(codecs runtimeserializer.CodecFactory) []interface{} { | ||||
| 				obj.Logging.Format = "text" | ||||
| 			} | ||||
| 			obj.EnableSystemLogHandler = true | ||||
| 			obj.MemoryThrottlingFactor = utilpointer.Float64Ptr(rand.Float64()) | ||||
| 		}, | ||||
| 	} | ||||
| } | ||||
|   | ||||
| @@ -249,5 +249,6 @@ var ( | ||||
| 		"VolumePluginDir", | ||||
| 		"ShutdownGracePeriod.Duration", | ||||
| 		"ShutdownGracePeriodCriticalPods.Duration", | ||||
| 		"MemoryThrottlingFactor", | ||||
| 	) | ||||
| ) | ||||
|   | ||||
| @@ -59,6 +59,7 @@ maxOpenFiles: 1000000 | ||||
| maxPods: 110 | ||||
| memoryManagerPolicy: None | ||||
| memorySwap: {} | ||||
| memoryThrottlingFactor: 0.8 | ||||
| nodeLeaseDurationSeconds: 40 | ||||
| nodeStatusMaxImages: 50 | ||||
| nodeStatusReportFrequency: 5m0s | ||||
|   | ||||
| @@ -59,6 +59,7 @@ maxOpenFiles: 1000000 | ||||
| maxPods: 110 | ||||
| memoryManagerPolicy: None | ||||
| memorySwap: {} | ||||
| memoryThrottlingFactor: 0.8 | ||||
| nodeLeaseDurationSeconds: 40 | ||||
| nodeStatusMaxImages: 50 | ||||
| nodeStatusReportFrequency: 5m0s | ||||
|   | ||||
| @@ -413,6 +413,15 @@ type KubeletConfiguration struct { | ||||
| 	EnableDebugFlagsHandler bool | ||||
| 	// SeccompDefault enables the use of `RuntimeDefault` as the default seccomp profile for all workloads. | ||||
| 	SeccompDefault bool | ||||
| 	// MemoryThrottlingFactor specifies the factor multiplied by the memory limit or node allocatable memory | ||||
| 	// when setting the cgroupv2 memory.high value to enforce MemoryQoS. | ||||
| 	// Decreasing this factor sets a lower high limit for container cgroups and puts heavier reclaim pressure, | ||||
| 	// while increasing it puts less reclaim pressure. | ||||
| 	// See http://kep.k8s.io/2570 for more details. | ||||
| 	// Default: 0.8 | ||||
| 	// +featureGate=MemoryQoS | ||||
| 	// +optional | ||||
| 	MemoryThrottlingFactor *float64 | ||||
| } | ||||
|  | ||||
| // KubeletAuthorizationMode denotes the authorization mode for the kubelet | ||||
|   | ||||
| @@ -36,6 +36,9 @@ const ( | ||||
| 	DefaultIPTablesMasqueradeBit = 14 | ||||
| 	DefaultIPTablesDropBit       = 15 | ||||
| 	DefaultVolumePluginDir       = "/usr/libexec/kubernetes/kubelet-plugins/volume/exec/" | ||||
|  | ||||
| 	// See https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2570-memory-qos | ||||
| 	DefaultMemoryThrottlingFactor = 0.8 | ||||
| ) | ||||
|  | ||||
| var ( | ||||
| @@ -255,4 +258,7 @@ func SetDefaults_KubeletConfiguration(obj *kubeletconfigv1beta1.KubeletConfigura | ||||
| 	if obj.SeccompDefault == nil { | ||||
| 		obj.SeccompDefault = utilpointer.BoolPtr(false) | ||||
| 	} | ||||
| 	if obj.MemoryThrottlingFactor == nil { | ||||
| 		obj.MemoryThrottlingFactor = utilpointer.Float64Ptr(DefaultMemoryThrottlingFactor) | ||||
| 	} | ||||
| } | ||||
|   | ||||
| @@ -387,6 +387,7 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in | ||||
| 	if err := v1.Convert_Pointer_bool_To_bool(&in.SeccompDefault, &out.SeccompDefault, s); err != nil { | ||||
| 		return err | ||||
| 	} | ||||
| 	out.MemoryThrottlingFactor = (*float64)(unsafe.Pointer(in.MemoryThrottlingFactor)) | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
| @@ -554,6 +555,7 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in | ||||
| 	if err := v1.Convert_bool_To_Pointer_bool(&in.SeccompDefault, &out.SeccompDefault, s); err != nil { | ||||
| 		return err | ||||
| 	} | ||||
| 	out.MemoryThrottlingFactor = (*float64)(unsafe.Pointer(in.MemoryThrottlingFactor)) | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -212,5 +212,13 @@ func ValidateKubeletConfiguration(kc *kubeletconfig.KubeletConfiguration) error | ||||
| 	if errs := logs.ValidateLoggingConfiguration(&kc.Logging, field.NewPath("logging")); len(errs) > 0 { | ||||
| 		allErrors = append(allErrors, errs.ToAggregate().Errors()...) | ||||
| 	} | ||||
|  | ||||
| 	if localFeatureGate.Enabled(features.MemoryQoS) && kc.MemoryThrottlingFactor == nil { | ||||
| 		allErrors = append(allErrors, fmt.Errorf("invalid configuration: memoryThrottlingFactor is required when MemoryQoS feature flag is enabled")) | ||||
| 	} | ||||
| 	if kc.MemoryThrottlingFactor != nil && (*kc.MemoryThrottlingFactor <= 0 || *kc.MemoryThrottlingFactor > 1.0) { | ||||
| 		allErrors = append(allErrors, fmt.Errorf("invalid configuration: memoryThrottlingFactor %v must be greater than 0 and less than or equal to 1.0", *kc.MemoryThrottlingFactor)) | ||||
| 	} | ||||
|  | ||||
| 	return utilerrors.NewAggregate(allErrors) | ||||
| } | ||||
|   | ||||
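The two checks added above can be restated as a standalone sketch; the helper name below is hypothetical and only mirrors the rules in this hunk (a factor is required when MemoryQoS is enabled, and the factor must be in (0, 1.0]):

    package main

    import "fmt"

    // validateThrottlingFactor restates the two new rules; the function name is
    // hypothetical, the real checks live in ValidateKubeletConfiguration.
    func validateThrottlingFactor(factor *float64, memoryQoSEnabled bool) error {
        if memoryQoSEnabled && factor == nil {
            return fmt.Errorf("memoryThrottlingFactor is required when MemoryQoS feature flag is enabled")
        }
        if factor != nil && (*factor <= 0 || *factor > 1.0) {
            return fmt.Errorf("memoryThrottlingFactor %v must be greater than 0 and less than or equal to 1.0", *factor)
        }
        return nil
    }

    func main() {
        bad := 1.1
        fmt.Println(validateThrottlingFactor(&bad, true)) // rejected: 1.1 > 1.0
        fmt.Println(validateThrottlingFactor(nil, true))  // rejected: factor required
    }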
| @@ -25,6 +25,7 @@ import ( | ||||
| 	componentbaseconfig "k8s.io/component-base/config" | ||||
| 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" | ||||
| 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types" | ||||
| 	utilpointer "k8s.io/utils/pointer" | ||||
| ) | ||||
|  | ||||
| func TestValidateKubeletConfiguration(t *testing.T) { | ||||
| @@ -59,9 +60,11 @@ func TestValidateKubeletConfiguration(t *testing.T) { | ||||
| 		TopologyManagerPolicy:           kubeletconfig.SingleNumaNodeTopologyManagerPolicy, | ||||
| 		ShutdownGracePeriod:             metav1.Duration{Duration: 30 * time.Second}, | ||||
| 		ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second}, | ||||
| 		MemoryThrottlingFactor:          utilpointer.Float64Ptr(0.8), | ||||
| 		FeatureGates: map[string]bool{ | ||||
| 			"CustomCPUCFSQuotaPeriod": true, | ||||
| 			"GracefulNodeShutdown":    true, | ||||
| 			"MemoryQoS":               true, | ||||
| 		}, | ||||
| 		Logging: componentbaseconfig.LoggingConfiguration{ | ||||
| 			Format: "text", | ||||
| @@ -103,8 +106,10 @@ func TestValidateKubeletConfiguration(t *testing.T) { | ||||
| 		TopologyManagerPolicy:           kubeletconfig.NoneTopologyManagerPolicy, | ||||
| 		ShutdownGracePeriod:             metav1.Duration{Duration: 10 * time.Minute}, | ||||
| 		ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 0}, | ||||
| 		MemoryThrottlingFactor:          utilpointer.Float64Ptr(0.9), | ||||
| 		FeatureGates: map[string]bool{ | ||||
| 			"CustomCPUCFSQuotaPeriod": true, | ||||
| 			"MemoryQoS":               true, | ||||
| 		}, | ||||
| 		Logging: componentbaseconfig.LoggingConfiguration{ | ||||
| 			Format: "text", | ||||
| @@ -147,10 +152,12 @@ func TestValidateKubeletConfiguration(t *testing.T) { | ||||
| 		ShutdownGracePeriod:             metav1.Duration{Duration: 10 * time.Minute}, | ||||
| 		ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 0}, | ||||
| 		MemorySwap:                      kubeletconfig.MemorySwapConfiguration{SwapBehavior: kubetypes.UnlimitedSwap}, | ||||
| 		MemoryThrottlingFactor:          utilpointer.Float64Ptr(0.5), | ||||
| 		FeatureGates: map[string]bool{ | ||||
| 			"CustomCPUCFSQuotaPeriod": true, | ||||
| 			"GracefulNodeShutdown":    true, | ||||
| 			"NodeSwapEnabled":         true, | ||||
| 			"MemoryQoS":               true, | ||||
| 		}, | ||||
| 		Logging: componentbaseconfig.LoggingConfiguration{ | ||||
| 			Format: "text", | ||||
| @@ -230,16 +237,18 @@ func TestValidateKubeletConfiguration(t *testing.T) { | ||||
| 		ShutdownGracePeriod:             metav1.Duration{Duration: 40 * time.Second}, | ||||
| 		ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second}, | ||||
| 		MemorySwap:                      kubeletconfig.MemorySwapConfiguration{SwapBehavior: "invalid"}, | ||||
| 		MemoryThrottlingFactor:          utilpointer.Float64Ptr(1.1), | ||||
| 		FeatureGates: map[string]bool{ | ||||
| 			"CustomCPUCFSQuotaPeriod": true, | ||||
| 			"GracefulNodeShutdown":    true, | ||||
| 			"NodeSwapEnabled":         true, | ||||
| 			"MemoryQoS":               true, | ||||
| 		}, | ||||
| 		Logging: componentbaseconfig.LoggingConfiguration{ | ||||
| 			Format: "text", | ||||
| 		}, | ||||
| 	} | ||||
| 	const numErrsErrorCase2 = 4 | ||||
| 	const numErrsErrorCase2 = 5 | ||||
| 	if allErrors := ValidateKubeletConfiguration(errorCase2); len(allErrors.(utilerrors.Aggregate).Errors()) != numErrsErrorCase2 { | ||||
| 		t.Errorf("expect %d errors, got %v", numErrsErrorCase2, len(allErrors.(utilerrors.Aggregate).Errors())) | ||||
| 	} | ||||
|   | ||||
							
								
								
									
pkg/kubelet/apis/config/zz_generated.deepcopy.go (generated; 5 lines added)
							| @@ -282,6 +282,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) { | ||||
| 			(*in)[i].DeepCopyInto(&(*out)[i]) | ||||
| 		} | ||||
| 	} | ||||
| 	if in.MemoryThrottlingFactor != nil { | ||||
| 		in, out := &in.MemoryThrottlingFactor, &out.MemoryThrottlingFactor | ||||
| 		*out = new(float64) | ||||
| 		**out = **in | ||||
| 	} | ||||
| 	return | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -53,6 +53,10 @@ const ( | ||||
| 	libcontainerSystemd libcontainerCgroupManagerType = "systemd" | ||||
| 	// systemdSuffix is the cgroup name suffix for systemd | ||||
| 	systemdSuffix string = ".slice" | ||||
| 	// MemoryMin is memory.min for cgroup v2 | ||||
| 	MemoryMin string = "memory.min" | ||||
| 	// MemoryHigh is memory.high for cgroup v2 | ||||
| 	MemoryHigh string = "memory.high" | ||||
| ) | ||||
|  | ||||
| var RootCgroupName = CgroupName([]string{}) | ||||
| @@ -434,6 +438,15 @@ func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcont | ||||
| 			Limit:    uint64(0), | ||||
| 		}) | ||||
| 	} | ||||
| 	// Ideally unified is used for all the resources when running on cgroup v2. | ||||
| 	// It doesn't make a difference for the memory.max limit, but for e.g. the cpu controller | ||||
| 	// you can specify the correct setting without relying on the conversions performed by the OCI runtime. | ||||
| 	if resourceConfig.Unified != nil && libcontainercgroups.IsCgroup2UnifiedMode() { | ||||
| 		resources.Unified = make(map[string]string) | ||||
| 		for k, v := range resourceConfig.Unified { | ||||
| 			resources.Unified[k] = v | ||||
| 		} | ||||
| 	} | ||||
| 	return resources | ||||
| } | ||||
|  | ||||
|   | ||||
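To make the new Unified plumbing concrete, here is a hedged in-package sketch (hypothetical values) of a ResourceConfig as later hunks in this diff produce it. The loop above copies the map verbatim into libcontainer's resources, so on cgroup v2 each key is written as a file of the same name under the cgroup directory (e.g. memory.min, memory.high):

    // Sketch only; ResourceConfig is the type extended near the end of this
    // diff with the Unified field, MemoryMin/MemoryHigh are the new constants.
    memory := int64(256 * 1024 * 1024) // 256Mi, becomes memory.max on cgroup v2
    cfg := &ResourceConfig{
        Memory: &memory,
        Unified: map[string]string{
            MemoryMin:  "134217728", // 128Mi request
            MemoryHigh: "214748364", // 256Mi * 0.8 throttling factor
        },
    }
    _ = cfg // passed to cgroupManager.Create/Update, which calls toResources above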
| @@ -112,6 +112,9 @@ type ContainerManager interface { | ||||
| 	// GetAllocateResourcesPodAdmitHandler returns an instance of a PodAdmitHandler responsible for allocating pod resources. | ||||
| 	GetAllocateResourcesPodAdmitHandler() lifecycle.PodAdmitHandler | ||||
|  | ||||
| 	// GetNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement. | ||||
| 	GetNodeAllocatableAbsolute() v1.ResourceList | ||||
|  | ||||
| 	// Implements the podresources Provider API for CPUs, Memory and Devices | ||||
| 	podresources.CPUsProvider | ||||
| 	podresources.DevicesProvider | ||||
|   | ||||
| @@ -491,7 +491,7 @@ func (cm *containerManagerImpl) setupNode(activePods ActivePodsFunc) error { | ||||
| 		if err := cm.createNodeAllocatableCgroups(); err != nil { | ||||
| 			return err | ||||
| 		} | ||||
| 		err = cm.qosContainerManager.Start(cm.getNodeAllocatableAbsolute, activePods) | ||||
| 		err = cm.qosContainerManager.Start(cm.GetNodeAllocatableAbsolute, activePods) | ||||
| 		if err != nil { | ||||
| 			return fmt.Errorf("failed to initialize top level QOS containers: %v", err) | ||||
| 		} | ||||
|   | ||||
| @@ -147,6 +147,10 @@ func (cm *containerManagerStub) GetAllocatableMemory() []*podresourcesapi.Contai | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
| func (cm *containerManagerStub) GetNodeAllocatableAbsolute() v1.ResourceList { | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
| func NewStubContainerManager() ContainerManager { | ||||
| 	return &containerManagerStub{shouldResetExtendedResourceCapacity: false} | ||||
| } | ||||
|   | ||||
| @@ -251,3 +251,7 @@ func (cm *containerManagerImpl) GetMemory(_, _ string) []*podresourcesapi.Contai | ||||
| func (cm *containerManagerImpl) GetAllocatableMemory() []*podresourcesapi.ContainerMemory { | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
| func (cm *containerManagerImpl) GetNodeAllocatableAbsolute() v1.ResourceList { | ||||
| 	return nil | ||||
| } | ||||
|   | ||||
| @@ -227,3 +227,9 @@ func (cm *FakeContainerManager) GetAllocatableMemory() []*podresourcesapi.Contai | ||||
| 	defer cm.Unlock() | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
| func (cm *FakeContainerManager) GetNodeAllocatableAbsolute() v1.ResourceList { | ||||
| 	cm.Lock() | ||||
| 	defer cm.Unlock() | ||||
| 	return nil | ||||
| } | ||||
|   | ||||
| @@ -113,7 +113,7 @@ func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 { | ||||
| } | ||||
|  | ||||
| // ResourceConfigForPod takes the input pod and outputs the cgroup resource config. | ||||
| func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64) *ResourceConfig { | ||||
| func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, enforceMemoryQoS bool) *ResourceConfig { | ||||
| 	// sum requests and limits. | ||||
| 	reqs, limits := resource.PodRequestsAndLimits(pod) | ||||
|  | ||||
| @@ -185,6 +185,19 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64) | ||||
| 		result.CpuShares = &shares | ||||
| 	} | ||||
| 	result.HugePageLimit = hugePageLimits | ||||
|  | ||||
| 	if enforceMemoryQoS { | ||||
| 		memoryMin := int64(0) | ||||
| 		if request, found := reqs[v1.ResourceMemory]; found { | ||||
| 			memoryMin = request.Value() | ||||
| 		} | ||||
| 		if memoryMin > 0 { | ||||
| 			result.Unified = map[string]string{ | ||||
| 				MemoryMin: strconv.FormatInt(memoryMin, 10), | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
|  | ||||
| 	return result | ||||
| } | ||||
|  | ||||
|   | ||||
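As a worked example of the pod-level memory.min set above (numbers hypothetical): when MemoryQoS is enforced, the pod's aggregate memory request becomes memory.min on the pod cgroup; the burstable cases in the tests that follow show the same pattern.

    package main

    import (
        "fmt"
        "strconv"
    )

    func main() {
        // Two regular containers requesting 100Mi and 200Mi of memory.
        requests := []int64{100 << 20, 200 << 20}
        var memoryMin int64
        for _, r := range requests {
            memoryMin += r
        }
        unified := map[string]string{"memory.min": strconv.FormatInt(memoryMin, 10)}
        fmt.Println(unified) // map[memory.min:314572800]
    }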
| @@ -252,7 +252,7 @@ func TestResourceConfigForPod(t *testing.T) { | ||||
|  | ||||
| 	for testName, testCase := range testCases { | ||||
|  | ||||
| 		actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod) | ||||
| 		actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod, false) | ||||
|  | ||||
| 		if !reflect.DeepEqual(actual.CpuPeriod, testCase.expected.CpuPeriod) { | ||||
| 			t.Errorf("unexpected result, test: %v, cpu period not as expected", testName) | ||||
| @@ -472,7 +472,7 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { | ||||
|  | ||||
| 	for testName, testCase := range testCases { | ||||
|  | ||||
| 		actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod) | ||||
| 		actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod, false) | ||||
|  | ||||
| 		if !reflect.DeepEqual(actual.CpuPeriod, testCase.expected.CpuPeriod) { | ||||
| 			t.Errorf("unexpected result, test: %v, cpu period not as expected", testName) | ||||
| @@ -622,3 +622,211 @@ func TestHugePageLimits(t *testing.T) { | ||||
|  | ||||
| 	} | ||||
| } | ||||
|  | ||||
| func TestResourceConfigForPodWithEnforceMemoryQoS(t *testing.T) { | ||||
| 	defaultQuotaPeriod := uint64(100 * time.Millisecond / time.Microsecond) | ||||
| 	tunedQuotaPeriod := uint64(5 * time.Millisecond / time.Microsecond) | ||||
|  | ||||
| 	minShares := uint64(MinShares) | ||||
| 	burstableShares := MilliCPUToShares(100) | ||||
| 	memoryQuantity := resource.MustParse("200Mi") | ||||
| 	burstableMemory := memoryQuantity.Value() | ||||
| 	burstablePartialShares := MilliCPUToShares(200) | ||||
| 	burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod)) | ||||
| 	guaranteedShares := MilliCPUToShares(100) | ||||
| 	guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod)) | ||||
| 	guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod)) | ||||
| 	memoryQuantity = resource.MustParse("100Mi") | ||||
| 	cpuNoLimit := int64(-1) | ||||
| 	guaranteedMemory := memoryQuantity.Value() | ||||
| 	testCases := map[string]struct { | ||||
| 		pod              *v1.Pod | ||||
| 		expected         *ResourceConfig | ||||
| 		enforceCPULimits bool | ||||
| 		quotaPeriod      uint64 | ||||
| 	}{ | ||||
| 		"besteffort": { | ||||
| 			pod: &v1.Pod{ | ||||
| 				Spec: v1.PodSpec{ | ||||
| 					Containers: []v1.Container{ | ||||
| 						{ | ||||
| 							Resources: getResourceRequirements(getResourceList("", ""), getResourceList("", "")), | ||||
| 						}, | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 			enforceCPULimits: true, | ||||
| 			quotaPeriod:      defaultQuotaPeriod, | ||||
| 			expected:         &ResourceConfig{CpuShares: &minShares}, | ||||
| 		}, | ||||
| 		"burstable-no-limits": { | ||||
| 			pod: &v1.Pod{ | ||||
| 				Spec: v1.PodSpec{ | ||||
| 					Containers: []v1.Container{ | ||||
| 						{ | ||||
| 							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("", "")), | ||||
| 						}, | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 			enforceCPULimits: true, | ||||
| 			quotaPeriod:      defaultQuotaPeriod, | ||||
| 			expected:         &ResourceConfig{CpuShares: &burstableShares, Unified: map[string]string{"memory.min": "104857600"}}, | ||||
| 		}, | ||||
| 		"burstable-with-limits": { | ||||
| 			pod: &v1.Pod{ | ||||
| 				Spec: v1.PodSpec{ | ||||
| 					Containers: []v1.Container{ | ||||
| 						{ | ||||
| 							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")), | ||||
| 						}, | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 			enforceCPULimits: true, | ||||
| 			quotaPeriod:      defaultQuotaPeriod, | ||||
| 			expected:         &ResourceConfig{CpuShares: &burstableShares, CpuQuota: &burstableQuota, CpuPeriod: &defaultQuotaPeriod, Memory: &burstableMemory, Unified: map[string]string{"memory.min": "104857600"}}, | ||||
| 		}, | ||||
| 		"burstable-with-limits-no-cpu-enforcement": { | ||||
| 			pod: &v1.Pod{ | ||||
| 				Spec: v1.PodSpec{ | ||||
| 					Containers: []v1.Container{ | ||||
| 						{ | ||||
| 							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")), | ||||
| 						}, | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 			enforceCPULimits: false, | ||||
| 			quotaPeriod:      defaultQuotaPeriod, | ||||
| 			expected:         &ResourceConfig{CpuShares: &burstableShares, CpuQuota: &cpuNoLimit, CpuPeriod: &defaultQuotaPeriod, Memory: &burstableMemory, Unified: map[string]string{"memory.min": "104857600"}}, | ||||
| 		}, | ||||
| 		"burstable-partial-limits": { | ||||
| 			pod: &v1.Pod{ | ||||
| 				Spec: v1.PodSpec{ | ||||
| 					Containers: []v1.Container{ | ||||
| 						{ | ||||
| 							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")), | ||||
| 						}, | ||||
| 						{ | ||||
| 							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("", "")), | ||||
| 						}, | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 			enforceCPULimits: true, | ||||
| 			quotaPeriod:      defaultQuotaPeriod, | ||||
| 			expected:         &ResourceConfig{CpuShares: &burstablePartialShares, Unified: map[string]string{"memory.min": "209715200"}}, | ||||
| 		}, | ||||
| 		"burstable-with-limits-with-tuned-quota": { | ||||
| 			pod: &v1.Pod{ | ||||
| 				Spec: v1.PodSpec{ | ||||
| 					Containers: []v1.Container{ | ||||
| 						{ | ||||
| 							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")), | ||||
| 						}, | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 			enforceCPULimits: true, | ||||
| 			quotaPeriod:      tunedQuotaPeriod, | ||||
| 			expected:         &ResourceConfig{CpuShares: &burstableShares, CpuQuota: &burstableQuota, CpuPeriod: &tunedQuotaPeriod, Memory: &burstableMemory, Unified: map[string]string{"memory.min": "104857600"}}, | ||||
| 		}, | ||||
| 		"burstable-with-limits-no-cpu-enforcement-with-tuned-quota": { | ||||
| 			pod: &v1.Pod{ | ||||
| 				Spec: v1.PodSpec{ | ||||
| 					Containers: []v1.Container{ | ||||
| 						{ | ||||
| 							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")), | ||||
| 						}, | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 			enforceCPULimits: false, | ||||
| 			quotaPeriod:      tunedQuotaPeriod, | ||||
| 			expected:         &ResourceConfig{CpuShares: &burstableShares, CpuQuota: &cpuNoLimit, CpuPeriod: &tunedQuotaPeriod, Memory: &burstableMemory, Unified: map[string]string{"memory.min": "104857600"}}, | ||||
| 		}, | ||||
| 		"burstable-partial-limits-with-tuned-quota": { | ||||
| 			pod: &v1.Pod{ | ||||
| 				Spec: v1.PodSpec{ | ||||
| 					Containers: []v1.Container{ | ||||
| 						{ | ||||
| 							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")), | ||||
| 						}, | ||||
| 						{ | ||||
| 							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("", "")), | ||||
| 						}, | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 			enforceCPULimits: true, | ||||
| 			quotaPeriod:      tunedQuotaPeriod, | ||||
| 			expected:         &ResourceConfig{CpuShares: &burstablePartialShares, Unified: map[string]string{"memory.min": "209715200"}}, | ||||
| 		}, | ||||
| 		"guaranteed": { | ||||
| 			pod: &v1.Pod{ | ||||
| 				Spec: v1.PodSpec{ | ||||
| 					Containers: []v1.Container{ | ||||
| 						{ | ||||
| 							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")), | ||||
| 						}, | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 			enforceCPULimits: true, | ||||
| 			quotaPeriod:      defaultQuotaPeriod, | ||||
| 			expected:         &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory, Unified: map[string]string{"memory.min": "104857600"}}, | ||||
| 		}, | ||||
| 		"guaranteed-no-cpu-enforcement": { | ||||
| 			pod: &v1.Pod{ | ||||
| 				Spec: v1.PodSpec{ | ||||
| 					Containers: []v1.Container{ | ||||
| 						{ | ||||
| 							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")), | ||||
| 						}, | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 			enforceCPULimits: false, | ||||
| 			quotaPeriod:      defaultQuotaPeriod, | ||||
| 			expected:         &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory, Unified: map[string]string{"memory.min": "104857600"}}, | ||||
| 		}, | ||||
| 		"guaranteed-with-tuned-quota": { | ||||
| 			pod: &v1.Pod{ | ||||
| 				Spec: v1.PodSpec{ | ||||
| 					Containers: []v1.Container{ | ||||
| 						{ | ||||
| 							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")), | ||||
| 						}, | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 			enforceCPULimits: true, | ||||
| 			quotaPeriod:      tunedQuotaPeriod, | ||||
| 			expected:         &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedTunedQuota, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory, Unified: map[string]string{"memory.min": "104857600"}}, | ||||
| 		}, | ||||
| 		"guaranteed-no-cpu-enforcement-with-tuned-quota": { | ||||
| 			pod: &v1.Pod{ | ||||
| 				Spec: v1.PodSpec{ | ||||
| 					Containers: []v1.Container{ | ||||
| 						{ | ||||
| 							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")), | ||||
| 						}, | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 			enforceCPULimits: false, | ||||
| 			quotaPeriod:      tunedQuotaPeriod, | ||||
| 			expected:         &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory, Unified: map[string]string{"memory.min": "104857600"}}, | ||||
| 		}, | ||||
| 	} | ||||
|  | ||||
| 	for testName, testCase := range testCases { | ||||
|  | ||||
| 		actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod, true) | ||||
|  | ||||
| 		if !reflect.DeepEqual(actual.Unified, testCase.expected.Unified) { | ||||
| 			t.Errorf("unexpected result, test: %v, unified not as expected", testName) | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
|   | ||||
| @@ -42,7 +42,7 @@ func MilliCPUToShares(milliCPU int64) int64 { | ||||
| } | ||||
|  | ||||
| // ResourceConfigForPod takes the input pod and outputs the cgroup resource config. | ||||
| func ResourceConfigForPod(pod *v1.Pod, enforceCPULimit bool, cpuPeriod uint64) *ResourceConfig { | ||||
| func ResourceConfigForPod(pod *v1.Pod, enforceCPULimit bool, cpuPeriod uint64, enforceMemoryQoS bool) *ResourceConfig { | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -20,13 +20,16 @@ package cm | ||||
|  | ||||
| import ( | ||||
| 	"fmt" | ||||
| 	"strconv" | ||||
| 	"strings" | ||||
| 	"time" | ||||
|  | ||||
| 	"k8s.io/api/core/v1" | ||||
| 	v1 "k8s.io/api/core/v1" | ||||
| 	"k8s.io/apimachinery/pkg/api/resource" | ||||
| 	"k8s.io/apimachinery/pkg/types" | ||||
| 	utilfeature "k8s.io/apiserver/pkg/util/feature" | ||||
| 	"k8s.io/klog/v2" | ||||
| 	kubefeatures "k8s.io/kubernetes/pkg/features" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/events" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/stats/pidlimit" | ||||
| 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types" | ||||
| @@ -131,9 +134,22 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error { | ||||
|  | ||||
| // enforceExistingCgroup updates the limits `rl` on existing cgroup `cName` using `cgroupManager` interface. | ||||
| func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.ResourceList) error { | ||||
| 	rp := getCgroupConfig(rl) | ||||
|  | ||||
| 	// Enforce MemoryQoS for cgroups of kube-reserved/system-reserved. For more information, | ||||
| 	// see https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2570-memory-qos | ||||
| 	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) { | ||||
| 		if rp.Memory != nil { | ||||
| 			if rp.Unified == nil { | ||||
| 				rp.Unified = make(map[string]string) | ||||
| 			} | ||||
| 			rp.Unified[MemoryMin] = strconv.FormatInt(*rp.Memory, 10) | ||||
| 		} | ||||
| 	} | ||||
|  | ||||
| 	cgroupConfig := &CgroupConfig{ | ||||
| 		Name:               cName, | ||||
| 		ResourceParameters: getCgroupConfig(rl), | ||||
| 		ResourceParameters: rp, | ||||
| 	} | ||||
| 	if cgroupConfig.ResourceParameters == nil { | ||||
| 		return fmt.Errorf("%q cgroup is not configured properly", cgroupConfig.Name) | ||||
| @@ -174,10 +190,10 @@ func getCgroupConfig(rl v1.ResourceList) *ResourceConfig { | ||||
| 	return &rc | ||||
| } | ||||
|  | ||||
| // getNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement. | ||||
| // GetNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement. | ||||
| // Note that not all resources that are available on the node are included in the returned list of resources. | ||||
| // Returns a ResourceList. | ||||
| func (cm *containerManagerImpl) getNodeAllocatableAbsolute() v1.ResourceList { | ||||
| func (cm *containerManagerImpl) GetNodeAllocatableAbsolute() v1.ResourceList { | ||||
| 	return cm.getNodeAllocatableAbsoluteImpl(cm.capacity) | ||||
| } | ||||
|  | ||||
|   | ||||
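A hedged in-package sketch of what the enforceExistingCgroup change above does for a reserved cgroup; the kube-reserved reservation below is hypothetical:

    // Hypothetical kube-reserved reservation of 1Gi memory.
    rl := v1.ResourceList{v1.ResourceMemory: resource.MustParse("1Gi")}
    rp := getCgroupConfig(rl) // existing helper in this file
    if rp.Memory != nil {
        if rp.Unified == nil {
            rp.Unified = make(map[string]string)
        }
        rp.Unified[MemoryMin] = strconv.FormatInt(*rp.Memory, 10)
    }
    // rp.Unified["memory.min"] == "1073741824", applied to the kube-reserved cgroup.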
| @@ -250,7 +250,7 @@ func TestNodeAllocatableForEnforcement(t *testing.T) { | ||||
| 			NodeConfig: nc, | ||||
| 			capacity:   tc.capacity, | ||||
| 		} | ||||
| 		for k, v := range cm.getNodeAllocatableAbsolute() { | ||||
| 		for k, v := range cm.GetNodeAllocatableAbsolute() { | ||||
| 			expected, exists := tc.expected[k] | ||||
| 			assert.True(t, exists) | ||||
| 			assert.Equal(t, expected.MilliValue(), v.MilliValue(), "test case %d failed for resource %q", idx+1, k) | ||||
|   | ||||
| @@ -23,11 +23,14 @@ import ( | ||||
| 	"path" | ||||
| 	"strings" | ||||
|  | ||||
| 	v1 "k8s.io/api/core/v1" | ||||
| 	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" | ||||
| 	"k8s.io/api/core/v1" | ||||
| 	"k8s.io/apimachinery/pkg/types" | ||||
| 	utilerrors "k8s.io/apimachinery/pkg/util/errors" | ||||
| 	utilfeature "k8s.io/apiserver/pkg/util/feature" | ||||
| 	"k8s.io/klog/v2" | ||||
| 	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" | ||||
| 	kubefeatures "k8s.io/kubernetes/pkg/features" | ||||
| ) | ||||
|  | ||||
| const ( | ||||
| @@ -71,14 +74,22 @@ func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error { | ||||
| 	// check if container already exist | ||||
| 	alreadyExists := m.Exists(pod) | ||||
| 	if !alreadyExists { | ||||
| 		enforceMemoryQoS := false | ||||
| 		if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) && | ||||
| 			libcontainercgroups.IsCgroup2UnifiedMode() { | ||||
| 			enforceMemoryQoS = true | ||||
| 		} | ||||
| 		// Create the pod container | ||||
| 		containerConfig := &CgroupConfig{ | ||||
| 			Name:               podContainerName, | ||||
| 			ResourceParameters: ResourceConfigForPod(pod, m.enforceCPULimits, m.cpuCFSQuotaPeriod), | ||||
| 			ResourceParameters: ResourceConfigForPod(pod, m.enforceCPULimits, m.cpuCFSQuotaPeriod, enforceMemoryQoS), | ||||
| 		} | ||||
| 		if m.podPidsLimit > 0 { | ||||
| 			containerConfig.ResourceParameters.PidsLimit = &m.podPidsLimit | ||||
| 		} | ||||
| 		if enforceMemoryQoS { | ||||
| 			klog.V(4).InfoS("MemoryQoS config for pod", "pod", klog.KObj(pod), "unified", containerConfig.ResourceParameters.Unified) | ||||
| 		} | ||||
| 		if err := m.cgroupManager.Create(containerConfig); err != nil { | ||||
| 			return fmt.Errorf("failed to create container for %v : %v", podContainerName, err) | ||||
| 		} | ||||
|   | ||||
| @@ -18,6 +18,7 @@ package cm | ||||
|  | ||||
| import ( | ||||
| 	"fmt" | ||||
| 	"strconv" | ||||
| 	"strings" | ||||
| 	"sync" | ||||
| 	"time" | ||||
| @@ -27,6 +28,7 @@ import ( | ||||
| 	"k8s.io/apimachinery/pkg/util/wait" | ||||
|  | ||||
| 	units "github.com/docker/go-units" | ||||
| 	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" | ||||
| 	cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs" | ||||
| 	v1 "k8s.io/api/core/v1" | ||||
| 	utilfeature "k8s.io/apiserver/pkg/util/feature" | ||||
| @@ -191,10 +193,9 @@ func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass] | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
| // setMemoryReserve sums the memory limits of all pods in a QOS class, | ||||
| // calculates QOS class memory limits, and set those limits in the | ||||
| // CgroupConfig for each QOS class. | ||||
| func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) { | ||||
| // getQoSMemoryRequests sums and returns the memory request of all pods for | ||||
| // guaranteed and burstable qos classes. | ||||
| func (m *qosContainerManagerImpl) getQoSMemoryRequests() map[v1.PodQOSClass]int64 { | ||||
| 	qosMemoryRequests := map[v1.PodQOSClass]int64{ | ||||
| 		v1.PodQOSGuaranteed: 0, | ||||
| 		v1.PodQOSBurstable:  0, | ||||
| @@ -216,6 +217,15 @@ func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*C | ||||
| 		qosMemoryRequests[qosClass] += podMemoryRequest | ||||
| 	} | ||||
|  | ||||
| 	return qosMemoryRequests | ||||
| } | ||||
|  | ||||
| // setMemoryReserve sums the memory limits of all pods in a QOS class, | ||||
| // calculates QOS class memory limits, and set those limits in the | ||||
| // CgroupConfig for each QOS class. | ||||
| func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) { | ||||
| 	qosMemoryRequests := m.getQoSMemoryRequests() | ||||
|  | ||||
| 	resources := m.getNodeAllocatable() | ||||
| 	allocatableResource, ok := resources[v1.ResourceMemory] | ||||
| 	if !ok { | ||||
| @@ -265,11 +275,43 @@ func (m *qosContainerManagerImpl) retrySetMemoryReserve(configs map[v1.PodQOSCla | ||||
| 	} | ||||
| } | ||||
|  | ||||
| // setMemoryQoS sums the memory requests of guaranteed and burstable pods and sets | ||||
| // the sums as memory.min in the Unified field of the corresponding CgroupConfigs. | ||||
| func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*CgroupConfig) { | ||||
| 	qosMemoryRequests := m.getQoSMemoryRequests() | ||||
|  | ||||
| 	// Calculate the memory.min: | ||||
| 	// for burstable(/kubepods/burstable): sum of all burstable pods | ||||
| 	// for guaranteed(/kubepods): sum of all guaranteed and burstable pods | ||||
| 	burstableMin := qosMemoryRequests[v1.PodQOSBurstable] | ||||
| 	guaranteedMin := qosMemoryRequests[v1.PodQOSGuaranteed] + burstableMin | ||||
|  | ||||
| 	if burstableMin > 0 { | ||||
| 		if configs[v1.PodQOSBurstable].ResourceParameters.Unified == nil { | ||||
| 			configs[v1.PodQOSBurstable].ResourceParameters.Unified = make(map[string]string) | ||||
| 		} | ||||
| 		configs[v1.PodQOSBurstable].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(burstableMin, 10) | ||||
| 		klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSBurstable, "memory.min", burstableMin) | ||||
| 	} | ||||
|  | ||||
| 	if guaranteedMin > 0 { | ||||
| 		if configs[v1.PodQOSGuaranteed].ResourceParameters.Unified == nil { | ||||
| 			configs[v1.PodQOSGuaranteed].ResourceParameters.Unified = make(map[string]string) | ||||
| 		} | ||||
| 		configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(guaranteedMin, 10) | ||||
| 		klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSGuaranteed, "memory.min", guaranteedMin) | ||||
| 	} | ||||
| } | ||||
|  | ||||
| func (m *qosContainerManagerImpl) UpdateCgroups() error { | ||||
| 	m.Lock() | ||||
| 	defer m.Unlock() | ||||
|  | ||||
| 	qosConfigs := map[v1.PodQOSClass]*CgroupConfig{ | ||||
| 		v1.PodQOSGuaranteed: { | ||||
| 			Name:               m.qosContainersInfo.Guaranteed, | ||||
| 			ResourceParameters: &ResourceConfig{}, | ||||
| 		}, | ||||
| 		v1.PodQOSBurstable: { | ||||
| 			Name:               m.qosContainersInfo.Burstable, | ||||
| 			ResourceParameters: &ResourceConfig{}, | ||||
| @@ -290,6 +332,12 @@ func (m *qosContainerManagerImpl) UpdateCgroups() error { | ||||
| 		return err | ||||
| 	} | ||||
|  | ||||
| 	// update the qos level cgroups v2 settings of memory qos if feature enabled | ||||
| 	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) && | ||||
| 		libcontainercgroups.IsCgroup2UnifiedMode() { | ||||
| 		m.setMemoryQoS(qosConfigs) | ||||
| 	} | ||||
|  | ||||
| 	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.QOSReserved) { | ||||
| 		for resource, percentReserve := range m.qosReserved { | ||||
| 			switch resource { | ||||
|   | ||||
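The hierarchy set up by setMemoryQoS above is easiest to see with numbers; this sketch mirrors the pods in the new qos_container_manager_linux_test.go that follows (one guaranteed pod requesting 128Mi, two burstable pods requesting 128Mi and 256Mi):

    package main

    import "fmt"

    func main() {
        guaranteedRequests := int64(128 << 20)             // guaranteed pod requests
        burstableRequests := int64(128<<20 + 256<<20)      // burstable pod requests

        burstableMin := burstableRequests                  // memory.min on /kubepods/burstable
        guaranteedMin := guaranteedRequests + burstableMin // memory.min on /kubepods

        fmt.Println(burstableMin, guaranteedMin) // 402653184 536870912
    }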
							
								
								
									
pkg/kubelet/cm/qos_container_manager_linux_test.go (new file, 154 lines)
							| @@ -0,0 +1,154 @@ | ||||
| // +build linux | ||||
|  | ||||
| /* | ||||
| Copyright 2021 The Kubernetes Authors. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| you may not use this file except in compliance with the License. | ||||
| You may obtain a copy of the License at | ||||
|  | ||||
|     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
| Unless required by applicable law or agreed to in writing, software | ||||
| distributed under the License is distributed on an "AS IS" BASIS, | ||||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| See the License for the specific language governing permissions and | ||||
| limitations under the License. | ||||
| */ | ||||
|  | ||||
| package cm | ||||
|  | ||||
| import ( | ||||
| 	"fmt" | ||||
| 	"strconv" | ||||
| 	"testing" | ||||
|  | ||||
| 	"github.com/stretchr/testify/assert" | ||||
| 	v1 "k8s.io/api/core/v1" | ||||
| 	"k8s.io/apimachinery/pkg/api/resource" | ||||
| 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||
| ) | ||||
|  | ||||
| func activeTestPods() []*v1.Pod { | ||||
| 	return []*v1.Pod{ | ||||
| 		{ | ||||
| 			ObjectMeta: metav1.ObjectMeta{ | ||||
| 				UID:       "12345678", | ||||
| 				Name:      "guaranteed-pod", | ||||
| 				Namespace: "test", | ||||
| 			}, | ||||
| 			Spec: v1.PodSpec{ | ||||
| 				Containers: []v1.Container{ | ||||
| 					{ | ||||
| 						Name:  "foo", | ||||
| 						Image: "busybox", | ||||
| 						Resources: v1.ResourceRequirements{ | ||||
| 							Requests: v1.ResourceList{ | ||||
| 								v1.ResourceMemory: resource.MustParse("128Mi"), | ||||
| 								v1.ResourceCPU:    resource.MustParse("1"), | ||||
| 							}, | ||||
| 							Limits: v1.ResourceList{ | ||||
| 								v1.ResourceMemory: resource.MustParse("128Mi"), | ||||
| 								v1.ResourceCPU:    resource.MustParse("1"), | ||||
| 							}, | ||||
| 						}, | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 		}, | ||||
| 		{ | ||||
| 			ObjectMeta: metav1.ObjectMeta{ | ||||
| 				UID:       "87654321", | ||||
| 				Name:      "burstable-pod-1", | ||||
| 				Namespace: "test", | ||||
| 			}, | ||||
| 			Spec: v1.PodSpec{ | ||||
| 				Containers: []v1.Container{ | ||||
| 					{ | ||||
| 						Name:  "foo", | ||||
| 						Image: "busybox", | ||||
| 						Resources: v1.ResourceRequirements{ | ||||
| 							Requests: v1.ResourceList{ | ||||
| 								v1.ResourceMemory: resource.MustParse("128Mi"), | ||||
| 								v1.ResourceCPU:    resource.MustParse("1"), | ||||
| 							}, | ||||
| 							Limits: v1.ResourceList{ | ||||
| 								v1.ResourceMemory: resource.MustParse("256Mi"), | ||||
| 								v1.ResourceCPU:    resource.MustParse("2"), | ||||
| 							}, | ||||
| 						}, | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 		}, | ||||
| 		{ | ||||
| 			ObjectMeta: metav1.ObjectMeta{ | ||||
| 				UID:       "01234567", | ||||
| 				Name:      "burstable-pod-2", | ||||
| 				Namespace: "test", | ||||
| 			}, | ||||
| 			Spec: v1.PodSpec{ | ||||
| 				Containers: []v1.Container{ | ||||
| 					{ | ||||
| 						Name:  "foo", | ||||
| 						Image: "busybox", | ||||
| 						Resources: v1.ResourceRequirements{ | ||||
| 							Requests: v1.ResourceList{ | ||||
| 								v1.ResourceMemory: resource.MustParse("256Mi"), | ||||
| 								v1.ResourceCPU:    resource.MustParse("2"), | ||||
| 							}, | ||||
| 						}, | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 		}, | ||||
| 	} | ||||
| } | ||||
|  | ||||
| func createTestQOSContainerManager() (*qosContainerManagerImpl, error) { | ||||
| 	subsystems, err := GetCgroupSubsystems() | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("failed to get mounted cgroup subsystems: %v", err) | ||||
| 	} | ||||
|  | ||||
| 	cgroupRoot := ParseCgroupfsToCgroupName("/") | ||||
| 	cgroupRoot = NewCgroupName(cgroupRoot, defaultNodeAllocatableCgroupName) | ||||
|  | ||||
| 	qosContainerManager := &qosContainerManagerImpl{ | ||||
| 		subsystems:    subsystems, | ||||
| 		cgroupManager: NewCgroupManager(subsystems, "cgroupfs"), | ||||
| 		cgroupRoot:    cgroupRoot, | ||||
| 		qosReserved:   nil, | ||||
| 	} | ||||
|  | ||||
| 	qosContainerManager.activePods = activeTestPods | ||||
|  | ||||
| 	return qosContainerManager, nil | ||||
| } | ||||
|  | ||||
| func TestQoSContainerCgroup(t *testing.T) { | ||||
| 	m, err := createTestQOSContainerManager() | ||||
| 	assert.Nil(t, err) | ||||
|  | ||||
| 	qosConfigs := map[v1.PodQOSClass]*CgroupConfig{ | ||||
| 		v1.PodQOSGuaranteed: { | ||||
| 			Name:               m.qosContainersInfo.Guaranteed, | ||||
| 			ResourceParameters: &ResourceConfig{}, | ||||
| 		}, | ||||
| 		v1.PodQOSBurstable: { | ||||
| 			Name:               m.qosContainersInfo.Burstable, | ||||
| 			ResourceParameters: &ResourceConfig{}, | ||||
| 		}, | ||||
| 		v1.PodQOSBestEffort: { | ||||
| 			Name:               m.qosContainersInfo.BestEffort, | ||||
| 			ResourceParameters: &ResourceConfig{}, | ||||
| 		}, | ||||
| 	} | ||||
|  | ||||
| 	m.setMemoryQoS(qosConfigs) | ||||
|  | ||||
| 	burstableMin := resource.MustParse("384Mi") | ||||
| 	guaranteedMin := resource.MustParse("128Mi") | ||||
| 	assert.Equal(t, qosConfigs[v1.PodQOSGuaranteed].ResourceParameters.Unified["memory.min"], strconv.FormatInt(burstableMin.Value()+guaranteedMin.Value(), 10)) | ||||
| 	assert.Equal(t, qosConfigs[v1.PodQOSBurstable].ResourceParameters.Unified["memory.min"], strconv.FormatInt(burstableMin.Value(), 10)) | ||||
| } | ||||
| @@ -35,6 +35,8 @@ type ResourceConfig struct { | ||||
| 	HugePageLimit map[int64]int64 | ||||
| 	// Maximum number of pids | ||||
| 	PidsLimit *int64 | ||||
| 	// Unified for cgroup v2 | ||||
| 	Unified map[string]string | ||||
| } | ||||
|  | ||||
| // CgroupName is the abstract name of a cgroup prior to any driver specific conversion. | ||||
|   | ||||
| @@ -666,6 +666,8 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, | ||||
| 		klet.runtimeClassManager, | ||||
| 		seccompDefault, | ||||
| 		kubeCfg.MemorySwap.SwapBehavior, | ||||
| 		kubeDeps.ContainerManager.GetNodeAllocatableAbsolute, | ||||
| 		*kubeCfg.MemoryThrottlingFactor, | ||||
| 	) | ||||
| 	if err != nil { | ||||
| 		return nil, err | ||||
|   | ||||
| @@ -21,6 +21,8 @@ import ( | ||||
| 	"time" | ||||
|  | ||||
| 	cadvisorapi "github.com/google/cadvisor/info/v1" | ||||
| 	v1 "k8s.io/api/core/v1" | ||||
| 	"k8s.io/apimachinery/pkg/api/resource" | ||||
| 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||
| 	"k8s.io/apimachinery/pkg/types" | ||||
| 	"k8s.io/client-go/tools/record" | ||||
| @@ -38,6 +40,9 @@ import ( | ||||
|  | ||||
| const ( | ||||
| 	fakeSeccompProfileRoot = "/fakeSeccompProfileRoot" | ||||
|  | ||||
| 	fakeNodeAllocatableMemory = "32Gi" | ||||
| 	fakeNodeAllocatableCPU    = "16" | ||||
| ) | ||||
|  | ||||
| type fakeHTTP struct { | ||||
| @@ -94,6 +99,7 @@ func newFakeKubeRuntimeManager(runtimeService internalapi.RuntimeService, imageS | ||||
| 		internalLifecycle:      cm.NewFakeInternalContainerLifecycle(), | ||||
| 		logReduction:           logreduction.NewLogReduction(identicalErrorDelay), | ||||
| 		logManager:             logManager, | ||||
| 		memoryThrottlingFactor: 0.8, | ||||
| 	} | ||||
|  | ||||
| 	typedVersion, err := runtimeService.Version(kubeRuntimeAPIVersion) | ||||
| @@ -118,5 +124,12 @@ func newFakeKubeRuntimeManager(runtimeService internalapi.RuntimeService, imageS | ||||
| 		kubeRuntimeManager, | ||||
| 		kubeRuntimeManager) | ||||
|  | ||||
| 	kubeRuntimeManager.getNodeAllocatable = func() v1.ResourceList { | ||||
| 		return v1.ResourceList{ | ||||
| 			v1.ResourceMemory: resource.MustParse(fakeNodeAllocatableMemory), | ||||
| 			v1.ResourceCPU:    resource.MustParse(fakeNodeAllocatableCPU), | ||||
| 		} | ||||
| 	} | ||||
|  | ||||
| 	return kubeRuntimeManager, nil | ||||
| } | ||||
|   | ||||
| @@ -19,8 +19,10 @@ limitations under the License. | ||||
| package kuberuntime | ||||
|  | ||||
| import ( | ||||
| 	"strconv" | ||||
| 	"time" | ||||
|  | ||||
| 	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" | ||||
| 	cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs" | ||||
| 	v1 "k8s.io/api/core/v1" | ||||
| 	utilfeature "k8s.io/apiserver/pkg/util/feature" | ||||
| @@ -28,6 +30,7 @@ import ( | ||||
| 	"k8s.io/klog/v2" | ||||
| 	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" | ||||
| 	kubefeatures "k8s.io/kubernetes/pkg/features" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/cm" | ||||
| 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" | ||||
| 	"k8s.io/kubernetes/pkg/kubelet/qos" | ||||
| 	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" | ||||
| @@ -35,12 +38,18 @@ import ( | ||||
|  | ||||
| // applyPlatformSpecificContainerConfig applies platform specific configurations to runtimeapi.ContainerConfig. | ||||
| func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config *runtimeapi.ContainerConfig, container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID) error { | ||||
| 	config.Linux = m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget) | ||||
| 	enforceMemoryQoS := false | ||||
| 	// Set memory.min and memory.high if MemoryQoS enabled with cgroups v2 | ||||
| 	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) && | ||||
| 		libcontainercgroups.IsCgroup2UnifiedMode() { | ||||
| 		enforceMemoryQoS = true | ||||
| 	} | ||||
| 	config.Linux = m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget, enforceMemoryQoS) | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
| // generateLinuxContainerConfig generates linux container config for kubelet runtime v1. | ||||
| func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID) *runtimeapi.LinuxContainerConfig { | ||||
| func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID, enforceMemoryQoS bool) *runtimeapi.LinuxContainerConfig { | ||||
| 	lc := &runtimeapi.LinuxContainerConfig{ | ||||
| 		Resources:       &runtimeapi.LinuxContainerResources{}, | ||||
| 		SecurityContext: m.determineEffectiveSecurityContext(pod, container, uid, username), | ||||
| @@ -56,6 +65,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.C | ||||
| 	cpuRequest := container.Resources.Requests.Cpu() | ||||
| 	cpuLimit := container.Resources.Limits.Cpu() | ||||
| 	memoryLimit := container.Resources.Limits.Memory().Value() | ||||
| 	memoryRequest := container.Resources.Requests.Memory().Value() | ||||
| 	oomScoreAdj := int64(qos.GetContainerOOMScoreAdjust(pod, container, | ||||
| 		int64(m.machineInfo.MemoryCapacity))) | ||||
| 	// If request is not specified, but limit is, we want request to default to limit. | ||||
| @@ -107,6 +117,43 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.C | ||||
| 		} | ||||
| 	} | ||||
|  | ||||
| 	// Set memory.min and memory.high to enforce MemoryQoS | ||||
| 	if enforceMemoryQoS { | ||||
| 		unified := map[string]string{} | ||||
|  | ||||
| 		if memoryRequest != 0 { | ||||
| 			unified[cm.MemoryMin] = strconv.FormatInt(memoryRequest, 10) | ||||
| 		} | ||||
|  | ||||
| 		// If container sets limits.memory, we set memory.high=pod.spec.containers[i].resources.limits[memory] * memory_throttling_factor | ||||
| 		// for container level cgroup if memory.high>memory.min. | ||||
| 		// If container doesn't set limits.memory, we set memory.high=node_allocatable_memory * memory_throttling_factor | ||||
| 		// for container level cgroup. | ||||
| 		memoryHigh := int64(0) | ||||
| 		if memoryLimit != 0 { | ||||
| 			memoryHigh = int64(float64(memoryLimit) * m.memoryThrottlingFactor) | ||||
| 		} else { | ||||
| 			allocatable := m.getNodeAllocatable() | ||||
| 			allocatableMemory, ok := allocatable[v1.ResourceMemory] | ||||
| 			if ok && allocatableMemory.Value() > 0 { | ||||
| 				memoryHigh = int64(float64(allocatableMemory.Value()) * m.memoryThrottlingFactor) | ||||
| 			} | ||||
| 		} | ||||
| 		if memoryHigh > memoryRequest { | ||||
| 			unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10) | ||||
| 		} | ||||
| 		if len(unified) > 0 { | ||||
| 			if lc.Resources.Unified == nil { | ||||
| 				lc.Resources.Unified = unified | ||||
| 			} else { | ||||
| 				for k, v := range unified { | ||||
| 					lc.Resources.Unified[k] = v | ||||
| 				} | ||||
| 			} | ||||
| 			klog.V(4).InfoS("MemoryQoS config for container", "pod", klog.KObj(pod), "containerName", container.Name, "unified", unified) | ||||
| 		} | ||||
| 	} | ||||
|  | ||||
| 	return lc | ||||
| } | ||||
|  | ||||
|   | ||||
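To summarize the container-level rules introduced above with hypothetical numbers (the no-limit branch uses node allocatable memory, assumed here to be 32Gi as in the fake runtime manager): memory.min tracks the container's memory request, and memory.high is the limit (or allocatable) times the throttling factor, set only when it exceeds memory.min.

    package main

    import (
        "fmt"
        "strconv"
    )

    func main() {
        request := int64(128 << 20)    // 128Mi container memory request
        limit := int64(0)              // no memory limit set on the container
        allocatable := int64(32 << 30) // assumed node allocatable memory (32Gi)
        factor := 0.8                  // memoryThrottlingFactor

        memoryHigh := int64(float64(allocatable) * factor)
        if limit != 0 {
            memoryHigh = int64(float64(limit) * factor)
        }

        unified := map[string]string{"memory.min": strconv.FormatInt(request, 10)}
        if memoryHigh > request {
            unified["memory.high"] = strconv.FormatInt(memoryHigh, 10)
        }
        fmt.Println(unified) // map[memory.high:27487790694 memory.min:134217728]
    }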
| @@ -20,6 +20,7 @@ package kuberuntime | ||||
|  | ||||
| import ( | ||||
| 	"reflect" | ||||
| 	"strconv" | ||||
| 	"testing" | ||||
|  | ||||
| 	"github.com/google/go-cmp/cmp" | ||||
| @@ -36,7 +37,7 @@ import ( | ||||
| 	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" | ||||
| ) | ||||
|  | ||||
| func makeExpectedConfig(m *kubeGenericRuntimeManager, pod *v1.Pod, containerIndex int) *runtimeapi.ContainerConfig { | ||||
| func makeExpectedConfig(m *kubeGenericRuntimeManager, pod *v1.Pod, containerIndex int, enforceMemoryQoS bool) *runtimeapi.ContainerConfig { | ||||
| 	container := &pod.Spec.Containers[containerIndex] | ||||
| 	podIP := "" | ||||
| 	restartCount := 0 | ||||
| @@ -62,7 +63,7 @@ func makeExpectedConfig(m *kubeGenericRuntimeManager, pod *v1.Pod, containerInde | ||||
| 		Stdin:       container.Stdin, | ||||
| 		StdinOnce:   container.StdinOnce, | ||||
| 		Tty:         container.TTY, | ||||
| 		Linux:       m.generateLinuxContainerConfig(container, pod, new(int64), "", nil), | ||||
| 		Linux:       m.generateLinuxContainerConfig(container, pod, new(int64), "", nil, enforceMemoryQoS), | ||||
| 		Envs:        envs, | ||||
| 	} | ||||
| 	return expectedConfig | ||||
| @@ -97,7 +98,7 @@ func TestGenerateContainerConfig(t *testing.T) { | ||||
| 		}, | ||||
| 	} | ||||
|  | ||||
| 	expectedConfig := makeExpectedConfig(m, pod, 0) | ||||
| 	expectedConfig := makeExpectedConfig(m, pod, 0, false) | ||||
| 	containerConfig, _, err := m.generateContainerConfig(&pod.Spec.Containers[0], pod, 0, "", pod.Spec.Containers[0].Image, []string{}, nil) | ||||
| 	assert.NoError(t, err) | ||||
| 	assert.Equal(t, expectedConfig, containerConfig, "generate container config for kubelet runtime v1.") | ||||
| @@ -145,6 +146,101 @@ func TestGenerateContainerConfig(t *testing.T) { | ||||
| 	assert.Error(t, err, "RunAsNonRoot should fail for non-numeric username") | ||||
| } | ||||
|  | ||||
| func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) { | ||||
| 	_, _, m, err := createTestRuntimeManager() | ||||
| 	assert.NoError(t, err) | ||||
|  | ||||
| 	pod1 := &v1.Pod{ | ||||
| 		ObjectMeta: metav1.ObjectMeta{ | ||||
| 			UID:       "12345678", | ||||
| 			Name:      "bar", | ||||
| 			Namespace: "new", | ||||
| 		}, | ||||
| 		Spec: v1.PodSpec{ | ||||
| 			Containers: []v1.Container{ | ||||
| 				{ | ||||
| 					Name:            "foo", | ||||
| 					Image:           "busybox", | ||||
| 					ImagePullPolicy: v1.PullIfNotPresent, | ||||
| 					Command:         []string{"testCommand"}, | ||||
| 					WorkingDir:      "testWorkingDir", | ||||
| 					Resources: v1.ResourceRequirements{ | ||||
| 						Requests: v1.ResourceList{ | ||||
| 							v1.ResourceMemory: resource.MustParse("128Mi"), | ||||
| 						}, | ||||
| 						Limits: v1.ResourceList{ | ||||
| 							v1.ResourceMemory: resource.MustParse("256Mi"), | ||||
| 						}, | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 		}, | ||||
| 	} | ||||
|  | ||||
| 	pod2 := &v1.Pod{ | ||||
| 		ObjectMeta: metav1.ObjectMeta{ | ||||
| 			UID:       "12345678", | ||||
| 			Name:      "bar", | ||||
| 			Namespace: "new", | ||||
| 		}, | ||||
| 		Spec: v1.PodSpec{ | ||||
| 			Containers: []v1.Container{ | ||||
| 				{ | ||||
| 					Name:            "foo", | ||||
| 					Image:           "busybox", | ||||
| 					ImagePullPolicy: v1.PullIfNotPresent, | ||||
| 					Command:         []string{"testCommand"}, | ||||
| 					WorkingDir:      "testWorkingDir", | ||||
| 					Resources: v1.ResourceRequirements{ | ||||
| 						Requests: v1.ResourceList{ | ||||
| 							v1.ResourceMemory: resource.MustParse("128Mi"), | ||||
| 						}, | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 		}, | ||||
| 	} | ||||
| 	memoryNodeAllocatable := resource.MustParse(fakeNodeAllocatableMemory) | ||||
| 	pod2MemoryHigh := float64(memoryNodeAllocatable.Value()) * m.memoryThrottlingFactor | ||||
|  | ||||
| 	type expectedResult struct { | ||||
| 		containerConfig *runtimeapi.LinuxContainerConfig | ||||
| 		memoryLow       int64 | ||||
| 		memoryHigh      int64 | ||||
| 	} | ||||
| 	tests := []struct { | ||||
| 		name     string | ||||
| 		pod      *v1.Pod | ||||
| 		expected *expectedResult | ||||
| 	}{ | ||||
| 		{ | ||||
| 			name: "Request128MBLimit256MB", | ||||
| 			pod:  pod1, | ||||
| 			expected: &expectedResult{ | ||||
| 				m.generateLinuxContainerConfig(&pod1.Spec.Containers[0], pod1, new(int64), "", nil, true), | ||||
| 				128 * 1024 * 1024, | ||||
| 				int64(float64(256*1024*1024) * m.memoryThrottlingFactor), | ||||
| 			}, | ||||
| 		}, | ||||
| 		{ | ||||
| 			name: "Request128MBWithoutLimit", | ||||
| 			pod:  pod2, | ||||
| 			expected: &expectedResult{ | ||||
| 				m.generateLinuxContainerConfig(&pod2.Spec.Containers[0], pod2, new(int64), "", nil, true), | ||||
| 				128 * 1024 * 1024, | ||||
| 				int64(pod2MemoryHigh), | ||||
| 			}, | ||||
| 		}, | ||||
| 	} | ||||
|  | ||||
| 	for _, test := range tests { | ||||
| 		linuxConfig := m.generateLinuxContainerConfig(&test.pod.Spec.Containers[0], test.pod, new(int64), "", nil, true) | ||||
| 		assert.Equal(t, test.expected.containerConfig, linuxConfig, test.name) | ||||
| 		assert.Equal(t, linuxConfig.GetResources().GetUnified()["memory.min"], strconv.FormatInt(test.expected.memoryLow, 10), test.name) | ||||
| 		assert.Equal(t, linuxConfig.GetResources().GetUnified()["memory.high"], strconv.FormatInt(test.expected.memoryHigh, 10), test.name) | ||||
| 	} | ||||
| } | ||||
|  | ||||
| func TestGetHugepageLimitsFromResources(t *testing.T) { | ||||
| 	var baseHugepage []*runtimeapi.HugepageLimit | ||||
|  | ||||
| @@ -361,7 +457,7 @@ func TestGenerateLinuxContainerConfigNamespaces(t *testing.T) { | ||||
| 		}, | ||||
| 	} { | ||||
| 		t.Run(tc.name, func(t *testing.T) { | ||||
| 			got := m.generateLinuxContainerConfig(&tc.pod.Spec.Containers[0], tc.pod, nil, "", tc.target) | ||||
| 			got := m.generateLinuxContainerConfig(&tc.pod.Spec.Containers[0], tc.pod, nil, "", tc.target, false) | ||||
| 			if diff := cmp.Diff(tc.want, got.SecurityContext.NamespaceOptions); diff != "" { | ||||
| 				t.Errorf("%v: diff (-want +got):\n%v", t.Name(), diff) | ||||
| 			} | ||||
| @@ -452,7 +548,7 @@ func TestGenerateLinuxContainerConfigSwap(t *testing.T) { | ||||
| 	} { | ||||
| 		t.Run(tc.name, func(t *testing.T) { | ||||
| 			m.memorySwapBehavior = tc.swapSetting | ||||
| 			actual := m.generateLinuxContainerConfig(&tc.pod.Spec.Containers[0], tc.pod, nil, "", nil) | ||||
| 			actual := m.generateLinuxContainerConfig(&tc.pod.Spec.Containers[0], tc.pod, nil, "", nil, false) | ||||
| 			assert.Equal(t, tc.expected, actual.Resources.MemorySwapLimitInBytes, "memory swap config for %s", tc.name) | ||||
| 		}) | ||||
| 	} | ||||
|   | ||||
| @@ -147,6 +147,12 @@ type kubeGenericRuntimeManager struct { | ||||
|  | ||||
| 	// MemorySwapBehavior defines how swap is used | ||||
| 	memorySwapBehavior string | ||||
|  | ||||
| 	// Function to get node allocatable resources | ||||
| 	getNodeAllocatable func() v1.ResourceList | ||||
|  | ||||
| 	// Memory throttling factor for MemoryQoS | ||||
| 	memoryThrottlingFactor float64 | ||||
| } | ||||
|  | ||||
| // KubeGenericRuntime is an interface that contains interfaces for container runtime and command. | ||||
| @@ -190,6 +196,8 @@ func NewKubeGenericRuntimeManager( | ||||
| 	runtimeClassManager *runtimeclass.Manager, | ||||
| 	seccompDefault bool, | ||||
| 	memorySwapBehavior string, | ||||
| 	getNodeAllocatable func() v1.ResourceList, | ||||
| 	memoryThrottlingFactor float64, | ||||
| ) (KubeGenericRuntime, error) { | ||||
| 	kubeRuntimeManager := &kubeGenericRuntimeManager{ | ||||
| 		recorder:               recorder, | ||||
| @@ -211,6 +219,8 @@ func NewKubeGenericRuntimeManager( | ||||
| 		logReduction:           logreduction.NewLogReduction(identicalErrorDelay), | ||||
| 		seccompDefault:         seccompDefault, | ||||
| 		memorySwapBehavior:     memorySwapBehavior, | ||||
| 		getNodeAllocatable:     getNodeAllocatable, | ||||
| 		memoryThrottlingFactor: memoryThrottlingFactor, | ||||
| 	} | ||||
|  | ||||
| 	typedVersion, err := kubeRuntimeManager.getTypedVersion() | ||||
|   | ||||
| @@ -28,6 +28,7 @@ import ( | ||||
| 	"github.com/stretchr/testify/require" | ||||
|  | ||||
| 	v1 "k8s.io/api/core/v1" | ||||
| 	"k8s.io/apimachinery/pkg/api/resource" | ||||
| 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||
| 	"k8s.io/apimachinery/pkg/types" | ||||
| 	"k8s.io/apimachinery/pkg/util/sets" | ||||
| @@ -58,7 +59,10 @@ func customTestRuntimeManager(keyring *credentialprovider.BasicDockerKeyring) (* | ||||
| 	// Only an empty machineInfo is needed here, because in unit test all containers are besteffort, | ||||
| 	// data in machineInfo is not used. If burstable containers are used in unit test in the future, | ||||
| 	// we may want to set memory capacity. | ||||
| 	machineInfo := &cadvisorapi.MachineInfo{} | ||||
| 	memoryCapacityQuantity := resource.MustParse(fakeNodeAllocatableMemory) | ||||
| 	machineInfo := &cadvisorapi.MachineInfo{ | ||||
| 		MemoryCapacity: uint64(memoryCapacityQuantity.Value()), | ||||
| 	} | ||||
| 	osInterface := &containertest.FakeOS{} | ||||
| 	manager, err := newFakeKubeRuntimeManager(fakeRuntimeService, fakeImageService, machineInfo, osInterface, &containertest.FakeRuntimeHelper{}, keyring) | ||||
| 	return fakeRuntimeService, fakeImageService, manager, err | ||||
|   | ||||
| @@ -121,7 +121,7 @@ func calculateEmptyDirMemorySize(nodeAllocatableMemory *resource.Quantity, spec | ||||
| 	// determine pod resource allocation | ||||
| 	// we use the same function for pod cgroup assignment to maintain consistent behavior | ||||
| 	// NOTE: this could be nil on systems that do not support pod memory containment (i.e. windows) | ||||
| 	podResourceConfig := cm.ResourceConfigForPod(pod, false, uint64(100000)) | ||||
| 	podResourceConfig := cm.ResourceConfigForPod(pod, false, uint64(100000), false) | ||||
| 	if podResourceConfig != nil && podResourceConfig.Memory != nil { | ||||
| 		podMemoryLimit := resource.NewQuantity(*(podResourceConfig.Memory), resource.BinarySI) | ||||
| 		// ensure 0 < value < size | ||||
|   | ||||
| @@ -947,6 +947,15 @@ type KubeletConfiguration struct { | ||||
| 	// Default: false | ||||
| 	// +optional | ||||
| 	SeccompDefault *bool `json:"seccompDefault,omitempty"` | ||||
| 	// MemoryThrottlingFactor specifies the factor multiplied by the memory limit or node allocatable memory | ||||
| 	// when setting the cgroupv2 memory.high value to enforce MemoryQoS. | ||||
| 	// Decreasing this factor sets a lower high limit for container cgroups and puts heavier reclaim pressure, | ||||
| 	// while increasing it puts less reclaim pressure. | ||||
| 	// See http://kep.k8s.io/2570 for more details. | ||||
| 	// Default: 0.8 | ||||
| 	// +featureGate=MemoryQoS | ||||
| 	// +optional | ||||
| 	MemoryThrottlingFactor *float64 `json:"memoryThrottlingFactor,omitempty"` | ||||
| } | ||||
|  | ||||
| type KubeletAuthorizationMode string | ||||
|   | ||||
| @@ -327,6 +327,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) { | ||||
| 		*out = new(bool) | ||||
| 		**out = **in | ||||
| 	} | ||||
| 	if in.MemoryThrottlingFactor != nil { | ||||
| 		in, out := &in.MemoryThrottlingFactor, &out.MemoryThrottlingFactor | ||||
| 		*out = new(float64) | ||||
| 		**out = **in | ||||
| 	} | ||||
| 	return | ||||
| } | ||||
|  | ||||
|   | ||||