	feature: support Memory QoS for cgroups v2
@@ -522,6 +522,11 @@ func run(ctx context.Context, s *options.KubeletServer, kubeDeps *kubelet.Depend
 		return err
 	}
 
+	// Warn if MemoryQoS enabled with cgroups v1
+	if utilfeature.DefaultFeatureGate.Enabled(features.MemoryQoS) &&
+		!isCgroup2UnifiedMode() {
+		klog.InfoS("Warning: MemoryQoS feature only works with cgroups v2 on Linux, but enabled with cgroups v1")
+	}
 	// Obtain Kubelet Lock File
 	if s.ExitOnLockContention && s.LockFilePath == "" {
 		return errors.New("cannot exit on lock file contention: no lock file specified")

@@ -19,6 +19,8 @@ package app
 import (
 	"k8s.io/klog/v2"
 	"k8s.io/utils/inotify"
+
+	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
 )
 
 func watchForLockfileContention(path string, done chan struct{}) error {
@@ -44,3 +46,7 @@ func watchForLockfileContention(path string, done chan struct{}) error {
 	}()
 	return nil
 }
+
+func isCgroup2UnifiedMode() bool {
+	return libcontainercgroups.IsCgroup2UnifiedMode()
+}

@@ -23,3 +23,7 @@ import "errors"
 func watchForLockfileContention(path string, done chan struct{}) error {
	return errors.New("kubelet unsupported in this build")
 }
+
+func isCgroup2UnifiedMode() bool {
+	return false
+}

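Note: `isCgroup2UnifiedMode` delegates to libcontainer, which reports whether the host mounts `/sys/fs/cgroup` as a cgroup2 ("unified") filesystem. A minimal sketch of an equivalent check, for illustration only (this is not runc's exact code):

```go
package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// isCgroup2UnifiedMode reports whether the host runs in cgroup v2
// "unified" mode by checking the filesystem type of /sys/fs/cgroup.
func isCgroup2UnifiedMode() (bool, error) {
	var st unix.Statfs_t
	if err := unix.Statfs("/sys/fs/cgroup", &st); err != nil {
		return false, err
	}
	return st.Type == unix.CGROUP2_SUPER_MAGIC, nil
}

func main() {
	ok, err := isCgroup2UnifiedMode()
	fmt.Println(ok, err)
}
```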
@@ -773,6 +773,13 @@ const (
 	// The user namespace has to be created before running kubelet.
 	// All the node components such as CRI need to be running in the same user namespace.
 	KubeletInUserNamespace featuregate.Feature = "KubeletInUserNamespace"
+
+	// owner: @xiaoxubeii
+	// kep: http://kep.k8s.io/2570
+	// alpha: v1.22
+	//
+	// Enables kubelet to support memory QoS with cgroups v2.
+	MemoryQoS featuregate.Feature = "MemoryQoS"
 )
 
 func init() {
@@ -888,6 +895,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS
 	CSRDuration:                                    {Default: true, PreRelease: featuregate.Beta},
 	DelegateFSGroupToCSIDriver:                     {Default: false, PreRelease: featuregate.Alpha},
 	KubeletInUserNamespace:                         {Default: false, PreRelease: featuregate.Alpha},
+	MemoryQoS:                                      {Default: false, PreRelease: featuregate.Alpha},
 
 	// inherited features from generic apiserver, relisted here to get a conflict if it is changed
 	// unintentionally on either side:

@@ -17,6 +17,7 @@ limitations under the License.
 package fuzzer
 
 import (
+	"math/rand"
 	"time"
 
 	"github.com/google/gofuzz"
@@ -29,6 +30,7 @@ import (
 	kubeletconfigv1beta1 "k8s.io/kubernetes/pkg/kubelet/apis/config/v1beta1"
 	"k8s.io/kubernetes/pkg/kubelet/qos"
 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
+	utilpointer "k8s.io/utils/pointer"
 )
 
 // Funcs returns the fuzzer functions for the kubeletconfig apis.
@@ -106,6 +108,7 @@ func Funcs(codecs runtimeserializer.CodecFactory) []interface{} {
 				obj.Logging.Format = "text"
 			}
 			obj.EnableSystemLogHandler = true
+			obj.MemoryThrottlingFactor = utilpointer.Float64Ptr(rand.Float64())
 		},
 	}
 }

@@ -249,5 +249,6 @@ var (
 		"VolumePluginDir",
 		"ShutdownGracePeriod.Duration",
 		"ShutdownGracePeriodCriticalPods.Duration",
+		"MemoryThrottlingFactor",
 	)
 )

@@ -59,6 +59,7 @@ maxOpenFiles: 1000000
 maxPods: 110
 memoryManagerPolicy: None
 memorySwap: {}
+memoryThrottlingFactor: 0.8
 nodeLeaseDurationSeconds: 40
 nodeStatusMaxImages: 50
 nodeStatusReportFrequency: 5m0s

@@ -59,6 +59,7 @@ maxOpenFiles: 1000000
 maxPods: 110
 memoryManagerPolicy: None
 memorySwap: {}
+memoryThrottlingFactor: 0.8
 nodeLeaseDurationSeconds: 40
 nodeStatusMaxImages: 50
 nodeStatusReportFrequency: 5m0s

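The two testdata files above pin the rendered default of 0.8. To actually opt in, an operator sets both the feature gate and (optionally) the factor in the kubelet configuration; a sketch, with an illustrative file path:

```yaml
# /var/lib/kubelet/config.yaml (path illustrative)
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
featureGates:
  MemoryQoS: true
memoryThrottlingFactor: 0.8  # must be in (0, 1.0]; see the validation change below
```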
@@ -413,6 +413,15 @@ type KubeletConfiguration struct {
 	EnableDebugFlagsHandler bool
 	// SeccompDefault enables the use of `RuntimeDefault` as the default seccomp profile for all workloads.
 	SeccompDefault bool
+	// MemoryThrottlingFactor specifies the factor multiplied by the memory limit or node allocatable memory
+	// when setting the cgroupv2 memory.high value to enforce MemoryQoS.
+	// Decreasing this factor will set lower high limit for container cgroups and put heavier reclaim pressure
+	// while increasing will put less reclaim pressure.
+	// See http://kep.k8s.io/2570 for more details.
+	// Default: 0.8
+	// +featureGate=MemoryQoS
+	// +optional
+	MemoryThrottlingFactor *float64
 }
 
 // KubeletAuthorizationMode denotes the authorization mode for the kubelet

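To make the doc comment concrete: assuming the formula described in KEP-2570 (referenced above), `memory.high` is the factor times the container's memory limit, falling back to node allocatable memory when no limit is set. A sketch of that arithmetic; the helper name and rounding are illustrative assumptions, not the kubelet's actual code:

```go
package main

import "fmt"

// memoryHigh sketches the memory.high derivation described in the
// MemoryThrottlingFactor doc comment: factor * limit, or
// factor * node allocatable when the container has no memory limit.
func memoryHigh(memoryLimit, nodeAllocatable int64, factor float64) int64 {
	base := memoryLimit
	if base <= 0 {
		base = nodeAllocatable
	}
	return int64(float64(base) * factor)
}

func main() {
	const gi = int64(1) << 30
	fmt.Println(memoryHigh(gi, 32*gi, 0.8)) // 858993459, i.e. ~0.8Gi
}
```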
@@ -36,6 +36,9 @@ const (
 	DefaultIPTablesMasqueradeBit = 14
 	DefaultIPTablesDropBit       = 15
 	DefaultVolumePluginDir       = "/usr/libexec/kubernetes/kubelet-plugins/volume/exec/"
+
+	// See https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2570-memory-qos
+	DefaultMemoryThrottlingFactor = 0.8
 )
 
 var (
@@ -255,4 +258,7 @@ func SetDefaults_KubeletConfiguration(obj *kubeletconfigv1beta1.KubeletConfigura
 	if obj.SeccompDefault == nil {
 		obj.SeccompDefault = utilpointer.BoolPtr(false)
 	}
+	if obj.MemoryThrottlingFactor == nil {
+		obj.MemoryThrottlingFactor = utilpointer.Float64Ptr(DefaultMemoryThrottlingFactor)
+	}
 }

@@ -387,6 +387,7 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in
 	if err := v1.Convert_Pointer_bool_To_bool(&in.SeccompDefault, &out.SeccompDefault, s); err != nil {
 		return err
 	}
+	out.MemoryThrottlingFactor = (*float64)(unsafe.Pointer(in.MemoryThrottlingFactor))
 	return nil
 }
 
@@ -554,6 +555,7 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in
 	if err := v1.Convert_bool_To_Pointer_bool(&in.SeccompDefault, &out.SeccompDefault, s); err != nil {
 		return err
 	}
+	out.MemoryThrottlingFactor = (*float64)(unsafe.Pointer(in.MemoryThrottlingFactor))
 	return nil
 }

@@ -212,5 +212,13 @@ func ValidateKubeletConfiguration(kc *kubeletconfig.KubeletConfiguration) error
 	if errs := logs.ValidateLoggingConfiguration(&kc.Logging, field.NewPath("logging")); len(errs) > 0 {
 		allErrors = append(allErrors, errs.ToAggregate().Errors()...)
 	}
+
+	if localFeatureGate.Enabled(features.MemoryQoS) && kc.MemoryThrottlingFactor == nil {
+		allErrors = append(allErrors, fmt.Errorf("invalid configuration: memoryThrottlingFactor is required when MemoryQoS feature flag is enabled"))
+	}
+	if kc.MemoryThrottlingFactor != nil && (*kc.MemoryThrottlingFactor <= 0 || *kc.MemoryThrottlingFactor > 1.0) {
+		allErrors = append(allErrors, fmt.Errorf("invalid configuration: memoryThrottlingFactor %v must be greater than 0 and less than or equal to 1.0", kc.MemoryThrottlingFactor))
+	}
+
 	return utilerrors.NewAggregate(allErrors)
 }

@@ -25,6 +25,7 @@ import (
 	componentbaseconfig "k8s.io/component-base/config"
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
+	utilpointer "k8s.io/utils/pointer"
 )
 
 func TestValidateKubeletConfiguration(t *testing.T) {
@@ -59,9 +60,11 @@ func TestValidateKubeletConfiguration(t *testing.T) {
 		TopologyManagerPolicy:           kubeletconfig.SingleNumaNodeTopologyManagerPolicy,
 		ShutdownGracePeriod:             metav1.Duration{Duration: 30 * time.Second},
 		ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second},
+		MemoryThrottlingFactor:          utilpointer.Float64Ptr(0.8),
 		FeatureGates: map[string]bool{
 			"CustomCPUCFSQuotaPeriod": true,
 			"GracefulNodeShutdown":    true,
+			"MemoryQoS":               true,
 		},
 		Logging: componentbaseconfig.LoggingConfiguration{
 			Format: "text",
@@ -103,8 +106,10 @@ func TestValidateKubeletConfiguration(t *testing.T) {
 		TopologyManagerPolicy:           kubeletconfig.NoneTopologyManagerPolicy,
 		ShutdownGracePeriod:             metav1.Duration{Duration: 10 * time.Minute},
 		ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 0},
+		MemoryThrottlingFactor:          utilpointer.Float64Ptr(0.9),
 		FeatureGates: map[string]bool{
 			"CustomCPUCFSQuotaPeriod": true,
+			"MemoryQoS":               true,
 		},
 		Logging: componentbaseconfig.LoggingConfiguration{
 			Format: "text",
@@ -147,10 +152,12 @@ func TestValidateKubeletConfiguration(t *testing.T) {
 		ShutdownGracePeriod:             metav1.Duration{Duration: 10 * time.Minute},
 		ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 0},
 		MemorySwap:                      kubeletconfig.MemorySwapConfiguration{SwapBehavior: kubetypes.UnlimitedSwap},
+		MemoryThrottlingFactor:          utilpointer.Float64Ptr(0.5),
 		FeatureGates: map[string]bool{
 			"CustomCPUCFSQuotaPeriod": true,
 			"GracefulNodeShutdown":    true,
 			"NodeSwapEnabled":         true,
+			"MemoryQoS":               true,
 		},
 		Logging: componentbaseconfig.LoggingConfiguration{
 			Format: "text",
@@ -230,16 +237,18 @@ func TestValidateKubeletConfiguration(t *testing.T) {
 		ShutdownGracePeriod:             metav1.Duration{Duration: 40 * time.Second},
 		ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second},
 		MemorySwap:                      kubeletconfig.MemorySwapConfiguration{SwapBehavior: "invalid"},
+		MemoryThrottlingFactor:          utilpointer.Float64Ptr(1.1),
 		FeatureGates: map[string]bool{
 			"CustomCPUCFSQuotaPeriod": true,
 			"GracefulNodeShutdown":    true,
 			"NodeSwapEnabled":         true,
+			"MemoryQoS":               true,
 		},
 		Logging: componentbaseconfig.LoggingConfiguration{
 			Format: "text",
 		},
 	}
-	const numErrsErrorCase2 = 4
+	const numErrsErrorCase2 = 5
 	if allErrors := ValidateKubeletConfiguration(errorCase2); len(allErrors.(utilerrors.Aggregate).Errors()) != numErrsErrorCase2 {
 		t.Errorf("expect %d errors, got %v", numErrsErrorCase2, len(allErrors.(utilerrors.Aggregate).Errors()))
 	}

							
								
								
									
pkg/kubelet/apis/config/zz_generated.deepcopy.go (generated)

@@ -282,6 +282,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) {
 			(*in)[i].DeepCopyInto(&(*out)[i])
 		}
 	}
+	if in.MemoryThrottlingFactor != nil {
+		in, out := &in.MemoryThrottlingFactor, &out.MemoryThrottlingFactor
+		*out = new(float64)
+		**out = **in
+	}
 	return
 }

@@ -53,6 +53,10 @@ const (
 	libcontainerSystemd libcontainerCgroupManagerType = "systemd"
 	// systemdSuffix is the cgroup name suffix for systemd
 	systemdSuffix string = ".slice"
+	// MemoryMin is memory.min for cgroup v2
+	MemoryMin string = "memory.min"
+	// MemoryHigh is memory.high for cgroup v2
+	MemoryHigh string = "memory.high"
 )
 
 var RootCgroupName = CgroupName([]string{})
@@ -434,6 +438,15 @@ func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcont
 			Limit:    uint64(0),
 		})
 	}
+	// Ideally unified is used for all the resources when running on cgroup v2.
+	// It doesn't make difference for the memory.max limit, but for e.g. the cpu controller
+	// you can specify the correct setting without relying on the conversions performed by the OCI runtime.
+	if resourceConfig.Unified != nil && libcontainercgroups.IsCgroup2UnifiedMode() {
+		resources.Unified = make(map[string]string)
+		for k, v := range resourceConfig.Unified {
+			resources.Unified[k] = v
+		}
+	}
 	return resources
 }

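The `Unified` map mirrors the OCI runtime spec's `unified` field: each key names a file in the target cgroup v2 directory and the value is written into that file verbatim. A sketch of the on-disk effect (the path is illustrative; in the kubelet, libcontainer performs the writes):

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// applyUnified illustrates what a Unified map ultimately means on disk:
// each key/value pair becomes a write to the file of the same name
// inside the cgroup directory.
func applyUnified(cgroupDir string, unified map[string]string) error {
	for file, value := range unified {
		p := filepath.Join(cgroupDir, file)
		if err := os.WriteFile(p, []byte(value), 0o644); err != nil {
			return fmt.Errorf("writing %s: %w", p, err)
		}
	}
	return nil
}

func main() {
	// memory.min = 100Mi for a hypothetical pod cgroup.
	err := applyUnified("/sys/fs/cgroup/kubepods.slice/pod-example",
		map[string]string{"memory.min": "104857600"})
	fmt.Println(err)
}
```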
@@ -112,6 +112,9 @@ type ContainerManager interface {
 	// GetAllocateResourcesPodAdmitHandler returns an instance of a PodAdmitHandler responsible for allocating pod resources.
 	GetAllocateResourcesPodAdmitHandler() lifecycle.PodAdmitHandler
 
+	// GetNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement.
+	GetNodeAllocatableAbsolute() v1.ResourceList
+
 	// Implements the podresources Provider API for CPUs, Memory and Devices
 	podresources.CPUsProvider
 	podresources.DevicesProvider

@@ -491,7 +491,7 @@ func (cm *containerManagerImpl) setupNode(activePods ActivePodsFunc) error {
 		if err := cm.createNodeAllocatableCgroups(); err != nil {
 			return err
 		}
-		err = cm.qosContainerManager.Start(cm.getNodeAllocatableAbsolute, activePods)
+		err = cm.qosContainerManager.Start(cm.GetNodeAllocatableAbsolute, activePods)
 		if err != nil {
 			return fmt.Errorf("failed to initialize top level QOS containers: %v", err)
 		}

@@ -147,6 +147,10 @@ func (cm *containerManagerStub) GetAllocatableMemory() []*podresourcesapi.Contai
 	return nil
 }
 
+func (cm *containerManagerStub) GetNodeAllocatableAbsolute() v1.ResourceList {
+	return nil
+}
+
 func NewStubContainerManager() ContainerManager {
 	return &containerManagerStub{shouldResetExtendedResourceCapacity: false}
 }

@@ -251,3 +251,7 @@ func (cm *containerManagerImpl) GetMemory(_, _ string) []*podresourcesapi.Contai
 func (cm *containerManagerImpl) GetAllocatableMemory() []*podresourcesapi.ContainerMemory {
 	return nil
 }
+
+func (cm *containerManagerImpl) GetNodeAllocatableAbsolute() v1.ResourceList {
+	return nil
+}

@@ -227,3 +227,9 @@ func (cm *FakeContainerManager) GetAllocatableMemory() []*podresourcesapi.Contai
 	defer cm.Unlock()
 	return nil
 }
+
+func (cm *FakeContainerManager) GetNodeAllocatableAbsolute() v1.ResourceList {
+	cm.Lock()
+	defer cm.Unlock()
+	return nil
+}

@@ -113,7 +113,7 @@ func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 {
 }
 
 // ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
-func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64) *ResourceConfig {
+func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, enforceMemoryQoS bool) *ResourceConfig {
 	// sum requests and limits.
 	reqs, limits := resource.PodRequestsAndLimits(pod)
 
@@ -185,6 +185,19 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64)
 		result.CpuShares = &shares
 	}
 	result.HugePageLimit = hugePageLimits
+
+	if enforceMemoryQoS {
+		memoryMin := int64(0)
+		if request, found := reqs[v1.ResourceMemory]; found {
+			memoryMin = request.Value()
+		}
+		if memoryMin > 0 {
+			result.Unified = map[string]string{
+				MemoryMin: strconv.FormatInt(memoryMin, 10),
+			}
+		}
+	}
+
 	return result
 }

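The `memory.min` values asserted in the tests that follow are simply the pod's aggregate memory request rendered in bytes, e.g. 100Mi = 100 * 1024 * 1024 = 104857600:

```go
package main

import (
	"fmt"
	"strconv"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// The string ResourceConfigForPod stores under Unified["memory.min"]
	// for a pod requesting 100Mi of memory.
	q := resource.MustParse("100Mi")
	fmt.Println(strconv.FormatInt(q.Value(), 10)) // "104857600"
}
```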
@@ -252,7 +252,7 @@ func TestResourceConfigForPod(t *testing.T) {
 
 	for testName, testCase := range testCases {
 
-		actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod)
+		actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod, false)
 
 		if !reflect.DeepEqual(actual.CpuPeriod, testCase.expected.CpuPeriod) {
 			t.Errorf("unexpected result, test: %v, cpu period not as expected", testName)
@@ -472,7 +472,7 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
 
 	for testName, testCase := range testCases {
 
-		actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod)
+		actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod, false)
 
 		if !reflect.DeepEqual(actual.CpuPeriod, testCase.expected.CpuPeriod) {
 			t.Errorf("unexpected result, test: %v, cpu period not as expected", testName)
@@ -622,3 +622,211 @@ func TestHugePageLimits(t *testing.T) {
 
 	}
 }
+
+func TestResourceConfigForPodWithEnforceMemoryQoS(t *testing.T) {
+	defaultQuotaPeriod := uint64(100 * time.Millisecond / time.Microsecond)
+	tunedQuotaPeriod := uint64(5 * time.Millisecond / time.Microsecond)
+
+	minShares := uint64(MinShares)
+	burstableShares := MilliCPUToShares(100)
+	memoryQuantity := resource.MustParse("200Mi")
+	burstableMemory := memoryQuantity.Value()
+	burstablePartialShares := MilliCPUToShares(200)
+	burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod))
+	guaranteedShares := MilliCPUToShares(100)
+	guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod))
+	guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod))
+	memoryQuantity = resource.MustParse("100Mi")
+	cpuNoLimit := int64(-1)
+	guaranteedMemory := memoryQuantity.Value()
+	testCases := map[string]struct {
+		pod              *v1.Pod
+		expected         *ResourceConfig
+		enforceCPULimits bool
+		quotaPeriod      uint64
+	}{
+		"besteffort": {
+			pod: &v1.Pod{
+				Spec: v1.PodSpec{
+					Containers: []v1.Container{
+						{
+							Resources: getResourceRequirements(getResourceList("", ""), getResourceList("", "")),
+						},
+					},
+				},
+			},
+			enforceCPULimits: true,
+			quotaPeriod:      defaultQuotaPeriod,
+			expected:         &ResourceConfig{CpuShares: &minShares},
+		},
+		"burstable-no-limits": {
+			pod: &v1.Pod{
+				Spec: v1.PodSpec{
+					Containers: []v1.Container{
+						{
+							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("", "")),
+						},
+					},
+				},
+			},
+			enforceCPULimits: true,
+			quotaPeriod:      defaultQuotaPeriod,
+			expected:         &ResourceConfig{CpuShares: &burstableShares, Unified: map[string]string{"memory.min": "104857600"}},
+		},
+		"burstable-with-limits": {
+			pod: &v1.Pod{
+				Spec: v1.PodSpec{
+					Containers: []v1.Container{
+						{
+							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")),
+						},
+					},
+				},
+			},
+			enforceCPULimits: true,
+			quotaPeriod:      defaultQuotaPeriod,
+			expected:         &ResourceConfig{CpuShares: &burstableShares, CpuQuota: &burstableQuota, CpuPeriod: &defaultQuotaPeriod, Memory: &burstableMemory, Unified: map[string]string{"memory.min": "104857600"}},
+		},
+		"burstable-with-limits-no-cpu-enforcement": {
+			pod: &v1.Pod{
+				Spec: v1.PodSpec{
+					Containers: []v1.Container{
+						{
+							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")),
+						},
+					},
+				},
+			},
+			enforceCPULimits: false,
+			quotaPeriod:      defaultQuotaPeriod,
+			expected:         &ResourceConfig{CpuShares: &burstableShares, CpuQuota: &cpuNoLimit, CpuPeriod: &defaultQuotaPeriod, Memory: &burstableMemory, Unified: map[string]string{"memory.min": "104857600"}},
+		},
+		"burstable-partial-limits": {
+			pod: &v1.Pod{
+				Spec: v1.PodSpec{
+					Containers: []v1.Container{
+						{
+							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")),
+						},
+						{
+							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("", "")),
+						},
+					},
+				},
+			},
+			enforceCPULimits: true,
+			quotaPeriod:      defaultQuotaPeriod,
+			expected:         &ResourceConfig{CpuShares: &burstablePartialShares, Unified: map[string]string{"memory.min": "209715200"}},
+		},
+		"burstable-with-limits-with-tuned-quota": {
+			pod: &v1.Pod{
+				Spec: v1.PodSpec{
+					Containers: []v1.Container{
+						{
+							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")),
+						},
+					},
+				},
+			},
+			enforceCPULimits: true,
+			quotaPeriod:      tunedQuotaPeriod,
+			expected:         &ResourceConfig{CpuShares: &burstableShares, CpuQuota: &burstableQuota, CpuPeriod: &tunedQuotaPeriod, Memory: &burstableMemory, Unified: map[string]string{"memory.min": "104857600"}},
+		},
+		"burstable-with-limits-no-cpu-enforcement-with-tuned-quota": {
+			pod: &v1.Pod{
+				Spec: v1.PodSpec{
+					Containers: []v1.Container{
+						{
+							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")),
+						},
+					},
+				},
+			},
+			enforceCPULimits: false,
+			quotaPeriod:      tunedQuotaPeriod,
+			expected:         &ResourceConfig{CpuShares: &burstableShares, CpuQuota: &cpuNoLimit, CpuPeriod: &tunedQuotaPeriod, Memory: &burstableMemory, Unified: map[string]string{"memory.min": "104857600"}},
+		},
+		"burstable-partial-limits-with-tuned-quota": {
+			pod: &v1.Pod{
+				Spec: v1.PodSpec{
+					Containers: []v1.Container{
+						{
+							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")),
+						},
+						{
+							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("", "")),
+						},
+					},
+				},
+			},
+			enforceCPULimits: true,
+			quotaPeriod:      tunedQuotaPeriod,
+			expected:         &ResourceConfig{CpuShares: &burstablePartialShares, Unified: map[string]string{"memory.min": "209715200"}},
+		},
+		"guaranteed": {
+			pod: &v1.Pod{
+				Spec: v1.PodSpec{
+					Containers: []v1.Container{
+						{
+							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
+						},
+					},
+				},
+			},
+			enforceCPULimits: true,
+			quotaPeriod:      defaultQuotaPeriod,
+			expected:         &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory, Unified: map[string]string{"memory.min": "104857600"}},
+		},
+		"guaranteed-no-cpu-enforcement": {
+			pod: &v1.Pod{
+				Spec: v1.PodSpec{
+					Containers: []v1.Container{
+						{
+							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
+						},
+					},
+				},
+			},
+			enforceCPULimits: false,
+			quotaPeriod:      defaultQuotaPeriod,
+			expected:         &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory, Unified: map[string]string{"memory.min": "104857600"}},
+		},
+		"guaranteed-with-tuned-quota": {
+			pod: &v1.Pod{
+				Spec: v1.PodSpec{
+					Containers: []v1.Container{
+						{
+							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
+						},
+					},
+				},
+			},
+			enforceCPULimits: true,
+			quotaPeriod:      tunedQuotaPeriod,
+			expected:         &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedTunedQuota, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory, Unified: map[string]string{"memory.min": "104857600"}},
+		},
+		"guaranteed-no-cpu-enforcement-with-tuned-quota": {
+			pod: &v1.Pod{
+				Spec: v1.PodSpec{
+					Containers: []v1.Container{
+						{
+							Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
+						},
+					},
+				},
+			},
+			enforceCPULimits: false,
+			quotaPeriod:      tunedQuotaPeriod,
+			expected:         &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory, Unified: map[string]string{"memory.min": "104857600"}},
+		},
+	}
+
+	for testName, testCase := range testCases {
+
+		actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod, true)
+
+		if !reflect.DeepEqual(actual.Unified, testCase.expected.Unified) {
+			t.Errorf("unexpected result, test: %v, unified not as expected", testName)
+		}
+	}
+}

@@ -42,7 +42,7 @@ func MilliCPUToShares(milliCPU int64) int64 {
 }
 
 // ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
-func ResourceConfigForPod(pod *v1.Pod, enforceCPULimit bool, cpuPeriod uint64) *ResourceConfig {
+func ResourceConfigForPod(pod *v1.Pod, enforceCPULimit bool, cpuPeriod uint64, enforceMemoryQoS bool) *ResourceConfig {
 	return nil
 }

@@ -20,13 +20,16 @@ package cm
 
 import (
 	"fmt"
+	"strconv"
 	"strings"
 	"time"
 
-	"k8s.io/api/core/v1"
+	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/apimachinery/pkg/types"
+	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	"k8s.io/klog/v2"
+	kubefeatures "k8s.io/kubernetes/pkg/features"
 	"k8s.io/kubernetes/pkg/kubelet/events"
 	"k8s.io/kubernetes/pkg/kubelet/stats/pidlimit"
 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
@@ -131,9 +134,22 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 
 // enforceExistingCgroup updates the limits `rl` on existing cgroup `cName` using `cgroupManager` interface.
 func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.ResourceList) error {
+	rp := getCgroupConfig(rl)
+
+	// Enforce MemoryQoS for cgroups of kube-reserved/system-reserved. For more information,
+	// see https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2570-memory-qos
+	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) {
+		if rp.Memory != nil {
+			if rp.Unified == nil {
+				rp.Unified = make(map[string]string)
+			}
+			rp.Unified[MemoryMin] = strconv.FormatInt(*rp.Memory, 10)
+		}
+	}
+
 	cgroupConfig := &CgroupConfig{
 		Name:               cName,
-		ResourceParameters: getCgroupConfig(rl),
+		ResourceParameters: rp,
 	}
 	if cgroupConfig.ResourceParameters == nil {
 		return fmt.Errorf("%q cgroup is not config properly", cgroupConfig.Name)
@@ -174,10 +190,10 @@ func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
 	return &rc
 }
 
-// getNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement.
+// GetNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement.
 // Note that not all resources that are available on the node are included in the returned list of resources.
 // Returns a ResourceList.
-func (cm *containerManagerImpl) getNodeAllocatableAbsolute() v1.ResourceList {
+func (cm *containerManagerImpl) GetNodeAllocatableAbsolute() v1.ResourceList {
 	return cm.getNodeAllocatableAbsoluteImpl(cm.capacity)
 }

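For context, `enforceExistingCgroup` is the path the kubelet takes when node allocatable enforcement covers the kube-reserved/system-reserved cgroups, so with the gate on, their memory reservation is also pinned via `memory.min`. An illustrative configuration fragment that exercises this path (all field values are examples only):

```yaml
# KubeletConfiguration fragment (values illustrative)
featureGates:
  MemoryQoS: true
kubeReserved:
  memory: 1Gi
systemReserved:
  memory: 500Mi
enforceNodeAllocatable:
  - pods
  - kube-reserved
  - system-reserved
kubeReservedCgroup: /kube.slice
systemReservedCgroup: /system.slice
```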
@@ -250,7 +250,7 @@ func TestNodeAllocatableForEnforcement(t *testing.T) {
 			NodeConfig: nc,
 			capacity:   tc.capacity,
 		}
-		for k, v := range cm.getNodeAllocatableAbsolute() {
+		for k, v := range cm.GetNodeAllocatableAbsolute() {
 			expected, exists := tc.expected[k]
 			assert.True(t, exists)
 			assert.Equal(t, expected.MilliValue(), v.MilliValue(), "test case %d failed for resource %q", idx+1, k)

@@ -23,11 +23,14 @@ import (
 	"path"
 	"strings"
 
-	v1 "k8s.io/api/core/v1"
+	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
+	"k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/types"
 	utilerrors "k8s.io/apimachinery/pkg/util/errors"
+	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	"k8s.io/klog/v2"
 	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
+	kubefeatures "k8s.io/kubernetes/pkg/features"
 )
 
 const (
@@ -71,14 +74,22 @@ func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error {
 	// check if container already exist
 	alreadyExists := m.Exists(pod)
 	if !alreadyExists {
+		enforceMemoryQoS := false
+		if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
+			libcontainercgroups.IsCgroup2UnifiedMode() {
+			enforceMemoryQoS = true
+		}
 		// Create the pod container
 		containerConfig := &CgroupConfig{
 			Name:               podContainerName,
-			ResourceParameters: ResourceConfigForPod(pod, m.enforceCPULimits, m.cpuCFSQuotaPeriod),
+			ResourceParameters: ResourceConfigForPod(pod, m.enforceCPULimits, m.cpuCFSQuotaPeriod, enforceMemoryQoS),
 		}
 		if m.podPidsLimit > 0 {
 			containerConfig.ResourceParameters.PidsLimit = &m.podPidsLimit
 		}
+		if enforceMemoryQoS {
+			klog.V(4).InfoS("MemoryQoS config for pod", "pod", klog.KObj(pod), "unified", containerConfig.ResourceParameters.Unified)
+		}
 		if err := m.cgroupManager.Create(containerConfig); err != nil {
 			return fmt.Errorf("failed to create container for %v : %v", podContainerName, err)
 		}

@@ -18,6 +18,7 @@ package cm
 
 import (
 	"fmt"
+	"strconv"
 	"strings"
 	"sync"
 	"time"
@@ -27,6 +28,7 @@ import (
 	"k8s.io/apimachinery/pkg/util/wait"
 
 	units "github.com/docker/go-units"
+	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
 	cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
 	v1 "k8s.io/api/core/v1"
 	utilfeature "k8s.io/apiserver/pkg/util/feature"
@@ -191,10 +193,9 @@ func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]
 	return nil
 }
 
-// setMemoryReserve sums the memory limits of all pods in a QOS class,
-// calculates QOS class memory limits, and set those limits in the
-// CgroupConfig for each QOS class.
-func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
+// getQoSMemoryRequests sums and returns the memory request of all pods for
+// guaranteed and burstable qos classes.
+func (m *qosContainerManagerImpl) getQoSMemoryRequests() map[v1.PodQOSClass]int64 {
 	qosMemoryRequests := map[v1.PodQOSClass]int64{
 		v1.PodQOSGuaranteed: 0,
 		v1.PodQOSBurstable:  0,
@@ -216,6 +217,15 @@ func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*C
 		qosMemoryRequests[qosClass] += podMemoryRequest
 	}
 
+	return qosMemoryRequests
+}
+
+// setMemoryReserve sums the memory limits of all pods in a QOS class,
+// calculates QOS class memory limits, and set those limits in the
+// CgroupConfig for each QOS class.
+func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
+	qosMemoryRequests := m.getQoSMemoryRequests()
+
 	resources := m.getNodeAllocatable()
 	allocatableResource, ok := resources[v1.ResourceMemory]
 	if !ok {
@@ -265,11 +275,43 @@ func (m *qosContainerManagerImpl) retrySetMemoryReserve(configs map[v1.PodQOSCla
 	}
 }
 
+// setMemoryQoS sums the memory requests of all pods in the Burstable class,
+// and set the sum memory as the memory.min in the Unified field of CgroupConfig.
+func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*CgroupConfig) {
+	qosMemoryRequests := m.getQoSMemoryRequests()
+
+	// Calculate the memory.min:
+	// for burstable(/kubepods/burstable): sum of all burstable pods
+	// for guaranteed(/kubepods): sum of all guaranteed and burstable pods
+	burstableMin := qosMemoryRequests[v1.PodQOSBurstable]
+	guaranteedMin := qosMemoryRequests[v1.PodQOSGuaranteed] + burstableMin
+
+	if burstableMin > 0 {
+		if configs[v1.PodQOSBurstable].ResourceParameters.Unified == nil {
+			configs[v1.PodQOSBurstable].ResourceParameters.Unified = make(map[string]string)
+		}
+		configs[v1.PodQOSBurstable].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(burstableMin, 10)
+		klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSBurstable, "memory.min", burstableMin)
+	}
+
+	if guaranteedMin > 0 {
+		if configs[v1.PodQOSGuaranteed].ResourceParameters.Unified == nil {
+			configs[v1.PodQOSGuaranteed].ResourceParameters.Unified = make(map[string]string)
+		}
+		configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
+		klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSGuaranteed, "memory.min", guaranteedMin)
+	}
+}
+
 func (m *qosContainerManagerImpl) UpdateCgroups() error {
 	m.Lock()
 	defer m.Unlock()
 
 	qosConfigs := map[v1.PodQOSClass]*CgroupConfig{
+		v1.PodQOSGuaranteed: {
+			Name:               m.qosContainersInfo.Guaranteed,
+			ResourceParameters: &ResourceConfig{},
+		},
 		v1.PodQOSBurstable: {
 			Name:               m.qosContainersInfo.Burstable,
 			ResourceParameters: &ResourceConfig{},
@@ -290,6 +332,12 @@ func (m *qosContainerManagerImpl) UpdateCgroups() error {
 		return err
 	}
 
+	// update the qos level cgrougs v2 settings of memory qos if feature enabled
+	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
+		libcontainercgroups.IsCgroup2UnifiedMode() {
+		m.setMemoryQoS(qosConfigs)
+	}
+
 	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.QOSReserved) {
 		for resource, percentReserve := range m.qosReserved {
 			switch resource {

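`setMemoryQoS` aggregates bottom-up: the burstable cgroup's `memory.min` covers burstable requests only, while the kubepods root must also cover guaranteed pods, since cgroup v2 distributes `memory.min` down the hierarchy. With the pods in the new test below (guaranteed 128Mi; burstable 128Mi and 256Mi), the sums work out as in this sketch:

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	guaranteed := resource.MustParse("128Mi").Value()
	burstable := resource.MustParse("128Mi").Value() + resource.MustParse("256Mi").Value()

	// /kubepods/burstable covers burstable requests only; the /kubepods
	// root covers guaranteed plus burstable.
	fmt.Println("burstable memory.min:", burstable)           // 402653184 (384Mi)
	fmt.Println("kubepods memory.min:", guaranteed+burstable) // 536870912 (512Mi)
}
```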
							
								
								
									
pkg/kubelet/cm/qos_container_manager_linux_test.go (new file, 154 lines)

@@ -0,0 +1,154 @@
|  | // +build linux | ||||||
|  |  | ||||||
|  | /* | ||||||
|  | Copyright 2021 The Kubernetes Authors. | ||||||
|  |  | ||||||
|  | Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  | you may not use this file except in compliance with the License. | ||||||
|  | You may obtain a copy of the License at | ||||||
|  |  | ||||||
|  |     http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  |  | ||||||
|  | Unless required by applicable law or agreed to in writing, software | ||||||
|  | distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  | See the License for the specific language governing permissions and | ||||||
|  | limitations under the License. | ||||||
|  | */ | ||||||
|  |  | ||||||
|  | package cm | ||||||
|  |  | ||||||
|  | import ( | ||||||
|  | 	"fmt" | ||||||
|  | 	"strconv" | ||||||
|  | 	"testing" | ||||||
|  |  | ||||||
|  | 	"github.com/stretchr/testify/assert" | ||||||
|  | 	v1 "k8s.io/api/core/v1" | ||||||
|  | 	"k8s.io/apimachinery/pkg/api/resource" | ||||||
|  | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | func activeTestPods() []*v1.Pod { | ||||||
|  | 	return []*v1.Pod{ | ||||||
|  | 		{ | ||||||
|  | 			ObjectMeta: metav1.ObjectMeta{ | ||||||
|  | 				UID:       "12345678", | ||||||
|  | 				Name:      "guaranteed-pod", | ||||||
|  | 				Namespace: "test", | ||||||
|  | 			}, | ||||||
|  | 			Spec: v1.PodSpec{ | ||||||
|  | 				Containers: []v1.Container{ | ||||||
|  | 					{ | ||||||
|  | 						Name:  "foo", | ||||||
|  | 						Image: "busybox", | ||||||
|  | 						Resources: v1.ResourceRequirements{ | ||||||
|  | 							Requests: v1.ResourceList{ | ||||||
|  | 								v1.ResourceMemory: resource.MustParse("128Mi"), | ||||||
|  | 								v1.ResourceCPU:    resource.MustParse("1"), | ||||||
|  | 							}, | ||||||
|  | 							Limits: v1.ResourceList{ | ||||||
|  | 								v1.ResourceMemory: resource.MustParse("128Mi"), | ||||||
|  | 								v1.ResourceCPU:    resource.MustParse("1"), | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			ObjectMeta: metav1.ObjectMeta{ | ||||||
|  | 				UID:       "87654321", | ||||||
|  | 				Name:      "burstable-pod-1", | ||||||
|  | 				Namespace: "test", | ||||||
|  | 			}, | ||||||
|  | 			Spec: v1.PodSpec{ | ||||||
|  | 				Containers: []v1.Container{ | ||||||
|  | 					{ | ||||||
|  | 						Name:  "foo", | ||||||
|  | 						Image: "busybox", | ||||||
|  | 						Resources: v1.ResourceRequirements{ | ||||||
|  | 							Requests: v1.ResourceList{ | ||||||
|  | 								v1.ResourceMemory: resource.MustParse("128Mi"), | ||||||
|  | 								v1.ResourceCPU:    resource.MustParse("1"), | ||||||
|  | 							}, | ||||||
|  | 							Limits: v1.ResourceList{ | ||||||
|  | 								v1.ResourceMemory: resource.MustParse("256Mi"), | ||||||
|  | 								v1.ResourceCPU:    resource.MustParse("2"), | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			ObjectMeta: metav1.ObjectMeta{ | ||||||
|  | 				UID:       "01234567", | ||||||
|  | 				Name:      "burstable-pod-2", | ||||||
|  | 				Namespace: "test", | ||||||
|  | 			}, | ||||||
|  | 			Spec: v1.PodSpec{ | ||||||
|  | 				Containers: []v1.Container{ | ||||||
|  | 					{ | ||||||
|  | 						Name:  "foo", | ||||||
|  | 						Image: "busybox", | ||||||
|  | 						Resources: v1.ResourceRequirements{ | ||||||
|  | 							Requests: v1.ResourceList{ | ||||||
|  | 								v1.ResourceMemory: resource.MustParse("256Mi"), | ||||||
|  | 								v1.ResourceCPU:    resource.MustParse("2"), | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func createTestQOSContainerManager() (*qosContainerManagerImpl, error) { | ||||||
|  | 	subsystems, err := GetCgroupSubsystems() | ||||||
|  | 	if err != nil { | ||||||
|  | 		return nil, fmt.Errorf("failed to get mounted cgroup subsystems: %v", err) | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	cgroupRoot := ParseCgroupfsToCgroupName("/") | ||||||
|  | 	cgroupRoot = NewCgroupName(cgroupRoot, defaultNodeAllocatableCgroupName) | ||||||
|  |  | ||||||
|  | 	qosContainerManager := &qosContainerManagerImpl{ | ||||||
|  | 		subsystems:    subsystems, | ||||||
|  | 		cgroupManager: NewCgroupManager(subsystems, "cgroupfs"), | ||||||
|  | 		cgroupRoot:    cgroupRoot, | ||||||
|  | 		qosReserved:   nil, | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	qosContainerManager.activePods = activeTestPods | ||||||
|  |  | ||||||
|  | 	return qosContainerManager, nil | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func TestQoSContainerCgroup(t *testing.T) { | ||||||
|  | 	m, err := createTestQOSContainerManager() | ||||||
|  | 	assert.Nil(t, err) | ||||||
|  |  | ||||||
|  | 	qosConfigs := map[v1.PodQOSClass]*CgroupConfig{ | ||||||
|  | 		v1.PodQOSGuaranteed: { | ||||||
|  | 			Name:               m.qosContainersInfo.Guaranteed, | ||||||
|  | 			ResourceParameters: &ResourceConfig{}, | ||||||
|  | 		}, | ||||||
|  | 		v1.PodQOSBurstable: { | ||||||
|  | 			Name:               m.qosContainersInfo.Burstable, | ||||||
|  | 			ResourceParameters: &ResourceConfig{}, | ||||||
|  | 		}, | ||||||
|  | 		v1.PodQOSBestEffort: { | ||||||
|  | 			Name:               m.qosContainersInfo.BestEffort, | ||||||
|  | 			ResourceParameters: &ResourceConfig{}, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	m.setMemoryQoS(qosConfigs) | ||||||
|  |  | ||||||
|  | 	burstableMin := resource.MustParse("384Mi") | ||||||
|  | 	guaranteedMin := resource.MustParse("128Mi") | ||||||
|  | 	assert.Equal(t, qosConfigs[v1.PodQOSGuaranteed].ResourceParameters.Unified["memory.min"], strconv.FormatInt(burstableMin.Value()+guaranteedMin.Value(), 10)) | ||||||
|  | 	assert.Equal(t, qosConfigs[v1.PodQOSBurstable].ResourceParameters.Unified["memory.min"], strconv.FormatInt(burstableMin.Value(), 10)) | ||||||
|  | } | ||||||
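The setMemoryQoS implementation itself is not shown in this hunk. As a minimal sketch of the aggregation the test exercises — assuming it lives in the same cm package so qosContainerManagerImpl, CgroupConfig, and the activePods field are in scope, and using v1qos.GetPodQOS for classification — it could look like the following; the real method may differ in detail, so the sketch uses a distinct name.

package cm

import (
	"strconv"

	v1 "k8s.io/api/core/v1"
	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
)

// setMemoryQoSSketch sums container memory requests per QoS class and writes
// the totals as cgroup v2 memory.min. Guaranteed pods sit directly under the
// node allocatable root cgroup, which is why that cgroup's memory.min covers
// guaranteed plus burstable requests (matching the assertions above).
func (m *qosContainerManagerImpl) setMemoryQoSSketch(configs map[v1.PodQOSClass]*CgroupConfig) {
	var burstableMin, guaranteedMin int64
	for _, pod := range m.activePods() {
		for i := range pod.Spec.Containers {
			req := pod.Spec.Containers[i].Resources.Requests.Memory().Value()
			switch v1qos.GetPodQOS(pod) {
			case v1.PodQOSBurstable:
				burstableMin += req
			case v1.PodQOSGuaranteed:
				guaranteedMin += req
			}
		}
	}
	if burstableMin > 0 {
		if configs[v1.PodQOSBurstable].ResourceParameters.Unified == nil {
			configs[v1.PodQOSBurstable].ResourceParameters.Unified = map[string]string{}
		}
		configs[v1.PodQOSBurstable].ResourceParameters.Unified["memory.min"] = strconv.FormatInt(burstableMin, 10)
	}
	if total := guaranteedMin + burstableMin; total > 0 {
		if configs[v1.PodQOSGuaranteed].ResourceParameters.Unified == nil {
			configs[v1.PodQOSGuaranteed].ResourceParameters.Unified = map[string]string{}
		}
		configs[v1.PodQOSGuaranteed].ResourceParameters.Unified["memory.min"] = strconv.FormatInt(total, 10)
	}
}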
| @@ -35,6 +35,8 @@ type ResourceConfig struct { | |||||||
| 	HugePageLimit map[int64]int64 | 	HugePageLimit map[int64]int64 | ||||||
| 	// Maximum number of pids | 	// Maximum number of pids | ||||||
| 	PidsLimit *int64 | 	PidsLimit *int64 | ||||||
|  | 	// Unified resources for cgroup v2, keyed by cgroup interface file name (e.g. "memory.min") | ||||||
|  | 	Unified map[string]string | ||||||
| } | } | ||||||
|  |  | ||||||
| // CgroupName is the abstract name of a cgroup prior to any driver specific conversion. | // CgroupName is the abstract name of a cgroup prior to any driver specific conversion. | ||||||
|   | |||||||
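A brief usage sketch of the new field, with illustrative values and assuming the surrounding cm package types:

// Illustrative only: cgroup v2 settings travel in the Unified map; keys are
// cgroup v2 interface file names, values the raw strings written to them.
cfg := &ResourceConfig{
	Unified: map[string]string{
		"memory.min":  "134217728", // 128Mi in bytes
		"memory.high": "214748364", // int64(256Mi * 0.8)
	},
}
_ = cfg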
| @@ -666,6 +666,8 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, | |||||||
| 		klet.runtimeClassManager, | 		klet.runtimeClassManager, | ||||||
| 		seccompDefault, | 		seccompDefault, | ||||||
| 		kubeCfg.MemorySwap.SwapBehavior, | 		kubeCfg.MemorySwap.SwapBehavior, | ||||||
|  | 		kubeDeps.ContainerManager.GetNodeAllocatableAbsolute, | ||||||
|  | 		*kubeCfg.MemoryThrottlingFactor, | ||||||
| 	) | 	) | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
| 		return nil, err | 		return nil, err | ||||||
|   | |||||||
| @@ -21,6 +21,8 @@ import ( | |||||||
| 	"time" | 	"time" | ||||||
|  |  | ||||||
| 	cadvisorapi "github.com/google/cadvisor/info/v1" | 	cadvisorapi "github.com/google/cadvisor/info/v1" | ||||||
|  | 	v1 "k8s.io/api/core/v1" | ||||||
|  | 	"k8s.io/apimachinery/pkg/api/resource" | ||||||
| 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||||
| 	"k8s.io/apimachinery/pkg/types" | 	"k8s.io/apimachinery/pkg/types" | ||||||
| 	"k8s.io/client-go/tools/record" | 	"k8s.io/client-go/tools/record" | ||||||
| @@ -38,6 +40,9 @@ import ( | |||||||
|  |  | ||||||
| const ( | const ( | ||||||
| 	fakeSeccompProfileRoot = "/fakeSeccompProfileRoot" | 	fakeSeccompProfileRoot = "/fakeSeccompProfileRoot" | ||||||
|  |  | ||||||
|  | 	fakeNodeAllocatableMemory = "32Gi" | ||||||
|  | 	fakeNodeAllocatableCPU    = "16" | ||||||
| ) | ) | ||||||
|  |  | ||||||
| type fakeHTTP struct { | type fakeHTTP struct { | ||||||
| @@ -79,21 +84,22 @@ func newFakeKubeRuntimeManager(runtimeService internalapi.RuntimeService, imageS | |||||||
| 		return nil, err | 		return nil, err | ||||||
| 	} | 	} | ||||||
| 	kubeRuntimeManager := &kubeGenericRuntimeManager{ | 	kubeRuntimeManager := &kubeGenericRuntimeManager{ | ||||||
| 		recorder:           recorder, | 		recorder:               recorder, | ||||||
| 		cpuCFSQuota:        false, | 		cpuCFSQuota:            false, | ||||||
| 		cpuCFSQuotaPeriod:  metav1.Duration{Duration: time.Microsecond * 100}, | 		cpuCFSQuotaPeriod:      metav1.Duration{Duration: time.Microsecond * 100}, | ||||||
| 		livenessManager:    proberesults.NewManager(), | 		livenessManager:        proberesults.NewManager(), | ||||||
| 		startupManager:     proberesults.NewManager(), | 		startupManager:         proberesults.NewManager(), | ||||||
| 		machineInfo:        machineInfo, | 		machineInfo:            machineInfo, | ||||||
| 		osInterface:        osInterface, | 		osInterface:            osInterface, | ||||||
| 		runtimeHelper:      runtimeHelper, | 		runtimeHelper:          runtimeHelper, | ||||||
| 		runtimeService:     runtimeService, | 		runtimeService:         runtimeService, | ||||||
| 		imageService:       imageService, | 		imageService:           imageService, | ||||||
| 		keyring:            keyring, | 		keyring:                keyring, | ||||||
| 		seccompProfileRoot: fakeSeccompProfileRoot, | 		seccompProfileRoot:     fakeSeccompProfileRoot, | ||||||
| 		internalLifecycle:  cm.NewFakeInternalContainerLifecycle(), | 		internalLifecycle:      cm.NewFakeInternalContainerLifecycle(), | ||||||
| 		logReduction:       logreduction.NewLogReduction(identicalErrorDelay), | 		logReduction:           logreduction.NewLogReduction(identicalErrorDelay), | ||||||
| 		logManager:         logManager, | 		logManager:             logManager, | ||||||
|  | 		memoryThrottlingFactor: 0.8, | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	typedVersion, err := runtimeService.Version(kubeRuntimeAPIVersion) | 	typedVersion, err := runtimeService.Version(kubeRuntimeAPIVersion) | ||||||
| @@ -118,5 +124,12 @@ func newFakeKubeRuntimeManager(runtimeService internalapi.RuntimeService, imageS | |||||||
| 		kubeRuntimeManager, | 		kubeRuntimeManager, | ||||||
| 		kubeRuntimeManager) | 		kubeRuntimeManager) | ||||||
|  |  | ||||||
|  | 	kubeRuntimeManager.getNodeAllocatable = func() v1.ResourceList { | ||||||
|  | 		return v1.ResourceList{ | ||||||
|  | 			v1.ResourceMemory: resource.MustParse(fakeNodeAllocatableMemory), | ||||||
|  | 			v1.ResourceCPU:    resource.MustParse(fakeNodeAllocatableCPU), | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  |  | ||||||
| 	return kubeRuntimeManager, nil | 	return kubeRuntimeManager, nil | ||||||
| } | } | ||||||
|   | |||||||
| @@ -19,8 +19,10 @@ limitations under the License. | |||||||
| package kuberuntime | package kuberuntime | ||||||
|  |  | ||||||
| import ( | import ( | ||||||
|  | 	"strconv" | ||||||
| 	"time" | 	"time" | ||||||
|  |  | ||||||
|  | 	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" | ||||||
| 	cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs" | 	cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs" | ||||||
| 	v1 "k8s.io/api/core/v1" | 	v1 "k8s.io/api/core/v1" | ||||||
| 	utilfeature "k8s.io/apiserver/pkg/util/feature" | 	utilfeature "k8s.io/apiserver/pkg/util/feature" | ||||||
| @@ -28,6 +30,7 @@ import ( | |||||||
| 	"k8s.io/klog/v2" | 	"k8s.io/klog/v2" | ||||||
| 	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" | 	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" | ||||||
| 	kubefeatures "k8s.io/kubernetes/pkg/features" | 	kubefeatures "k8s.io/kubernetes/pkg/features" | ||||||
|  | 	"k8s.io/kubernetes/pkg/kubelet/cm" | ||||||
| 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" | 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" | ||||||
| 	"k8s.io/kubernetes/pkg/kubelet/qos" | 	"k8s.io/kubernetes/pkg/kubelet/qos" | ||||||
| 	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" | 	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" | ||||||
| @@ -35,12 +38,18 @@ import ( | |||||||
|  |  | ||||||
| // applyPlatformSpecificContainerConfig applies platform specific configurations to runtimeapi.ContainerConfig. | // applyPlatformSpecificContainerConfig applies platform specific configurations to runtimeapi.ContainerConfig. | ||||||
| func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config *runtimeapi.ContainerConfig, container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID) error { | func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config *runtimeapi.ContainerConfig, container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID) error { | ||||||
| 	config.Linux = m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget) | 	enforceMemoryQoS := false | ||||||
|  | 	// Set memory.min and memory.high if MemoryQoS is enabled with cgroups v2 | ||||||
|  | 	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) && | ||||||
|  | 		libcontainercgroups.IsCgroup2UnifiedMode() { | ||||||
|  | 		enforceMemoryQoS = true | ||||||
|  | 	} | ||||||
|  | 	config.Linux = m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget, enforceMemoryQoS) | ||||||
| 	return nil | 	return nil | ||||||
| } | } | ||||||
|  |  | ||||||
| // generateLinuxContainerConfig generates linux container config for kubelet runtime v1. | // generateLinuxContainerConfig generates linux container config for kubelet runtime v1. | ||||||
| func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID) *runtimeapi.LinuxContainerConfig { | func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID, enforceMemoryQoS bool) *runtimeapi.LinuxContainerConfig { | ||||||
| 	lc := &runtimeapi.LinuxContainerConfig{ | 	lc := &runtimeapi.LinuxContainerConfig{ | ||||||
| 		Resources:       &runtimeapi.LinuxContainerResources{}, | 		Resources:       &runtimeapi.LinuxContainerResources{}, | ||||||
| 		SecurityContext: m.determineEffectiveSecurityContext(pod, container, uid, username), | 		SecurityContext: m.determineEffectiveSecurityContext(pod, container, uid, username), | ||||||
| @@ -56,6 +65,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.C | |||||||
| 	cpuRequest := container.Resources.Requests.Cpu() | 	cpuRequest := container.Resources.Requests.Cpu() | ||||||
| 	cpuLimit := container.Resources.Limits.Cpu() | 	cpuLimit := container.Resources.Limits.Cpu() | ||||||
| 	memoryLimit := container.Resources.Limits.Memory().Value() | 	memoryLimit := container.Resources.Limits.Memory().Value() | ||||||
|  | 	memoryRequest := container.Resources.Requests.Memory().Value() | ||||||
| 	oomScoreAdj := int64(qos.GetContainerOOMScoreAdjust(pod, container, | 	oomScoreAdj := int64(qos.GetContainerOOMScoreAdjust(pod, container, | ||||||
| 		int64(m.machineInfo.MemoryCapacity))) | 		int64(m.machineInfo.MemoryCapacity))) | ||||||
| 	// If request is not specified, but limit is, we want request to default to limit. | 	// If request is not specified, but limit is, we want request to default to limit. | ||||||
| @@ -107,6 +117,43 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.C | |||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
|  | 	// Set memory.min and memory.high to enforce MemoryQoS | ||||||
|  | 	if enforceMemoryQoS { | ||||||
|  | 		unified := map[string]string{} | ||||||
|  |  | ||||||
|  | 		if memoryRequest != 0 { | ||||||
|  | 			unified[cm.MemoryMin] = strconv.FormatInt(memoryRequest, 10) | ||||||
|  | 		} | ||||||
|  |  | ||||||
|  | 		// If the container sets limits.memory, we set memory.high = limits.memory * memory_throttling_factor | ||||||
|  | 		// on the container-level cgroup, provided memory.high > memory.min. | ||||||
|  | 		// If the container doesn't set limits.memory, we set memory.high = node_allocatable_memory * memory_throttling_factor | ||||||
|  | 		// on the container-level cgroup. | ||||||
|  | 		memoryHigh := int64(0) | ||||||
|  | 		if memoryLimit != 0 { | ||||||
|  | 			memoryHigh = int64(float64(memoryLimit) * m.memoryThrottlingFactor) | ||||||
|  | 		} else { | ||||||
|  | 			allocatable := m.getNodeAllocatable() | ||||||
|  | 			allocatableMemory, ok := allocatable[v1.ResourceMemory] | ||||||
|  | 			if ok && allocatableMemory.Value() > 0 { | ||||||
|  | 				memoryHigh = int64(float64(allocatableMemory.Value()) * m.memoryThrottlingFactor) | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 		if memoryHigh > memoryRequest { | ||||||
|  | 			unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10) | ||||||
|  | 		} | ||||||
|  | 		if len(unified) > 0 { | ||||||
|  | 			if lc.Resources.Unified == nil { | ||||||
|  | 				lc.Resources.Unified = unified | ||||||
|  | 			} else { | ||||||
|  | 				for k, v := range unified { | ||||||
|  | 					lc.Resources.Unified[k] = v | ||||||
|  | 				} | ||||||
|  | 			} | ||||||
|  | 			klog.V(4).InfoS("MemoryQoS config for container", "pod", klog.KObj(pod), "containerName", container.Name, "unified", unified) | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  |  | ||||||
| 	return lc | 	return lc | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
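To make the memory.high arithmetic concrete: with the default throttling factor of 0.8, a container with requests.memory=128Mi and limits.memory=256Mi gets memory.min = 134217728 and memory.high = int64(268435456 * 0.8) = 214748364; 214748364 > 134217728, so both keys are written. A container that sets no memory limit on a node with 32Gi allocatable instead gets memory.high = int64(34359738368 * 0.8) = 27487790694. These are exactly the values the tests below assert.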
| @@ -20,6 +20,7 @@ package kuberuntime | |||||||
|  |  | ||||||
| import ( | import ( | ||||||
| 	"reflect" | 	"reflect" | ||||||
|  | 	"strconv" | ||||||
| 	"testing" | 	"testing" | ||||||
|  |  | ||||||
| 	"github.com/google/go-cmp/cmp" | 	"github.com/google/go-cmp/cmp" | ||||||
| @@ -36,7 +37,7 @@ import ( | |||||||
| 	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" | 	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" | ||||||
| ) | ) | ||||||
|  |  | ||||||
| func makeExpectedConfig(m *kubeGenericRuntimeManager, pod *v1.Pod, containerIndex int) *runtimeapi.ContainerConfig { | func makeExpectedConfig(m *kubeGenericRuntimeManager, pod *v1.Pod, containerIndex int, enforceMemoryQoS bool) *runtimeapi.ContainerConfig { | ||||||
| 	container := &pod.Spec.Containers[containerIndex] | 	container := &pod.Spec.Containers[containerIndex] | ||||||
| 	podIP := "" | 	podIP := "" | ||||||
| 	restartCount := 0 | 	restartCount := 0 | ||||||
| @@ -62,7 +63,7 @@ func makeExpectedConfig(m *kubeGenericRuntimeManager, pod *v1.Pod, containerInde | |||||||
| 		Stdin:       container.Stdin, | 		Stdin:       container.Stdin, | ||||||
| 		StdinOnce:   container.StdinOnce, | 		StdinOnce:   container.StdinOnce, | ||||||
| 		Tty:         container.TTY, | 		Tty:         container.TTY, | ||||||
| 		Linux:       m.generateLinuxContainerConfig(container, pod, new(int64), "", nil), | 		Linux:       m.generateLinuxContainerConfig(container, pod, new(int64), "", nil, enforceMemoryQoS), | ||||||
| 		Envs:        envs, | 		Envs:        envs, | ||||||
| 	} | 	} | ||||||
| 	return expectedConfig | 	return expectedConfig | ||||||
| @@ -97,7 +98,7 @@ func TestGenerateContainerConfig(t *testing.T) { | |||||||
| 		}, | 		}, | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	expectedConfig := makeExpectedConfig(m, pod, 0) | 	expectedConfig := makeExpectedConfig(m, pod, 0, false) | ||||||
| 	containerConfig, _, err := m.generateContainerConfig(&pod.Spec.Containers[0], pod, 0, "", pod.Spec.Containers[0].Image, []string{}, nil) | 	containerConfig, _, err := m.generateContainerConfig(&pod.Spec.Containers[0], pod, 0, "", pod.Spec.Containers[0].Image, []string{}, nil) | ||||||
| 	assert.NoError(t, err) | 	assert.NoError(t, err) | ||||||
| 	assert.Equal(t, expectedConfig, containerConfig, "generate container config for kubelet runtime v1.") | 	assert.Equal(t, expectedConfig, containerConfig, "generate container config for kubelet runtime v1.") | ||||||
| @@ -145,6 +146,101 @@ func TestGenerateContainerConfig(t *testing.T) { | |||||||
| 	assert.Error(t, err, "RunAsNonRoot should fail for non-numeric username") | 	assert.Error(t, err, "RunAsNonRoot should fail for non-numeric username") | ||||||
| } | } | ||||||
|  |  | ||||||
|  | func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) { | ||||||
|  | 	_, _, m, err := createTestRuntimeManager() | ||||||
|  | 	assert.NoError(t, err) | ||||||
|  |  | ||||||
|  | 	pod1 := &v1.Pod{ | ||||||
|  | 		ObjectMeta: metav1.ObjectMeta{ | ||||||
|  | 			UID:       "12345678", | ||||||
|  | 			Name:      "bar", | ||||||
|  | 			Namespace: "new", | ||||||
|  | 		}, | ||||||
|  | 		Spec: v1.PodSpec{ | ||||||
|  | 			Containers: []v1.Container{ | ||||||
|  | 				{ | ||||||
|  | 					Name:            "foo", | ||||||
|  | 					Image:           "busybox", | ||||||
|  | 					ImagePullPolicy: v1.PullIfNotPresent, | ||||||
|  | 					Command:         []string{"testCommand"}, | ||||||
|  | 					WorkingDir:      "testWorkingDir", | ||||||
|  | 					Resources: v1.ResourceRequirements{ | ||||||
|  | 						Requests: v1.ResourceList{ | ||||||
|  | 							v1.ResourceMemory: resource.MustParse("128Mi"), | ||||||
|  | 						}, | ||||||
|  | 						Limits: v1.ResourceList{ | ||||||
|  | 							v1.ResourceMemory: resource.MustParse("256Mi"), | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	pod2 := &v1.Pod{ | ||||||
|  | 		ObjectMeta: metav1.ObjectMeta{ | ||||||
|  | 			UID:       "12345678", | ||||||
|  | 			Name:      "bar", | ||||||
|  | 			Namespace: "new", | ||||||
|  | 		}, | ||||||
|  | 		Spec: v1.PodSpec{ | ||||||
|  | 			Containers: []v1.Container{ | ||||||
|  | 				{ | ||||||
|  | 					Name:            "foo", | ||||||
|  | 					Image:           "busybox", | ||||||
|  | 					ImagePullPolicy: v1.PullIfNotPresent, | ||||||
|  | 					Command:         []string{"testCommand"}, | ||||||
|  | 					WorkingDir:      "testWorkingDir", | ||||||
|  | 					Resources: v1.ResourceRequirements{ | ||||||
|  | 						Requests: v1.ResourceList{ | ||||||
|  | 							v1.ResourceMemory: resource.MustParse("128Mi"), | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  | 	memoryNodeAllocatable := resource.MustParse(fakeNodeAllocatableMemory) | ||||||
|  | 	pod2MemoryHigh := float64(memoryNodeAllocatable.Value()) * m.memoryThrottlingFactor | ||||||
|  |  | ||||||
|  | 	type expectedResult struct { | ||||||
|  | 		containerConfig *runtimeapi.LinuxContainerConfig | ||||||
|  | 		memoryLow       int64 | ||||||
|  | 		memoryHigh      int64 | ||||||
|  | 	} | ||||||
|  | 	tests := []struct { | ||||||
|  | 		name     string | ||||||
|  | 		pod      *v1.Pod | ||||||
|  | 		expected *expectedResult | ||||||
|  | 	}{ | ||||||
|  | 		{ | ||||||
|  | 			name: "Request128MBLimit256MB", | ||||||
|  | 			pod:  pod1, | ||||||
|  | 			expected: &expectedResult{ | ||||||
|  | 				m.generateLinuxContainerConfig(&pod1.Spec.Containers[0], pod1, new(int64), "", nil, true), | ||||||
|  | 				128 * 1024 * 1024, | ||||||
|  | 				int64(float64(256*1024*1024) * m.memoryThrottlingFactor), | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			name: "Request128MBWithoutLimit", | ||||||
|  | 			pod:  pod2, | ||||||
|  | 			expected: &expectedResult{ | ||||||
|  | 				m.generateLinuxContainerConfig(&pod2.Spec.Containers[0], pod2, new(int64), "", nil, true), | ||||||
|  | 				128 * 1024 * 1024, | ||||||
|  | 				int64(pod2MemoryHigh), | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	for _, test := range tests { | ||||||
|  | 		linuxConfig := m.generateLinuxContainerConfig(&test.pod.Spec.Containers[0], test.pod, new(int64), "", nil, true) | ||||||
|  | 		assert.Equal(t, test.expected.containerConfig, linuxConfig, test.name) | ||||||
|  | 		assert.Equal(t, linuxConfig.GetResources().GetUnified()["memory.min"], strconv.FormatInt(test.expected.memoryLow, 10), test.name) | ||||||
|  | 		assert.Equal(t, linuxConfig.GetResources().GetUnified()["memory.high"], strconv.FormatInt(test.expected.memoryHigh, 10), test.name) | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
| func TestGetHugepageLimitsFromResources(t *testing.T) { | func TestGetHugepageLimitsFromResources(t *testing.T) { | ||||||
| 	var baseHugepage []*runtimeapi.HugepageLimit | 	var baseHugepage []*runtimeapi.HugepageLimit | ||||||
|  |  | ||||||
| @@ -361,7 +457,7 @@ func TestGenerateLinuxContainerConfigNamespaces(t *testing.T) { | |||||||
| 		}, | 		}, | ||||||
| 	} { | 	} { | ||||||
| 		t.Run(tc.name, func(t *testing.T) { | 		t.Run(tc.name, func(t *testing.T) { | ||||||
| 			got := m.generateLinuxContainerConfig(&tc.pod.Spec.Containers[0], tc.pod, nil, "", tc.target) | 			got := m.generateLinuxContainerConfig(&tc.pod.Spec.Containers[0], tc.pod, nil, "", tc.target, false) | ||||||
| 			if diff := cmp.Diff(tc.want, got.SecurityContext.NamespaceOptions); diff != "" { | 			if diff := cmp.Diff(tc.want, got.SecurityContext.NamespaceOptions); diff != "" { | ||||||
| 				t.Errorf("%v: diff (-want +got):\n%v", t.Name(), diff) | 				t.Errorf("%v: diff (-want +got):\n%v", t.Name(), diff) | ||||||
| 			} | 			} | ||||||
| @@ -452,7 +548,7 @@ func TestGenerateLinuxContainerConfigSwap(t *testing.T) { | |||||||
| 	} { | 	} { | ||||||
| 		t.Run(tc.name, func(t *testing.T) { | 		t.Run(tc.name, func(t *testing.T) { | ||||||
| 			m.memorySwapBehavior = tc.swapSetting | 			m.memorySwapBehavior = tc.swapSetting | ||||||
| 			actual := m.generateLinuxContainerConfig(&tc.pod.Spec.Containers[0], tc.pod, nil, "", nil) | 			actual := m.generateLinuxContainerConfig(&tc.pod.Spec.Containers[0], tc.pod, nil, "", nil, false) | ||||||
| 			assert.Equal(t, tc.expected, actual.Resources.MemorySwapLimitInBytes, "memory swap config for %s", tc.name) | 			assert.Equal(t, tc.expected, actual.Resources.MemorySwapLimitInBytes, "memory swap config for %s", tc.name) | ||||||
| 		}) | 		}) | ||||||
| 	} | 	} | ||||||
|   | |||||||
| @@ -147,6 +147,12 @@ type kubeGenericRuntimeManager struct { | |||||||
|  |  | ||||||
| 	// MemorySwapBehavior defines how swap is used | 	// MemorySwapBehavior defines how swap is used | ||||||
| 	memorySwapBehavior string | 	memorySwapBehavior string | ||||||
|  |  | ||||||
|  | 	// Function to get node allocatable resources | ||||||
|  | 	getNodeAllocatable func() v1.ResourceList | ||||||
|  |  | ||||||
|  | 	// Memory throttling factor for MemoryQoS | ||||||
|  | 	memoryThrottlingFactor float64 | ||||||
| } | } | ||||||
|  |  | ||||||
| // KubeGenericRuntime is an interface that contains interfaces for container runtime and command. | // KubeGenericRuntime is an interface that contains interfaces for container runtime and command. | ||||||
| @@ -190,27 +196,31 @@ func NewKubeGenericRuntimeManager( | |||||||
| 	runtimeClassManager *runtimeclass.Manager, | 	runtimeClassManager *runtimeclass.Manager, | ||||||
| 	seccompDefault bool, | 	seccompDefault bool, | ||||||
| 	memorySwapBehavior string, | 	memorySwapBehavior string, | ||||||
|  | 	getNodeAllocatable func() v1.ResourceList, | ||||||
|  | 	memoryThrottlingFactor float64, | ||||||
| ) (KubeGenericRuntime, error) { | ) (KubeGenericRuntime, error) { | ||||||
| 	kubeRuntimeManager := &kubeGenericRuntimeManager{ | 	kubeRuntimeManager := &kubeGenericRuntimeManager{ | ||||||
| 		recorder:            recorder, | 		recorder:               recorder, | ||||||
| 		cpuCFSQuota:         cpuCFSQuota, | 		cpuCFSQuota:            cpuCFSQuota, | ||||||
| 		cpuCFSQuotaPeriod:   cpuCFSQuotaPeriod, | 		cpuCFSQuotaPeriod:      cpuCFSQuotaPeriod, | ||||||
| 		seccompProfileRoot:  seccompProfileRoot, | 		seccompProfileRoot:     seccompProfileRoot, | ||||||
| 		livenessManager:     livenessManager, | 		livenessManager:        livenessManager, | ||||||
| 		readinessManager:    readinessManager, | 		readinessManager:       readinessManager, | ||||||
| 		startupManager:      startupManager, | 		startupManager:         startupManager, | ||||||
| 		machineInfo:         machineInfo, | 		machineInfo:            machineInfo, | ||||||
| 		osInterface:         osInterface, | 		osInterface:            osInterface, | ||||||
| 		runtimeHelper:       runtimeHelper, | 		runtimeHelper:          runtimeHelper, | ||||||
| 		runtimeService:      newInstrumentedRuntimeService(runtimeService), | 		runtimeService:         newInstrumentedRuntimeService(runtimeService), | ||||||
| 		imageService:        newInstrumentedImageManagerService(imageService), | 		imageService:           newInstrumentedImageManagerService(imageService), | ||||||
| 		internalLifecycle:   internalLifecycle, | 		internalLifecycle:      internalLifecycle, | ||||||
| 		legacyLogProvider:   legacyLogProvider, | 		legacyLogProvider:      legacyLogProvider, | ||||||
| 		logManager:          logManager, | 		logManager:             logManager, | ||||||
| 		runtimeClassManager: runtimeClassManager, | 		runtimeClassManager:    runtimeClassManager, | ||||||
| 		logReduction:        logreduction.NewLogReduction(identicalErrorDelay), | 		logReduction:           logreduction.NewLogReduction(identicalErrorDelay), | ||||||
| 		seccompDefault:      seccompDefault, | 		seccompDefault:         seccompDefault, | ||||||
| 		memorySwapBehavior:  memorySwapBehavior, | 		memorySwapBehavior:     memorySwapBehavior, | ||||||
|  | 		getNodeAllocatable:     getNodeAllocatable, | ||||||
|  | 		memoryThrottlingFactor: memoryThrottlingFactor, | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	typedVersion, err := kubeRuntimeManager.getTypedVersion() | 	typedVersion, err := kubeRuntimeManager.getTypedVersion() | ||||||
|   | |||||||
| @@ -28,6 +28,7 @@ import ( | |||||||
| 	"github.com/stretchr/testify/require" | 	"github.com/stretchr/testify/require" | ||||||
|  |  | ||||||
| 	v1 "k8s.io/api/core/v1" | 	v1 "k8s.io/api/core/v1" | ||||||
|  | 	"k8s.io/apimachinery/pkg/api/resource" | ||||||
| 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||||
| 	"k8s.io/apimachinery/pkg/types" | 	"k8s.io/apimachinery/pkg/types" | ||||||
| 	"k8s.io/apimachinery/pkg/util/sets" | 	"k8s.io/apimachinery/pkg/util/sets" | ||||||
| @@ -58,7 +59,10 @@ func customTestRuntimeManager(keyring *credentialprovider.BasicDockerKeyring) (* | |||||||
| 	// Only an empty machineInfo is needed here, because in unit test all containers are besteffort, | 	// Only an empty machineInfo is needed here, because in unit test all containers are besteffort, | ||||||
| 	// data in machineInfo is not used. If burstable containers are used in unit test in the future, | 	// data in machineInfo is not used. If burstable containers are used in unit test in the future, | ||||||
| 	// we may want to set memory capacity. | 	// we may want to set memory capacity. | ||||||
| 	machineInfo := &cadvisorapi.MachineInfo{} | 	memoryCapacityQuantity := resource.MustParse(fakeNodeAllocatableMemory) | ||||||
|  | 	machineInfo := &cadvisorapi.MachineInfo{ | ||||||
|  | 		MemoryCapacity: uint64(memoryCapacityQuantity.Value()), | ||||||
|  | 	} | ||||||
| 	osInterface := &containertest.FakeOS{} | 	osInterface := &containertest.FakeOS{} | ||||||
| 	manager, err := newFakeKubeRuntimeManager(fakeRuntimeService, fakeImageService, machineInfo, osInterface, &containertest.FakeRuntimeHelper{}, keyring) | 	manager, err := newFakeKubeRuntimeManager(fakeRuntimeService, fakeImageService, machineInfo, osInterface, &containertest.FakeRuntimeHelper{}, keyring) | ||||||
| 	return fakeRuntimeService, fakeImageService, manager, err | 	return fakeRuntimeService, fakeImageService, manager, err | ||||||
|   | |||||||
| @@ -121,7 +121,7 @@ func calculateEmptyDirMemorySize(nodeAllocatableMemory *resource.Quantity, spec | |||||||
| 	// determine pod resource allocation | 	// determine pod resource allocation | ||||||
| 	// we use the same function for pod cgroup assignment to maintain consistent behavior | 	// we use the same function for pod cgroup assignment to maintain consistent behavior | ||||||
| 	// NOTE: this could be nil on systems that do not support pod memory containment (i.e. windows) | 	// NOTE: this could be nil on systems that do not support pod memory containment (i.e. windows) | ||||||
| 	podResourceConfig := cm.ResourceConfigForPod(pod, false, uint64(100000)) | 	podResourceConfig := cm.ResourceConfigForPod(pod, false, uint64(100000), false) | ||||||
| 	if podResourceConfig != nil && podResourceConfig.Memory != nil { | 	if podResourceConfig != nil && podResourceConfig.Memory != nil { | ||||||
| 		podMemoryLimit := resource.NewQuantity(*(podResourceConfig.Memory), resource.BinarySI) | 		podMemoryLimit := resource.NewQuantity(*(podResourceConfig.Memory), resource.BinarySI) | ||||||
| 		// ensure 0 < value < size | 		// ensure 0 < value < size | ||||||
|   | |||||||
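The trailing false here corresponds to the enforceMemoryQoS parameter this commit adds to cm.ResourceConfigForPod (an inference from the matching signature changes elsewhere in the commit): emptyDir sizing only reads the computed Memory limit from the returned config, so there is no need to populate the cgroup v2 Unified map for this call.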
| @@ -947,6 +947,15 @@ type KubeletConfiguration struct { | |||||||
| 	// Default: false | 	// Default: false | ||||||
| 	// +optional | 	// +optional | ||||||
| 	SeccompDefault *bool `json:"seccompDefault,omitempty"` | 	SeccompDefault *bool `json:"seccompDefault,omitempty"` | ||||||
|  | 	// MemoryThrottlingFactor specifies the factor multiplied by the memory limit or node allocatable memory | ||||||
|  | 	// when setting the cgroupv2 memory.high value to enforce MemoryQoS. | ||||||
|  | 	// Decreasing this factor sets a lower memory.high limit for container cgroups and applies heavier reclaim pressure, | ||||||
|  | 	// while increasing it applies less reclaim pressure. | ||||||
|  | 	// See http://kep.k8s.io/2570 for more details. | ||||||
|  | 	// Default: 0.8 | ||||||
|  | 	// +featureGate=MemoryQoS | ||||||
|  | 	// +optional | ||||||
|  | 	MemoryThrottlingFactor *float64 `json:"memoryThrottlingFactor,omitempty"` | ||||||
| } | } | ||||||
|  |  | ||||||
| type KubeletAuthorizationMode string | type KubeletAuthorizationMode string | ||||||
|   | |||||||
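MemoryThrottlingFactor is a *float64 rather than a plain float64 so an unset value can be distinguished from an explicit zero and defaulted to 0.8 during config defaulting; that pointer shape is also why the generated DeepCopyInto below allocates a fresh float64. As a worked example under the default, a container limited to 1Gi gets memory.high = int64(1073741824 * 0.8) = 858993459. Values are presumably expected in (0, 1], since a factor above 1 would place memory.high beyond the memory limit, where memory.max takes effect first.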
| @@ -327,6 +327,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) { | |||||||
| 		*out = new(bool) | 		*out = new(bool) | ||||||
| 		**out = **in | 		**out = **in | ||||||
| 	} | 	} | ||||||
|  | 	if in.MemoryThrottlingFactor != nil { | ||||||
|  | 		in, out := &in.MemoryThrottlingFactor, &out.MemoryThrottlingFactor | ||||||
|  | 		*out = new(float64) | ||||||
|  | 		**out = **in | ||||||
|  | 	} | ||||||
| 	return | 	return | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||