mirror of https://github.com/optim-enterprises-bv/kubernetes.git, synced 2025-10-31 18:28:13 +00:00
/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cm

import (
	"bufio"
	"fmt"
	"os"
	"path/filepath"
	"strconv"

	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"

	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/kubernetes/pkg/api/v1/resource"
	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
	kubefeatures "k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/kubelet/cm/util"
)

const (
	// These limits are defined in the kernel:
	// https://github.com/torvalds/linux/blob/0bddd227f3dc55975e2b8dfa7fc6f959b062a2c7/kernel/sched/sched.h#L427-L428
	MinShares = 2
	MaxShares = 262144

	SharesPerCPU  = 1024
	MilliCPUToCPU = 1000

	// 100000 microseconds is equivalent to 100ms
	QuotaPeriod    = 100000
	MinQuotaPeriod = 1000
)

// MilliCPUToQuota converts milliCPU to CFS quota and period values.
func MilliCPUToQuota(milliCPU int64, period int64) (quota int64) {
	// CFS quota is expressed as two values:
	//  - cfs_period_us=100ms (the window over which usage is measured, given by period)
	//  - cfs_quota=20ms (the amount of cpu time allowed to be used across a period)
	// In the example above, the process is limited to 20% of a single CPU;
	// in multi-cpu environments the same amounts simply scale up.
	// See https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt for details.

	if milliCPU == 0 {
		return
	}

	if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUCFSQuotaPeriod) {
		period = QuotaPeriod
	}

	// convert the milliCPU to a quota normalized over the period
	quota = (milliCPU * period) / MilliCPUToCPU

	// quota needs to be a minimum of 1ms.
	if quota < MinQuotaPeriod {
		quota = MinQuotaPeriod
	}
	return
}
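
// Worked example (for illustration): a 500m CPU limit with the default
// 100ms (100000us) period yields quota = (500 * 100000) / 1000 = 50000us,
// i.e. 50ms of CPU time per 100ms window, or half of one CPU. A very small
// limit such as 5m computes to 500us and is clamped up to MinQuotaPeriod
// (1000us).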

// MilliCPUToShares converts the milliCPU to CFS shares.
func MilliCPUToShares(milliCPU int64) uint64 {
	if milliCPU == 0 {
		// Docker converts zero milliCPU to unset, which maps to the kernel
		// default for unset: 1024. Return 2 here to truly match the kernel
		// default for zero milliCPU.
		return MinShares
	}
	// Conceptually (milliCPU / milliCPUToCPU) * sharesPerCPU, but factored to improve rounding.
	shares := (milliCPU * SharesPerCPU) / MilliCPUToCPU
	if shares < MinShares {
		return MinShares
	}
	if shares > MaxShares {
		return MaxShares
	}
	return uint64(shares)
}
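
// Worked example (for illustration): a 250m CPU request maps to
// (250 * 1024) / 1000 = 256 shares, and a full CPU (1000m) to 1024 shares.
// A zero request returns MinShares (2) rather than the kernel's unset
// default of 1024.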

// HugePageLimits converts the API representation to a map
// from huge page size (in bytes) to huge page limit (in bytes).
func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 {
	hugePageLimits := map[int64]int64{}
	for k, v := range resourceList {
		if v1helper.IsHugePageResourceName(k) {
			pageSize, _ := v1helper.HugePageSizeFromResourceName(k)
			if value, exists := hugePageLimits[pageSize.Value()]; exists {
				hugePageLimits[pageSize.Value()] = value + v.Value()
			} else {
				hugePageLimits[pageSize.Value()] = v.Value()
			}
		}
	}
	return hugePageLimits
}
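
// Worked example (for illustration): a resource list containing
// hugepages-2Mi: 1Gi produces map[2097152]1073741824, i.e. the 2Mi page
// size in bytes mapped to a limit of 1Gi in bytes.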

// ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, enforceMemoryQoS bool) *ResourceConfig {
	// sum requests and limits.
	reqs, limits := resource.PodRequestsAndLimits(pod)

	cpuRequests := int64(0)
	cpuLimits := int64(0)
	memoryLimits := int64(0)
	if request, found := reqs[v1.ResourceCPU]; found {
		cpuRequests = request.MilliValue()
	}
	if limit, found := limits[v1.ResourceCPU]; found {
		cpuLimits = limit.MilliValue()
	}
	if limit, found := limits[v1.ResourceMemory]; found {
		memoryLimits = limit.Value()
	}

	// convert to CFS values
	cpuShares := MilliCPUToShares(cpuRequests)
	cpuQuota := MilliCPUToQuota(cpuLimits, int64(cpuPeriod))

	// track if limits were applied for each resource.
	memoryLimitsDeclared := true
	cpuLimitsDeclared := true
	// map hugepage pagesize (bytes) to limits (bytes)
	hugePageLimits := map[int64]int64{}
	for _, container := range pod.Spec.Containers {
		if container.Resources.Limits.Cpu().IsZero() {
			cpuLimitsDeclared = false
		}
		if container.Resources.Limits.Memory().IsZero() {
			memoryLimitsDeclared = false
		}
		containerHugePageLimits := HugePageLimits(container.Resources.Requests)
		for k, v := range containerHugePageLimits {
			if value, exists := hugePageLimits[k]; exists {
				hugePageLimits[k] = value + v
			} else {
				hugePageLimits[k] = v
			}
		}
	}

	// quota is not capped when cfs quota is disabled
	if !enforceCPULimits {
		cpuQuota = int64(-1)
	}

	// determine the qos class
	qosClass := v1qos.GetPodQOS(pod)

	// build the result
	result := &ResourceConfig{}
	if qosClass == v1.PodQOSGuaranteed {
		result.CpuShares = &cpuShares
		result.CpuQuota = &cpuQuota
		result.CpuPeriod = &cpuPeriod
		result.Memory = &memoryLimits
	} else if qosClass == v1.PodQOSBurstable {
		result.CpuShares = &cpuShares
		if cpuLimitsDeclared {
			result.CpuQuota = &cpuQuota
			result.CpuPeriod = &cpuPeriod
		}
		if memoryLimitsDeclared {
			result.Memory = &memoryLimits
		}
	} else {
		shares := uint64(MinShares)
		result.CpuShares = &shares
	}
	result.HugePageLimit = hugePageLimits

	if enforceMemoryQoS {
		memoryMin := int64(0)
		if request, found := reqs[v1.ResourceMemory]; found {
			memoryMin = request.Value()
		}
		if memoryMin > 0 {
			result.Unified = map[string]string{
				MemoryMin: strconv.FormatInt(memoryMin, 10),
			}
		}
	}

	return result
}
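
// Worked example (for illustration): a Burstable pod whose single container
// requests 250m CPU and sets a 1Gi memory limit but no CPU limit gets
// CpuShares=256 and Memory=1073741824, with CpuQuota/CpuPeriod left unset
// because cpuLimitsDeclared is false.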

// getCgroupSubsystemsV1 returns information about the mounted cgroup v1 subsystems
func getCgroupSubsystemsV1() (*CgroupSubsystems, error) {
	// get all cgroup mounts.
	allCgroups, err := libcontainercgroups.GetCgroupMounts(true)
	if err != nil {
		return &CgroupSubsystems{}, err
	}
	if len(allCgroups) == 0 {
		return &CgroupSubsystems{}, fmt.Errorf("failed to find cgroup mounts")
	}
	mountPoints := make(map[string]string, len(allCgroups))
	for _, mount := range allCgroups {
		// BEFORE: kubelet used an arbitrary mount point per cgroup subsystem;
		// NOW:    more deterministic: kubelet uses the mount point with the shortest path;
		// FUTURE: the expected behavior will be clearly documented.
		// ref. issue: https://github.com/kubernetes/kubernetes/issues/95488

		for _, subsystem := range mount.Subsystems {
			previous := mountPoints[subsystem]
			if previous == "" || len(mount.Mountpoint) < len(previous) {
				mountPoints[subsystem] = mount.Mountpoint
			}
		}
	}
	return &CgroupSubsystems{
		Mounts:      allCgroups,
		MountPoints: mountPoints,
	}, nil
}

// getCgroupSubsystemsV2 returns information about the enabled cgroup v2 subsystems
func getCgroupSubsystemsV2() (*CgroupSubsystems, error) {
	controllers, err := libcontainercgroups.GetAllSubsystems()
	if err != nil {
		return nil, err
	}

	mounts := []libcontainercgroups.Mount{}
	mountPoints := make(map[string]string, len(controllers))
	for _, controller := range controllers {
		mountPoints[controller] = util.CgroupRoot
		m := libcontainercgroups.Mount{
			Mountpoint: util.CgroupRoot,
			Root:       util.CgroupRoot,
			Subsystems: []string{controller},
		}
		mounts = append(mounts, m)
	}

	return &CgroupSubsystems{
		Mounts:      mounts,
		MountPoints: mountPoints,
	}, nil
}

// GetCgroupSubsystems returns information about the mounted cgroup subsystems
func GetCgroupSubsystems() (*CgroupSubsystems, error) {
	if libcontainercgroups.IsCgroup2UnifiedMode() {
		return getCgroupSubsystemsV2()
	}

	return getCgroupSubsystemsV1()
}
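
// Example usage (hypothetical caller): on a cgroup v1 host,
//   subsystems, err := GetCgroupSubsystems()
// followed by subsystems.MountPoints["cpu"] would typically yield a path
// such as /sys/fs/cgroup/cpu; on a unified (v2) host every controller maps
// to the single cgroup root instead.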

// getCgroupProcs takes a cgroup directory name as an argument,
// reads through the cgroup's procs file, and returns a list of tgids.
// It returns an empty list if the procs file does not exist.
func getCgroupProcs(dir string) ([]int, error) {
	procsFile := filepath.Join(dir, "cgroup.procs")
	f, err := os.Open(procsFile)
	if err != nil {
		if os.IsNotExist(err) {
			// The procsFile does not exist, so no pids are attached to this directory.
			return []int{}, nil
		}
		return nil, err
	}
	defer f.Close()

	s := bufio.NewScanner(f)
	out := []int{}
	for s.Scan() {
		if t := s.Text(); t != "" {
			pid, err := strconv.Atoi(t)
			if err != nil {
				return nil, fmt.Errorf("unexpected line in %v; could not convert to pid: %v", procsFile, err)
			}
			out = append(out, pid)
		}
	}
	return out, nil
}
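
// Example (for illustration): given a cgroup directory such as
// /sys/fs/cgroup/memory/kubepods whose cgroup.procs file contains the lines
// "1234" and "5678", getCgroupProcs returns []int{1234, 5678}; a missing
// cgroup.procs file yields an empty slice and no error.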

// GetPodCgroupNameSuffix returns the last element of the pod CgroupName identifier
func GetPodCgroupNameSuffix(podUID types.UID) string {
	return podCgroupNamePrefix + string(podUID)
}
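
// Example (for illustration, assuming podCgroupNamePrefix is "pod"): a pod
// UID of "0f5c1a2b" produces the suffix "pod0f5c1a2b".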

// NodeAllocatableRoot returns the literal cgroup path for the node allocatable cgroup
func NodeAllocatableRoot(cgroupRoot string, cgroupsPerQOS bool, cgroupDriver string) string {
	nodeAllocatableRoot := ParseCgroupfsToCgroupName(cgroupRoot)
	if cgroupsPerQOS {
		nodeAllocatableRoot = NewCgroupName(nodeAllocatableRoot, defaultNodeAllocatableCgroupName)
	}
	if libcontainerCgroupManagerType(cgroupDriver) == libcontainerSystemd {
		return nodeAllocatableRoot.ToSystemd()
	}
	return nodeAllocatableRoot.ToCgroupfs()
}
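
// Example (for illustration, assuming defaultNodeAllocatableCgroupName is
// "kubepods"): a cgroupRoot of "/" with cgroupsPerQOS enabled renders as
// "/kubepods.slice" under the systemd driver and as "/kubepods" under
// cgroupfs.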

// GetKubeletContainer returns the cgroup the kubelet will use
func GetKubeletContainer(kubeletCgroups string) (string, error) {
	if kubeletCgroups == "" {
		cont, err := getContainer(os.Getpid())
		if err != nil {
			return "", err
		}
		return cont, nil
	}
	return kubeletCgroups, nil
}

// GetRuntimeContainer returns the cgroup used by the container runtime
func GetRuntimeContainer(containerRuntime, runtimeCgroups string) (string, error) {
	if containerRuntime == "docker" {
		cont, err := getContainerNameForProcess(dockerProcessName, dockerPidFile)
		if err != nil {
			return "", fmt.Errorf("failed to get container name for docker process: %v", err)
		}
		return cont, nil
	}
	return runtimeCgroups, nil
}