Merge pull request #30311 from derekwaynecarr/inode_eviction
Automatic merge from submit-queue

kubelet eviction on inode exhaustion

Add support for the kubelet to monitor for inode exhaustion of either the image filesystem or the root filesystem and, in response, attempt to reclaim node-level resources and/or evict pods.
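The inode thresholds this PR adds (`nodefs.inodesFree`, `imagefs.inodesFree`) follow the same hard/soft semantics as the existing eviction signals: a hard threshold triggers as soon as it is breached, while a soft threshold must stay breached for its grace period. Below is a minimal standalone sketch of that comparison; the `threshold` and `shouldEvict` names are simplified stand-ins, not the actual manager types, and the values mirror the 1Mi hard / 2Mi-with-2m-grace configuration used in the test further down.

```go
package main

import (
	"fmt"
	"time"
)

// threshold is a simplified stand-in for the eviction manager's Threshold:
// the signal is breached when the observed value drops below Min, and a
// non-zero GracePeriod makes it a "soft" threshold.
type threshold struct {
	Min         uint64
	GracePeriod time.Duration
}

// shouldEvict reports whether a threshold warrants eviction, given the free
// inodes observed now and how long the breach has already been observed.
func shouldEvict(t threshold, inodesFree uint64, breachedFor time.Duration) bool {
	if inodesFree >= t.Min {
		return false // signal not breached
	}
	// Hard thresholds (no grace period) act immediately; soft thresholds
	// must stay breached for at least their grace period.
	return breachedFor >= t.GracePeriod
}

func main() {
	hard := threshold{Min: 1 << 20}                               // evict when < ~1Mi inodes free
	soft := threshold{Min: 2 << 20, GracePeriod: 2 * time.Minute} // evict when < ~2Mi for 2m

	fmt.Println(shouldEvict(hard, 1_500_000, 0))             // false: above the hard minimum
	fmt.Println(shouldEvict(soft, 1_500_000, 1*time.Minute)) // false: still within grace period
	fmt.Println(shouldEvict(soft, 1_500_000, 3*time.Minute)) // true: grace period elapsed
}
```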
		| @@ -478,9 +478,19 @@ for eviction. Instead `DaemonSet` should ideally include Guaranteed pods only. | ||||
|  | ||||
| ## Known issues | ||||
|  | ||||
| ### kubelet may evict more pods than needed | ||||
|  | ||||
| Pod eviction may evict more pods than needed due to a timing gap in stats collection. This can be mitigated in the future by adding | ||||
| the ability to get root container stats on an on-demand basis (https://github.com/google/cadvisor/issues/1247). | ||||
|  | ||||
| ### How kubelet ranks pods for eviction in response to inode exhaustion | ||||
|  | ||||
| At this time, it is not possible to know how many inodes were consumed by a particular container.  If the `kubelet` observes | ||||
| inode exhaustion, it will evict pods by ranking them by quality of service.  The following issue has been opened in cadvisor | ||||
| to track per container inode consumption (https://github.com/google/cadvisor/issues/1422) which would allow us to rank pods | ||||
| by inode consumption.  For example, this would let us identify a container that created a large number of 0-byte files and evict | ||||
| that pod ahead of others. | ||||
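Until per-container inode usage is available, ranking for inode pressure effectively reduces to ordering pods by quality of service: best-effort pods are evicted before burstable, and burstable before guaranteed. A minimal sketch of that ordering follows; the `pod` and `qosClass` types are simplified stand-ins for the real API and qos package, not the kubelet's implementation.

```go
package main

import (
	"fmt"
	"sort"
)

// qosClass is a simplified stand-in for the Kubernetes QoS classes.
type qosClass int

const (
	bestEffort qosClass = iota // no requests or limits set
	burstable                  // requests set, but below limits
	guaranteed                 // requests equal limits for every resource
)

type pod struct {
	name string
	qos  qosClass
}

// rankForInodePressure orders pods so the preferred eviction victim comes
// first: best-effort, then burstable, then guaranteed. Once per-container
// inode usage exists, it would serve as a tie-breaker within each class.
func rankForInodePressure(pods []pod) {
	sort.SliceStable(pods, func(i, j int) bool {
		return pods[i].qos < pods[j].qos
	})
}

func main() {
	pods := []pod{
		{"guaranteed-high", guaranteed},
		{"burstable-low", burstable},
		{"best-effort-high", bestEffort},
	}
	rankForInodePressure(pods)
	fmt.Println(pods[0].name) // best-effort-high is the first eviction candidate
}
```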
|  | ||||
| <!-- BEGIN MUNGE: GENERATED_ANALYTICS --> | ||||
| []() | ||||
| <!-- END MUNGE: GENERATED_ANALYTICS --> | ||||
|   | ||||
| @@ -96,6 +96,7 @@ pkg/credentialprovider/aws | ||||
| pkg/hyperkube | ||||
| pkg/kubelet/api | ||||
| pkg/kubelet/container | ||||
| pkg/kubelet/eviction | ||||
| pkg/kubelet/envvars | ||||
| pkg/kubelet/util/format | ||||
| pkg/kubelet/util/ioutils | ||||
|   | ||||
| @@ -914,3 +914,228 @@ func TestNodeReclaimFuncs(t *testing.T) { | ||||
| 		t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod) | ||||
| 	} | ||||
| } | ||||
|  | ||||
| func TestDiskPressureNodeFsInodes(t *testing.T) { | ||||
| 	// TODO: we need to know inodes used when cadvisor supports per container stats | ||||
| 	podMaker := func(name string, requests api.ResourceList, limits api.ResourceList) (*api.Pod, statsapi.PodStats) { | ||||
| 		pod := newPod(name, []api.Container{ | ||||
| 			newContainer(name, requests, limits), | ||||
| 		}, nil) | ||||
| 		podStats := newPodInodeStats(pod) | ||||
| 		return pod, podStats | ||||
| 	} | ||||
| 	summaryStatsMaker := func(rootFsInodesFree, rootFsInodes string, podStats map[*api.Pod]statsapi.PodStats) *statsapi.Summary { | ||||
| 		rootFsInodesFreeVal := resource.MustParse(rootFsInodesFree) | ||||
| 		internalRootFsInodesFree := uint64(rootFsInodesFreeVal.Value()) | ||||
| 		rootFsInodesVal := resource.MustParse(rootFsInodes) | ||||
| 		internalRootFsInodes := uint64(rootFsInodesVal.Value()) | ||||
| 		result := &statsapi.Summary{ | ||||
| 			Node: statsapi.NodeStats{ | ||||
| 				Fs: &statsapi.FsStats{ | ||||
| 					InodesFree: &internalRootFsInodesFree, | ||||
| 					Inodes:     &internalRootFsInodes, | ||||
| 				}, | ||||
| 			}, | ||||
| 			Pods: []statsapi.PodStats{}, | ||||
| 		} | ||||
| 		for _, podStat := range podStats { | ||||
| 			result.Pods = append(result.Pods, podStat) | ||||
| 		} | ||||
| 		return result | ||||
| 	} | ||||
| 	// TODO: pass inodes used in future when supported by cadvisor. | ||||
| 	podsToMake := []struct { | ||||
| 		name     string | ||||
| 		requests api.ResourceList | ||||
| 		limits   api.ResourceList | ||||
| 	}{ | ||||
| 		{name: "best-effort-high", requests: newResourceList("", ""), limits: newResourceList("", "")}, | ||||
| 		{name: "best-effort-low", requests: newResourceList("", ""), limits: newResourceList("", "")}, | ||||
| 		{name: "burstable-high", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi")}, | ||||
| 		{name: "burstable-low", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi")}, | ||||
| 		{name: "guaranteed-high", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi")}, | ||||
| 		{name: "guaranteed-low", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi")}, | ||||
| 	} | ||||
| 	pods := []*api.Pod{} | ||||
| 	podStats := map[*api.Pod]statsapi.PodStats{} | ||||
| 	for _, podToMake := range podsToMake { | ||||
| 		pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits) | ||||
| 		pods = append(pods, pod) | ||||
| 		podStats[pod] = podStat | ||||
| 	} | ||||
| 	activePodsFunc := func() []*api.Pod { | ||||
| 		return pods | ||||
| 	} | ||||
|  | ||||
| 	fakeClock := clock.NewFakeClock(time.Now()) | ||||
| 	podKiller := &mockPodKiller{} | ||||
| 	diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} | ||||
| 	imageGC := &mockImageGC{freed: int64(0), err: nil} | ||||
| 	nodeRef := &api.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} | ||||
|  | ||||
| 	config := Config{ | ||||
| 		MaxPodGracePeriodSeconds: 5, | ||||
| 		PressureTransitionPeriod: time.Minute * 5, | ||||
| 		Thresholds: []Threshold{ | ||||
| 			{ | ||||
| 				Signal:   SignalNodeFsInodesFree, | ||||
| 				Operator: OpLessThan, | ||||
| 				Value: ThresholdValue{ | ||||
| 					Quantity: quantityMustParse("1Mi"), | ||||
| 				}, | ||||
| 			}, | ||||
| 			{ | ||||
| 				Signal:   SignalNodeFsInodesFree, | ||||
| 				Operator: OpLessThan, | ||||
| 				Value: ThresholdValue{ | ||||
| 					Quantity: quantityMustParse("2Mi"), | ||||
| 				}, | ||||
| 				GracePeriod: time.Minute * 2, | ||||
| 			}, | ||||
| 		}, | ||||
| 	} | ||||
| 	summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("3Mi", "4Mi", podStats)} | ||||
| 	manager := &managerImpl{ | ||||
| 		clock:           fakeClock, | ||||
| 		killPodFunc:     podKiller.killPodNow, | ||||
| 		imageGC:         imageGC, | ||||
| 		config:          config, | ||||
| 		recorder:        &record.FakeRecorder{}, | ||||
| 		summaryProvider: summaryProvider, | ||||
| 		nodeRef:         nodeRef, | ||||
| 		nodeConditionsLastObservedAt: nodeConditionsObservedAt{}, | ||||
| 		thresholdsFirstObservedAt:    thresholdsObservedAt{}, | ||||
| 	} | ||||
|  | ||||
| 	// create a best effort pod to test admission | ||||
| 	podToAdmit, _ := podMaker("pod-to-admit", newResourceList("", ""), newResourceList("", "")) | ||||
|  | ||||
| 	// synchronize | ||||
| 	manager.synchronize(diskInfoProvider, activePodsFunc) | ||||
|  | ||||
| 	// we should not have disk pressure | ||||
| 	if manager.IsUnderDiskPressure() { | ||||
| 		t.Errorf("Manager should not report disk pressure") | ||||
| 	} | ||||
|  | ||||
| 	// try to admit our pod (should succeed) | ||||
| 	if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit { | ||||
| 		t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) | ||||
| 	} | ||||
|  | ||||
| 	// induce soft threshold | ||||
| 	fakeClock.Step(1 * time.Minute) | ||||
| 	summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats) | ||||
| 	manager.synchronize(diskInfoProvider, activePodsFunc) | ||||
|  | ||||
| 	// we should have disk pressure | ||||
| 	if !manager.IsUnderDiskPressure() { | ||||
| 		t.Errorf("Manager should report disk pressure since soft threshold was met") | ||||
| 	} | ||||
|  | ||||
| 	// verify no pod has been killed yet because not enough time has passed. | ||||
| 	if podKiller.pod != nil { | ||||
| 		t.Errorf("Manager should not have killed a pod yet, but killed: %v", podKiller.pod) | ||||
| 	} | ||||
|  | ||||
| 	// step forward in time past the grace period | ||||
| 	fakeClock.Step(3 * time.Minute) | ||||
| 	summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats) | ||||
| 	manager.synchronize(diskInfoProvider, activePodsFunc) | ||||
|  | ||||
| 	// we should have disk pressure | ||||
| 	if !manager.IsUnderDiskPressure() { | ||||
| 		t.Errorf("Manager should report disk pressure since soft threshold was met") | ||||
| 	} | ||||
|  | ||||
| 	// verify the right pod was killed with the right grace period. | ||||
| 	if podKiller.pod != pods[0] { | ||||
| 		t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0]) | ||||
| 	} | ||||
| 	if podKiller.gracePeriodOverride == nil { | ||||
| 		t.Errorf("Manager chose to kill pod but should have had a grace period override.") | ||||
| 	} | ||||
| 	observedGracePeriod := *podKiller.gracePeriodOverride | ||||
| 	if observedGracePeriod != manager.config.MaxPodGracePeriodSeconds { | ||||
| 		t.Errorf("Manager chose to kill pod with incorrect grace period.  Expected: %d, actual: %d", manager.config.MaxPodGracePeriodSeconds, observedGracePeriod) | ||||
| 	} | ||||
| 	// reset state | ||||
| 	podKiller.pod = nil | ||||
| 	podKiller.gracePeriodOverride = nil | ||||
|  | ||||
| 	// remove disk pressure | ||||
| 	fakeClock.Step(20 * time.Minute) | ||||
| 	summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats) | ||||
| 	manager.synchronize(diskInfoProvider, activePodsFunc) | ||||
|  | ||||
| 	// we should not have disk pressure | ||||
| 	if manager.IsUnderDiskPressure() { | ||||
| 		t.Errorf("Manager should not report disk pressure") | ||||
| 	} | ||||
|  | ||||
| 	// induce disk pressure! | ||||
| 	fakeClock.Step(1 * time.Minute) | ||||
| 	summaryProvider.result = summaryStatsMaker("0.5Mi", "4Mi", podStats) | ||||
| 	manager.synchronize(diskInfoProvider, activePodsFunc) | ||||
|  | ||||
| 	// we should have disk pressure | ||||
| 	if !manager.IsUnderDiskPressure() { | ||||
| 		t.Errorf("Manager should report disk pressure") | ||||
| 	} | ||||
|  | ||||
| 	// check the right pod was killed | ||||
| 	if podKiller.pod != pods[0] { | ||||
| 		t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0]) | ||||
| 	} | ||||
| 	observedGracePeriod = *podKiller.gracePeriodOverride | ||||
| 	if observedGracePeriod != int64(0) { | ||||
| 		t.Errorf("Manager chose to kill pod with incorrect grace period.  Expected: %d, actual: %d", 0, observedGracePeriod) | ||||
| 	} | ||||
|  | ||||
| 	// try to admit our pod (should fail) | ||||
| 	if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit { | ||||
| 		t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit) | ||||
| 	} | ||||
|  | ||||
| 	// reduce disk pressure | ||||
| 	fakeClock.Step(1 * time.Minute) | ||||
| 	summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats) | ||||
| 	podKiller.pod = nil // reset state | ||||
| 	manager.synchronize(diskInfoProvider, activePodsFunc) | ||||
|  | ||||
| 	// we should have disk pressure (because transition period not yet met) | ||||
| 	if !manager.IsUnderDiskPressure() { | ||||
| 		t.Errorf("Manager should report disk pressure") | ||||
| 	} | ||||
|  | ||||
| 	// no pod should have been killed | ||||
| 	if podKiller.pod != nil { | ||||
| 		t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod) | ||||
| 	} | ||||
|  | ||||
| 	// try to admit our pod (should fail) | ||||
| 	if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit { | ||||
| 		t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit) | ||||
| 	} | ||||
|  | ||||
| 	// move the clock past transition period to ensure that we stop reporting pressure | ||||
| 	fakeClock.Step(5 * time.Minute) | ||||
| 	summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats) | ||||
| 	podKiller.pod = nil // reset state | ||||
| 	manager.synchronize(diskInfoProvider, activePodsFunc) | ||||
|  | ||||
| 	// we should not have disk pressure (because transition period met) | ||||
| 	if manager.IsUnderDiskPressure() { | ||||
| 		t.Errorf("Manager should not report disk pressure") | ||||
| 	} | ||||
|  | ||||
| 	// no pod should have been killed | ||||
| 	if podKiller.pod != nil { | ||||
| 		t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod) | ||||
| 	} | ||||
|  | ||||
| 	// try to admit our pod (should succeed) | ||||
| 	if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit { | ||||
| 		t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) | ||||
| 	} | ||||
| } | ||||
|   | ||||
| @@ -41,10 +41,16 @@ const ( | ||||
| 	message = "The node was low on compute resources." | ||||
| 	// disk, in bytes.  internal to this module, used to account for local disk usage. | ||||
| 	resourceDisk api.ResourceName = "disk" | ||||
| 	// inodes, number. internal to this module, used to account for local disk inode consumption. | ||||
| 	resourceInodes api.ResourceName = "inodes" | ||||
| 	// imagefs, in bytes.  internal to this module, used to account for local image filesystem usage. | ||||
| 	resourceImageFs api.ResourceName = "imagefs" | ||||
| 	// imagefs inodes, number.  internal to this module, used to account for local image filesystem inodes. | ||||
| 	resourceImageFsInodes api.ResourceName = "imagefsInodes" | ||||
| 	// nodefs, in bytes.  internal to this module, used to account for local node root filesystem usage. | ||||
| 	resourceNodeFs api.ResourceName = "nodefs" | ||||
| 	// nodefs inodes, number.  internal to this module, used to account for local node root filesystem inodes. | ||||
| 	resourceNodeFsInodes api.ResourceName = "nodefsInodes" | ||||
| ) | ||||
|  | ||||
| var ( | ||||
| @@ -62,12 +68,16 @@ func init() { | ||||
| 	signalToNodeCondition[SignalMemoryAvailable] = api.NodeMemoryPressure | ||||
| 	signalToNodeCondition[SignalImageFsAvailable] = api.NodeDiskPressure | ||||
| 	signalToNodeCondition[SignalNodeFsAvailable] = api.NodeDiskPressure | ||||
| 	signalToNodeCondition[SignalImageFsInodesFree] = api.NodeDiskPressure | ||||
| 	signalToNodeCondition[SignalNodeFsInodesFree] = api.NodeDiskPressure | ||||
|  | ||||
| 	// map signals to resources (and vice-versa) | ||||
| 	signalToResource = map[Signal]api.ResourceName{} | ||||
| 	signalToResource[SignalMemoryAvailable] = api.ResourceMemory | ||||
| 	signalToResource[SignalImageFsAvailable] = resourceImageFs | ||||
| 	signalToResource[SignalImageFsInodesFree] = resourceImageFsInodes | ||||
| 	signalToResource[SignalNodeFsAvailable] = resourceNodeFs | ||||
| 	signalToResource[SignalNodeFsInodesFree] = resourceNodeFsInodes | ||||
| 	resourceToSignal = map[api.ResourceName]Signal{} | ||||
| 	for key, value := range signalToResource { | ||||
| 		resourceToSignal[value] = key | ||||
| @@ -185,22 +195,21 @@ func parseThresholdStatement(statement string) (Threshold, error) { | ||||
| 				Percentage: percentage, | ||||
| 			}, | ||||
| 		}, nil | ||||
| 	} else { | ||||
| 		quantity, err := resource.ParseQuantity(quantityValue) | ||||
| 		if err != nil { | ||||
| 			return Threshold{}, err | ||||
| 		} | ||||
| 		if quantity.Sign() < 0 || quantity.IsZero() { | ||||
| 			return Threshold{}, fmt.Errorf("eviction threshold %v must be positive: %s", signal, &quantity) | ||||
| 		} | ||||
| 		return Threshold{ | ||||
| 			Signal:   signal, | ||||
| 			Operator: operator, | ||||
| 			Value: ThresholdValue{ | ||||
| 				Quantity: &quantity, | ||||
| 			}, | ||||
| 		}, nil | ||||
| 	} | ||||
| 	quantity, err := resource.ParseQuantity(quantityValue) | ||||
| 	if err != nil { | ||||
| 		return Threshold{}, err | ||||
| 	} | ||||
| 	if quantity.Sign() < 0 || quantity.IsZero() { | ||||
| 		return Threshold{}, fmt.Errorf("eviction threshold %v must be positive: %s", signal, &quantity) | ||||
| 	} | ||||
| 	return Threshold{ | ||||
| 		Signal:   signal, | ||||
| 		Operator: operator, | ||||
| 		Value: ThresholdValue{ | ||||
| 			Quantity: &quantity, | ||||
| 		}, | ||||
| 	}, nil | ||||
| } | ||||
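For reference, the statements this parser consumes come straight from the eviction flags, e.g. `nodefs.inodesFree<1Mi`. Below is a toy, dependency-free split of one such statement; the real function additionally validates the signal name and parses the value with `resource.ParseQuantity` (or as a percentage), as shown above.

```go
package main

import (
	"fmt"
	"strings"
)

// parseStatement is a toy version of parseThresholdStatement: it only splits
// the "<signal><operator><value>" grammar used by the eviction flags and
// performs none of the signal or quantity validation the real parser does.
func parseStatement(statement string) (signal, operator, value string, err error) {
	const lessThan = "<"
	idx := strings.Index(statement, lessThan)
	if idx < 0 {
		return "", "", "", fmt.Errorf("unsupported operator in %q", statement)
	}
	signal, operator, value = statement[:idx], lessThan, statement[idx+len(lessThan):]
	if signal == "" || value == "" {
		return "", "", "", fmt.Errorf("invalid statement %q", statement)
	}
	return signal, operator, value, nil
}

func main() {
	sig, op, val, err := parseStatement("nodefs.inodesFree<1Mi")
	fmt.Println(sig, op, val, err) // nodefs.inodesFree < 1Mi <nil>
}
```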
|  | ||||
| // parsePercentage parses a string representing a percentage value | ||||
| @@ -287,6 +296,18 @@ func diskUsage(fsStats *statsapi.FsStats) *resource.Quantity { | ||||
| 	return resource.NewQuantity(usage, resource.BinarySI) | ||||
| } | ||||
|  | ||||
| // inodeUsage converts inodes consumed into a resource quantity. | ||||
| func inodeUsage(fsStats *statsapi.FsStats) *resource.Quantity { | ||||
| 	// TODO: cadvisor needs to support inodes used per container | ||||
| 	// right now, cadvisor reports total inodes and inodes free per filesystem. | ||||
| 	// this is insufficient to know how many inodes are consumed by the container. | ||||
| 	// for example, with the overlay driver, the rootfs and each container filesystem | ||||
| 	// will report the same total inode and inode free values but no way of knowing | ||||
| 	// how many inodes consumed in that filesystem are charged to this container. | ||||
| 	// for now, we report 0 as inode usage pending support in cadvisor. | ||||
| 	return resource.NewQuantity(int64(0), resource.BinarySI) | ||||
| } | ||||
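If cadvisor later reports per-container inode consumption (the issue linked in the docs above), this helper could return a real value instead of zero. A hypothetical sketch follows, using a local stand-in type with an assumed `InodesUsed` field; that field does not exist in the current stats API.

```go
package main

import "fmt"

// fsStats is a local stand-in for statsapi.FsStats; InodesUsed is an
// assumed future per-container counter, not a field that exists today.
type fsStats struct {
	InodesUsed *uint64
}

// inodeUsage returns the inodes charged to this container's filesystem,
// falling back to 0 when the counter is unavailable (today's behavior).
func inodeUsage(fs *fsStats) uint64 {
	if fs == nil || fs.InodesUsed == nil {
		return 0
	}
	return *fs.InodesUsed
}

func main() {
	used := uint64(4096)
	fmt.Println(inodeUsage(&fsStats{InodesUsed: &used})) // 4096
	fmt.Println(inodeUsage(nil))                         // 0
}
```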
|  | ||||
| // memoryUsage converts working set into a resource quantity. | ||||
| func memoryUsage(memStats *statsapi.MemoryStats) *resource.Quantity { | ||||
| 	if memStats == nil || memStats.WorkingSetBytes == nil { | ||||
| @@ -311,15 +332,18 @@ func localVolumeNames(pod *api.Pod) []string { | ||||
| 	return result | ||||
| } | ||||
|  | ||||
| // podDiskUsage aggregates pod disk usage for the specified stats to measure. | ||||
| // podDiskUsage aggregates pod disk usage and inode consumption for the specified stats to measure. | ||||
| func podDiskUsage(podStats statsapi.PodStats, pod *api.Pod, statsToMeasure []fsStatsType) (api.ResourceList, error) { | ||||
| 	disk := resource.Quantity{Format: resource.BinarySI} | ||||
| 	inodes := resource.Quantity{Format: resource.BinarySI} | ||||
| 	for _, container := range podStats.Containers { | ||||
| 		if hasFsStatsType(statsToMeasure, fsStatsRoot) { | ||||
| 			disk.Add(*diskUsage(container.Rootfs)) | ||||
| 			inodes.Add(*inodeUsage(container.Rootfs)) | ||||
| 		} | ||||
| 		if hasFsStatsType(statsToMeasure, fsStatsLogs) { | ||||
| 			disk.Add(*diskUsage(container.Logs)) | ||||
| 			inodes.Add(*inodeUsage(container.Logs)) | ||||
| 		} | ||||
| 	} | ||||
| 	if hasFsStatsType(statsToMeasure, fsStatsLocalVolumeSource) { | ||||
| @@ -328,13 +352,15 @@ func podDiskUsage(podStats statsapi.PodStats, pod *api.Pod, statsToMeasure []fsS | ||||
| 			for _, volumeStats := range podStats.VolumeStats { | ||||
| 				if volumeStats.Name == volumeName { | ||||
| 					disk.Add(*diskUsage(&volumeStats.FsStats)) | ||||
| 					inodes.Add(*inodeUsage(&volumeStats.FsStats)) | ||||
| 					break | ||||
| 				} | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 	return api.ResourceList{ | ||||
| 		resourceDisk: disk, | ||||
| 		resourceDisk:   disk, | ||||
| 		resourceInodes: inodes, | ||||
| 	}, nil | ||||
| } | ||||
|  | ||||
| @@ -502,8 +528,8 @@ func memory(stats statsFunc) cmpFunc { | ||||
| 	} | ||||
| } | ||||
|  | ||||
| // disk compares pods by largest consumer of disk relative to request. | ||||
| func disk(stats statsFunc, fsStatsToMeasure []fsStatsType) cmpFunc { | ||||
| // disk compares pods by largest consumer of disk relative to request for the specified disk resource. | ||||
| func disk(stats statsFunc, fsStatsToMeasure []fsStatsType, diskResource api.ResourceName) cmpFunc { | ||||
| 	return func(p1, p2 *api.Pod) int { | ||||
| 		p1Stats, found := stats(p1) | ||||
| 		// if we have no usage stats for p1, we want p2 first | ||||
| @@ -528,8 +554,8 @@ func disk(stats statsFunc, fsStatsToMeasure []fsStatsType) cmpFunc { | ||||
|  | ||||
| 		// disk is best effort, so we don't measure relative to a request. | ||||
| 		// TODO: add disk as a guaranteed resource | ||||
| 		p1Disk := p1Usage[resourceDisk] | ||||
| 		p2Disk := p2Usage[resourceDisk] | ||||
| 		p1Disk := p1Usage[diskResource] | ||||
| 		p2Disk := p2Usage[diskResource] | ||||
| 		// if p2 is using more than p1, we want p2 first | ||||
| 		return p2Disk.Cmp(p1Disk) | ||||
| 	} | ||||
| @@ -541,9 +567,9 @@ func rankMemoryPressure(pods []*api.Pod, stats statsFunc) { | ||||
| } | ||||
|  | ||||
| // rankDiskPressureFunc returns a rankFunc that measures the specified fs stats. | ||||
| func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType) rankFunc { | ||||
| func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType, diskResource api.ResourceName) rankFunc { | ||||
| 	return func(pods []*api.Pod, stats statsFunc) { | ||||
| 		orderedBy(qosComparator, disk(stats, fsStatsToMeasure)).Sort(pods) | ||||
| 		orderedBy(qosComparator, disk(stats, fsStatsToMeasure, diskResource)).Sort(pods) | ||||
| 	} | ||||
| } | ||||
|  | ||||
| @@ -564,6 +590,7 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObserv | ||||
| 	if err != nil { | ||||
| 		return nil, nil, err | ||||
| 	} | ||||
|  | ||||
| 	// build the function to work against for pod stats | ||||
| 	statsFunc := cachedStatsFunc(summary.Pods) | ||||
| 	// build an evaluation context for current eviction signals | ||||
| @@ -575,17 +602,33 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObserv | ||||
| 			capacity:  resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI), | ||||
| 		} | ||||
| 	} | ||||
| 	if nodeFs := summary.Node.Fs; nodeFs != nil && nodeFs.AvailableBytes != nil && nodeFs.CapacityBytes != nil { | ||||
| 		result[SignalNodeFsAvailable] = signalObservation{ | ||||
| 			available: resource.NewQuantity(int64(*nodeFs.AvailableBytes), resource.BinarySI), | ||||
| 			capacity:  resource.NewQuantity(int64(*nodeFs.CapacityBytes), resource.BinarySI), | ||||
| 	if nodeFs := summary.Node.Fs; nodeFs != nil { | ||||
| 		if nodeFs.AvailableBytes != nil && nodeFs.CapacityBytes != nil { | ||||
| 			result[SignalNodeFsAvailable] = signalObservation{ | ||||
| 				available: resource.NewQuantity(int64(*nodeFs.AvailableBytes), resource.BinarySI), | ||||
| 				capacity:  resource.NewQuantity(int64(*nodeFs.CapacityBytes), resource.BinarySI), | ||||
| 			} | ||||
| 		} | ||||
| 		if nodeFs.InodesFree != nil && nodeFs.Inodes != nil { | ||||
| 			result[SignalNodeFsInodesFree] = signalObservation{ | ||||
| 				available: resource.NewQuantity(int64(*nodeFs.InodesFree), resource.BinarySI), | ||||
| 				capacity:  resource.NewQuantity(int64(*nodeFs.Inodes), resource.BinarySI), | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 	if summary.Node.Runtime != nil { | ||||
| 		if imageFs := summary.Node.Runtime.ImageFs; imageFs != nil && imageFs.AvailableBytes != nil && imageFs.CapacityBytes != nil { | ||||
| 			result[SignalImageFsAvailable] = signalObservation{ | ||||
| 				available: resource.NewQuantity(int64(*imageFs.AvailableBytes), resource.BinarySI), | ||||
| 				capacity:  resource.NewQuantity(int64(*imageFs.CapacityBytes), resource.BinarySI), | ||||
| 		if imageFs := summary.Node.Runtime.ImageFs; imageFs != nil { | ||||
| 			if imageFs.AvailableBytes != nil && imageFs.CapacityBytes != nil { | ||||
| 				result[SignalImageFsAvailable] = signalObservation{ | ||||
| 					available: resource.NewQuantity(int64(*imageFs.AvailableBytes), resource.BinarySI), | ||||
| 					capacity:  resource.NewQuantity(int64(*imageFs.CapacityBytes), resource.BinarySI), | ||||
| 				} | ||||
| 				if imageFs.InodesFree != nil && imageFs.Inodes != nil { | ||||
| 					result[SignalImageFsInodesFree] = signalObservation{ | ||||
| 						available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.BinarySI), | ||||
| 						capacity:  resource.NewQuantity(int64(*imageFs.Inodes), resource.BinarySI), | ||||
| 					} | ||||
| 				} | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| @@ -785,16 +828,20 @@ func buildResourceToRankFunc(withImageFs bool) map[api.ResourceName]rankFunc { | ||||
| 	// usage of an imagefs is optional | ||||
| 	if withImageFs { | ||||
| 		// with an imagefs, nodefs pod rank func for eviction only includes logs and local volumes | ||||
| 		resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}) | ||||
| 		resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk) | ||||
| 		resourceToRankFunc[resourceNodeFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes) | ||||
| 		// with an imagefs, imagefs pod rank func for eviction only includes rootfs | ||||
| 		resourceToRankFunc[resourceImageFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}) | ||||
| 		resourceToRankFunc[resourceImageFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceDisk) | ||||
| 		resourceToRankFunc[resourceImageFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceInodes) | ||||
| 	} else { | ||||
| 		// without an imagefs, nodefs pod rank func for eviction looks at all fs stats | ||||
| 		resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}) | ||||
| 		resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk) | ||||
| 		resourceToRankFunc[resourceNodeFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes) | ||||
| 	} | ||||
| 	return resourceToRankFunc | ||||
| } | ||||
|  | ||||
| // PodIsEvicted returns true if the reported pod status is due to an eviction. | ||||
| func PodIsEvicted(podStatus api.PodStatus) bool { | ||||
| 	return podStatus.Phase == api.PodFailed && podStatus.Reason == reason | ||||
| } | ||||
| @@ -806,11 +853,14 @@ func buildResourceToNodeReclaimFuncs(imageGC ImageGC, withImageFs bool) map[api. | ||||
| 	if withImageFs { | ||||
| 		// with an imagefs, nodefs pressure should just delete logs | ||||
| 		resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs()} | ||||
| 		resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{deleteLogs()} | ||||
| 		// with an imagefs, imagefs pressure should delete unused images | ||||
| 		resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteImages(imageGC)} | ||||
| 		resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteImages(imageGC, true)} | ||||
| 		resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{deleteImages(imageGC, false)} | ||||
| 	} else { | ||||
| 		// without an imagefs, nodefs pressure should delete logs, and unused images | ||||
| 		resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC)} | ||||
| 		resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, true)} | ||||
| 		resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, false)} | ||||
| 	} | ||||
| 	return resourceToReclaimFunc | ||||
| } | ||||
| @@ -824,13 +874,17 @@ func deleteLogs() nodeReclaimFunc { | ||||
| } | ||||
|  | ||||
| // deleteImages will delete unused images to free up disk pressure. | ||||
| func deleteImages(imageGC ImageGC) nodeReclaimFunc { | ||||
| func deleteImages(imageGC ImageGC, reportBytesFreed bool) nodeReclaimFunc { | ||||
| 	return func() (*resource.Quantity, error) { | ||||
| 		glog.Infof("eviction manager: attempting to delete unused images") | ||||
| 		reclaimed, err := imageGC.DeleteUnusedImages() | ||||
| 		bytesFreed, err := imageGC.DeleteUnusedImages() | ||||
| 		if err != nil { | ||||
| 			return nil, err | ||||
| 		} | ||||
| 		reclaimed := int64(0) | ||||
| 		if reportBytesFreed { | ||||
| 			reclaimed = bytesFreed | ||||
| 		} | ||||
| 		return resource.NewQuantity(reclaimed, resource.BinarySI), nil | ||||
| 	} | ||||
| } | ||||
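The new reportBytesFreed flag exists because reclaim progress is accounted in the units of the threshold's resource: bytes freed by image GC count toward a disk-space threshold, but say nothing about how many inodes were released, so the inode variants report zero and let the next stats observation decide whether pressure persists. A standalone sketch of that behavior follows; the `imageGC` interface here is only inferred from the `DeleteUnusedImages` call in this diff, not the kubelet's actual definition.

```go
package main

import "fmt"

// imageGC is inferred from the DeleteUnusedImages call in this diff; it is
// not the kubelet's actual interface definition.
type imageGC interface {
	DeleteUnusedImages() (int64, error)
}

type fakeImageGC struct{ freed int64 }

func (f *fakeImageGC) DeleteUnusedImages() (int64, error) { return f.freed, nil }

// deleteImages mirrors the shape of the reclaim function: image GC always
// runs, but the freed bytes are only reported as reclaimed resource when
// the threshold being serviced is measured in bytes.
func deleteImages(gc imageGC, reportBytesFreed bool) (int64, error) {
	bytesFreed, err := gc.DeleteUnusedImages()
	if err != nil {
		return 0, err
	}
	if !reportBytesFreed {
		return 0, nil // inode thresholds: freed bytes are not meaningful progress
	}
	return bytesFreed, nil
}

func main() {
	gc := &fakeImageGC{freed: 1 << 30}
	forDiskSpace, _ := deleteImages(gc, true)
	forInodes, _ := deleteImages(gc, false)
	fmt.Println(forDiskSpace, forInodes) // 1073741824 0
}
```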
|   | ||||
| @@ -191,6 +191,49 @@ func TestParseThresholdConfig(t *testing.T) { | ||||
| 				}, | ||||
| 			}, | ||||
| 		}, | ||||
| 		"inode flag values": { | ||||
| 			evictionHard:            "imagefs.inodesFree<150Mi,nodefs.inodesFree<100Mi", | ||||
| 			evictionSoft:            "imagefs.inodesFree<300Mi,nodefs.inodesFree<200Mi", | ||||
| 			evictionSoftGracePeriod: "imagefs.inodesFree=30s,nodefs.inodesFree=30s", | ||||
| 			evictionMinReclaim:      "imagefs.inodesFree=2Gi,nodefs.inodesFree=1Gi", | ||||
| 			expectErr:               false, | ||||
| 			expectThresholds: []Threshold{ | ||||
| 				{ | ||||
| 					Signal:   SignalImageFsInodesFree, | ||||
| 					Operator: OpLessThan, | ||||
| 					Value: ThresholdValue{ | ||||
| 						Quantity: quantityMustParse("150Mi"), | ||||
| 					}, | ||||
| 					MinReclaim: quantityMustParse("2Gi"), | ||||
| 				}, | ||||
| 				{ | ||||
| 					Signal:   SignalNodeFsInodesFree, | ||||
| 					Operator: OpLessThan, | ||||
| 					Value: ThresholdValue{ | ||||
| 						Quantity: quantityMustParse("100Mi"), | ||||
| 					}, | ||||
| 					MinReclaim: quantityMustParse("1Gi"), | ||||
| 				}, | ||||
| 				{ | ||||
| 					Signal:   SignalImageFsInodesFree, | ||||
| 					Operator: OpLessThan, | ||||
| 					Value: ThresholdValue{ | ||||
| 						Quantity: quantityMustParse("300Mi"), | ||||
| 					}, | ||||
| 					GracePeriod: gracePeriod, | ||||
| 					MinReclaim:  quantityMustParse("2Gi"), | ||||
| 				}, | ||||
| 				{ | ||||
| 					Signal:   SignalNodeFsInodesFree, | ||||
| 					Operator: OpLessThan, | ||||
| 					Value: ThresholdValue{ | ||||
| 						Quantity: quantityMustParse("200Mi"), | ||||
| 					}, | ||||
| 					GracePeriod: gracePeriod, | ||||
| 					MinReclaim:  quantityMustParse("1Gi"), | ||||
| 				}, | ||||
| 			}, | ||||
| 		}, | ||||
| 		"invalid-signal": { | ||||
| 			evictionHard:            "mem.available<150Mi", | ||||
| 			evictionSoft:            "", | ||||
| @@ -400,7 +443,7 @@ func TestOrderedByDisk(t *testing.T) { | ||||
| 		return result, found | ||||
| 	} | ||||
| 	pods := []*api.Pod{pod1, pod2, pod3, pod4, pod5, pod6} | ||||
| 	orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource})).Sort(pods) | ||||
| 	orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)).Sort(pods) | ||||
| 	expected := []*api.Pod{pod6, pod5, pod4, pod3, pod2, pod1} | ||||
| 	for i := range expected { | ||||
| 		if pods[i] != expected[i] { | ||||
| @@ -466,7 +509,7 @@ func TestOrderedByQoSDisk(t *testing.T) { | ||||
| 		return result, found | ||||
| 	} | ||||
| 	pods := []*api.Pod{pod1, pod2, pod3, pod4, pod5, pod6} | ||||
| 	orderedBy(qosComparator, disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource})).Sort(pods) | ||||
| 	orderedBy(qosComparator, disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)).Sort(pods) | ||||
| 	expected := []*api.Pod{pod2, pod1, pod4, pod3, pod6, pod5} | ||||
| 	for i := range expected { | ||||
| 		if pods[i] != expected[i] { | ||||
| @@ -608,6 +651,10 @@ func TestMakeSignalObservations(t *testing.T) { | ||||
| 	imageFsCapacityBytes := uint64(1024 * 1024 * 2) | ||||
| 	nodeFsAvailableBytes := uint64(1024) | ||||
| 	nodeFsCapacityBytes := uint64(1024 * 2) | ||||
| 	imageFsInodesFree := uint64(1024) | ||||
| 	imageFsInodes := uint64(1024 * 1024) | ||||
| 	nodeFsInodesFree := uint64(1024) | ||||
| 	nodeFsInodes := uint64(1024 * 1024) | ||||
| 	fakeStats := &statsapi.Summary{ | ||||
| 		Node: statsapi.NodeStats{ | ||||
| 			Memory: &statsapi.MemoryStats{ | ||||
| @@ -618,11 +665,15 @@ func TestMakeSignalObservations(t *testing.T) { | ||||
| 				ImageFs: &statsapi.FsStats{ | ||||
| 					AvailableBytes: &imageFsAvailableBytes, | ||||
| 					CapacityBytes:  &imageFsCapacityBytes, | ||||
| 					InodesFree:     &imageFsInodesFree, | ||||
| 					Inodes:         &imageFsInodes, | ||||
| 				}, | ||||
| 			}, | ||||
| 			Fs: &statsapi.FsStats{ | ||||
| 				AvailableBytes: &nodeFsAvailableBytes, | ||||
| 				CapacityBytes:  &nodeFsCapacityBytes, | ||||
| 				InodesFree:     &nodeFsInodesFree, | ||||
| 				Inodes:         &nodeFsInodes, | ||||
| 			}, | ||||
| 		}, | ||||
| 		Pods: []statsapi.PodStats{}, | ||||
| @@ -664,6 +715,16 @@ func TestMakeSignalObservations(t *testing.T) { | ||||
| 	if expectedBytes := int64(nodeFsCapacityBytes); nodeFsQuantity.capacity.Value() != expectedBytes { | ||||
| 		t.Errorf("Expected %v, actual: %v", expectedBytes, nodeFsQuantity.capacity.Value()) | ||||
| 	} | ||||
| 	nodeFsInodesQuantity, found := actualObservations[SignalNodeFsInodesFree] | ||||
| 	if !found { | ||||
| 		t.Errorf("Expected inodes free nodefs observation: %v", err) | ||||
| 	} | ||||
| 	if expected := int64(nodeFsInodesFree); nodeFsInodesQuantity.available.Value() != expected { | ||||
| 		t.Errorf("Expected %v, actual: %v", expected, nodeFsInodesQuantity.available.Value()) | ||||
| 	} | ||||
| 	if expected := int64(nodeFsInodes); nodeFsInodesQuantity.capacity.Value() != expected { | ||||
| 		t.Errorf("Expected %v, actual: %v", expected, nodeFsInodesQuantity.capacity.Value()) | ||||
| 	} | ||||
| 	imageFsQuantity, found := actualObservations[SignalImageFsAvailable] | ||||
| 	if !found { | ||||
| 		t.Errorf("Expected available imagefs observation: %v", err) | ||||
| @@ -674,6 +735,16 @@ func TestMakeSignalObservations(t *testing.T) { | ||||
| 	if expectedBytes := int64(imageFsCapacityBytes); imageFsQuantity.capacity.Value() != expectedBytes { | ||||
| 		t.Errorf("Expected %v, actual: %v", expectedBytes, imageFsQuantity.capacity.Value()) | ||||
| 	} | ||||
| 	imageFsInodesQuantity, found := actualObservations[SignalImageFsInodesFree] | ||||
| 	if !found { | ||||
| 		t.Errorf("Expected inodes free imagefs observation: %v", err) | ||||
| 	} | ||||
| 	if expected := int64(imageFsInodesFree); imageFsInodesQuantity.available.Value() != expected { | ||||
| 		t.Errorf("Expected %v, actual: %v", expected, imageFsInodesQuantity.available.Value()) | ||||
| 	} | ||||
| 	if expected := int64(imageFsInodes); imageFsInodesQuantity.capacity.Value() != expected { | ||||
| 		t.Errorf("Expected %v, actual: %v", expected, imageFsInodesQuantity.capacity.Value()) | ||||
| 	} | ||||
| 	for _, pod := range pods { | ||||
| 		podStats, found := statsFunc(pod) | ||||
| 		if !found { | ||||
| @@ -1204,6 +1275,22 @@ func testCompareThresholdValue(t *testing.T) { | ||||
| 	} | ||||
| } | ||||
|  | ||||
| // newPodInodeStats returns stats with specified usage amounts. | ||||
| // TODO: in future, this should take a value for inodesUsed per container. | ||||
| func newPodInodeStats(pod *api.Pod) statsapi.PodStats { | ||||
| 	result := statsapi.PodStats{ | ||||
| 		PodRef: statsapi.PodReference{ | ||||
| 			Name: pod.Name, Namespace: pod.Namespace, UID: string(pod.UID), | ||||
| 		}, | ||||
| 	} | ||||
| 	for range pod.Spec.Containers { | ||||
| 		result.Containers = append(result.Containers, statsapi.ContainerStats{ | ||||
| 			Rootfs: &statsapi.FsStats{}, | ||||
| 		}) | ||||
| 	} | ||||
| 	return result | ||||
| } | ||||
|  | ||||
| // newPodDiskStats returns stats with specified usage amounts. | ||||
| func newPodDiskStats(pod *api.Pod, rootFsUsed, logsUsed, perLocalVolumeUsed resource.Quantity) statsapi.PodStats { | ||||
| 	result := statsapi.PodStats{ | ||||
|   | ||||
| @@ -32,8 +32,12 @@ const ( | ||||
| 	SignalMemoryAvailable Signal = "memory.available" | ||||
| 	// SignalNodeFsAvailable is amount of storage available on filesystem that kubelet uses for volumes, daemon logs, etc. | ||||
| 	SignalNodeFsAvailable Signal = "nodefs.available" | ||||
| 	// SignalNodeFsInodesFree is amount of inodes available on filesystem that kubelet uses for volumes, daemon logs, etc. | ||||
| 	SignalNodeFsInodesFree Signal = "nodefs.inodesFree" | ||||
| 	// SignalImageFsAvailable is amount of storage available on filesystem that container runtime uses for storing images and container writable layers. | ||||
| 	SignalImageFsAvailable Signal = "imagefs.available" | ||||
| 	// SignalImageFsInodesFree is amount of inodes available on filesystem that container runtime uses for storing images and container writable layers. | ||||
| 	SignalImageFsInodesFree Signal = "imagefs.inodesFree" | ||||
| ) | ||||
|  | ||||
| // fsStatsType defines the types of filesystem stats to collect. | ||||
|   | ||||