Merge pull request #30311 from derekwaynecarr/inode_eviction
Automatic merge from submit-queue

kubelet eviction on inode exhaustion

Add support for the kubelet to monitor for inode exhaustion of either the image filesystem or the root filesystem and, in response, attempt to reclaim node-level resources and/or evict pods.
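The inode thresholds this PR adds (`nodefs.inodesFree`, `imagefs.inodesFree`) follow the same hard/soft semantics as the existing eviction signals: a hard threshold triggers as soon as it is breached, while a soft threshold must stay breached for its grace period. Below is a minimal standalone sketch of that comparison; the `threshold` and `shouldEvict` names are simplified stand-ins, not the actual manager types, and the values mirror the 1Mi hard / 2Mi-with-2m-grace configuration used in the test further down.

```go
package main

import (
	"fmt"
	"time"
)

// threshold is a simplified stand-in for the eviction manager's Threshold:
// the signal is breached when the observed value drops below Min, and a
// non-zero GracePeriod makes it a "soft" threshold.
type threshold struct {
	Min         uint64
	GracePeriod time.Duration
}

// shouldEvict reports whether a threshold warrants eviction, given the free
// inodes observed now and how long the breach has already been observed.
func shouldEvict(t threshold, inodesFree uint64, breachedFor time.Duration) bool {
	if inodesFree >= t.Min {
		return false // signal not breached
	}
	// Hard thresholds (no grace period) act immediately; soft thresholds
	// must stay breached for at least their grace period.
	return breachedFor >= t.GracePeriod
}

func main() {
	hard := threshold{Min: 1 << 20}                               // evict when < ~1Mi inodes free
	soft := threshold{Min: 2 << 20, GracePeriod: 2 * time.Minute} // evict when < ~2Mi for 2m

	fmt.Println(shouldEvict(hard, 1_500_000, 0))             // false: above the hard minimum
	fmt.Println(shouldEvict(soft, 1_500_000, 1*time.Minute)) // false: still within grace period
	fmt.Println(shouldEvict(soft, 1_500_000, 3*time.Minute)) // true: grace period elapsed
}
```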
		| @@ -478,9 +478,19 @@ for eviction. Instead `DaemonSet` should ideally include Guaranteed pods only. | ||||
|  | ||||
| ## Known issues | ||||
|  | ||||
| ### kubelet may evict more pods than needed | ||||
|  | ||||
| Pod eviction may evict more pods than needed due to a timing gap in stats collection. This can be mitigated in the future by adding | ||||
| the ability to get root container stats on an on-demand basis (https://github.com/google/cadvisor/issues/1247). | ||||
|  | ||||
| ### How kubelet ranks pods for eviction in response to inode exhaustion | ||||
|  | ||||
| At this time, it is not possible to know how many inodes were consumed by a particular container.  If the `kubelet` observes | ||||
| inode exhaustion, it will evict pods by ranking them by quality of service.  The following issue has been opened in cadvisor | ||||
| to track per container inode consumption (https://github.com/google/cadvisor/issues/1422) which would allow us to rank pods | ||||
| by inode consumption.  For example, this would let us identify a container that created a large number of 0-byte files and evict | ||||
| that pod ahead of others. | ||||
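Until per-container inode usage is available, ranking for inode pressure effectively reduces to ordering pods by quality of service: best-effort pods are evicted before burstable, and burstable before guaranteed. A minimal sketch of that ordering follows; the `pod` and `qosClass` types are simplified stand-ins for the real API and qos package, not the kubelet's implementation.

```go
package main

import (
	"fmt"
	"sort"
)

// qosClass is a simplified stand-in for the Kubernetes QoS classes.
type qosClass int

const (
	bestEffort qosClass = iota // no requests or limits set
	burstable                  // requests set, but below limits
	guaranteed                 // requests equal limits for every resource
)

type pod struct {
	name string
	qos  qosClass
}

// rankForInodePressure orders pods so the preferred eviction victim comes
// first: best-effort, then burstable, then guaranteed. Once per-container
// inode usage exists, it would serve as a tie-breaker within each class.
func rankForInodePressure(pods []pod) {
	sort.SliceStable(pods, func(i, j int) bool {
		return pods[i].qos < pods[j].qos
	})
}

func main() {
	pods := []pod{
		{"guaranteed-high", guaranteed},
		{"burstable-low", burstable},
		{"best-effort-high", bestEffort},
	}
	rankForInodePressure(pods)
	fmt.Println(pods[0].name) // best-effort-high is the first eviction candidate
}
```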
|  | ||||
| <!-- BEGIN MUNGE: GENERATED_ANALYTICS --> | ||||
| []() | ||||
| <!-- END MUNGE: GENERATED_ANALYTICS --> | ||||
|   | ||||
| @@ -96,6 +96,7 @@ pkg/credentialprovider/aws | ||||
| pkg/hyperkube | ||||
| pkg/kubelet/api | ||||
| pkg/kubelet/container | ||||
| pkg/kubelet/eviction | ||||
| pkg/kubelet/envvars | ||||
| pkg/kubelet/util/format | ||||
| pkg/kubelet/util/ioutils | ||||
|   | ||||
| @@ -914,3 +914,228 @@ func TestNodeReclaimFuncs(t *testing.T) { | ||||
| 		t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod) | ||||
| 	} | ||||
| } | ||||
|  | ||||
| func TestDiskPressureNodeFsInodes(t *testing.T) { | ||||
| 	// TODO: we need to know inodes used when cadvisor supports per container stats | ||||
| 	podMaker := func(name string, requests api.ResourceList, limits api.ResourceList) (*api.Pod, statsapi.PodStats) { | ||||
| 		pod := newPod(name, []api.Container{ | ||||
| 			newContainer(name, requests, limits), | ||||
| 		}, nil) | ||||
| 		podStats := newPodInodeStats(pod) | ||||
| 		return pod, podStats | ||||
| 	} | ||||
| 	summaryStatsMaker := func(rootFsInodesFree, rootFsInodes string, podStats map[*api.Pod]statsapi.PodStats) *statsapi.Summary { | ||||
| 		rootFsInodesFreeVal := resource.MustParse(rootFsInodesFree) | ||||
| 		internalRootFsInodesFree := uint64(rootFsInodesFreeVal.Value()) | ||||
| 		rootFsInodesVal := resource.MustParse(rootFsInodes) | ||||
| 		internalRootFsInodes := uint64(rootFsInodesVal.Value()) | ||||
| 		result := &statsapi.Summary{ | ||||
| 			Node: statsapi.NodeStats{ | ||||
| 				Fs: &statsapi.FsStats{ | ||||
| 					InodesFree: &internalRootFsInodesFree, | ||||
| 					Inodes:     &internalRootFsInodes, | ||||
| 				}, | ||||
| 			}, | ||||
| 			Pods: []statsapi.PodStats{}, | ||||
| 		} | ||||
| 		for _, podStat := range podStats { | ||||
| 			result.Pods = append(result.Pods, podStat) | ||||
| 		} | ||||
| 		return result | ||||
| 	} | ||||
| 	// TODO: pass inodes used in future when supported by cadvisor. | ||||
| 	podsToMake := []struct { | ||||
| 		name     string | ||||
| 		requests api.ResourceList | ||||
| 		limits   api.ResourceList | ||||
| 	}{ | ||||
| 		{name: "best-effort-high", requests: newResourceList("", ""), limits: newResourceList("", "")}, | ||||
| 		{name: "best-effort-low", requests: newResourceList("", ""), limits: newResourceList("", "")}, | ||||
| 		{name: "burstable-high", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi")}, | ||||
| 		{name: "burstable-low", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi")}, | ||||
| 		{name: "guaranteed-high", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi")}, | ||||
| 		{name: "guaranteed-low", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi")}, | ||||
| 	} | ||||
| 	pods := []*api.Pod{} | ||||
| 	podStats := map[*api.Pod]statsapi.PodStats{} | ||||
| 	for _, podToMake := range podsToMake { | ||||
| 		pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits) | ||||
| 		pods = append(pods, pod) | ||||
| 		podStats[pod] = podStat | ||||
| 	} | ||||
| 	activePodsFunc := func() []*api.Pod { | ||||
| 		return pods | ||||
| 	} | ||||
|  | ||||
| 	fakeClock := clock.NewFakeClock(time.Now()) | ||||
| 	podKiller := &mockPodKiller{} | ||||
| 	diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} | ||||
| 	imageGC := &mockImageGC{freed: int64(0), err: nil} | ||||
| 	nodeRef := &api.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} | ||||
|  | ||||
| 	config := Config{ | ||||
| 		MaxPodGracePeriodSeconds: 5, | ||||
| 		PressureTransitionPeriod: time.Minute * 5, | ||||
| 		Thresholds: []Threshold{ | ||||
| 			{ | ||||
| 				Signal:   SignalNodeFsInodesFree, | ||||
| 				Operator: OpLessThan, | ||||
| 				Value: ThresholdValue{ | ||||
| 					Quantity: quantityMustParse("1Mi"), | ||||
| 				}, | ||||
| 			}, | ||||
| 			{ | ||||
| 				Signal:   SignalNodeFsInodesFree, | ||||
| 				Operator: OpLessThan, | ||||
| 				Value: ThresholdValue{ | ||||
| 					Quantity: quantityMustParse("2Mi"), | ||||
| 				}, | ||||
| 				GracePeriod: time.Minute * 2, | ||||
| 			}, | ||||
| 		}, | ||||
| 	} | ||||
| 	summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("3Mi", "4Mi", podStats)} | ||||
| 	manager := &managerImpl{ | ||||
| 		clock:           fakeClock, | ||||
| 		killPodFunc:     podKiller.killPodNow, | ||||
| 		imageGC:         imageGC, | ||||
| 		config:          config, | ||||
| 		recorder:        &record.FakeRecorder{}, | ||||
| 		summaryProvider: summaryProvider, | ||||
| 		nodeRef:         nodeRef, | ||||
| 		nodeConditionsLastObservedAt: nodeConditionsObservedAt{}, | ||||
| 		thresholdsFirstObservedAt:    thresholdsObservedAt{}, | ||||
| 	} | ||||
|  | ||||
| 	// create a best effort pod to test admission | ||||
| 	podToAdmit, _ := podMaker("pod-to-admit", newResourceList("", ""), newResourceList("", "")) | ||||
|  | ||||
| 	// synchronize | ||||
| 	manager.synchronize(diskInfoProvider, activePodsFunc) | ||||
|  | ||||
| 	// we should not have disk pressure | ||||
| 	if manager.IsUnderDiskPressure() { | ||||
| 		t.Errorf("Manager should not report disk pressure") | ||||
| 	} | ||||
|  | ||||
| 	// try to admit our pod (should succeed) | ||||
| 	if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit { | ||||
| 		t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) | ||||
| 	} | ||||
|  | ||||
| 	// induce soft threshold | ||||
| 	fakeClock.Step(1 * time.Minute) | ||||
| 	summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats) | ||||
| 	manager.synchronize(diskInfoProvider, activePodsFunc) | ||||
|  | ||||
| 	// we should have disk pressure | ||||
| 	if !manager.IsUnderDiskPressure() { | ||||
| 		t.Errorf("Manager should report disk pressure since soft threshold was met") | ||||
| 	} | ||||
|  | ||||
| 	// verify no pod has been killed yet because not enough time has passed. | ||||
| 	if podKiller.pod != nil { | ||||
| 		t.Errorf("Manager should not have killed a pod yet, but killed: %v", podKiller.pod) | ||||
| 	} | ||||
|  | ||||
| 	// step forward in time past the grace period | ||||
| 	fakeClock.Step(3 * time.Minute) | ||||
| 	summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats) | ||||
| 	manager.synchronize(diskInfoProvider, activePodsFunc) | ||||
|  | ||||
| 	// we should have disk pressure | ||||
| 	if !manager.IsUnderDiskPressure() { | ||||
| 		t.Errorf("Manager should report disk pressure since soft threshold was met") | ||||
| 	} | ||||
|  | ||||
| 	// verify the right pod was killed with the right grace period. | ||||
| 	if podKiller.pod != pods[0] { | ||||
| 		t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0]) | ||||
| 	} | ||||
| 	if podKiller.gracePeriodOverride == nil { | ||||
| 		t.Errorf("Manager chose to kill pod but should have had a grace period override.") | ||||
| 	} | ||||
| 	observedGracePeriod := *podKiller.gracePeriodOverride | ||||
| 	if observedGracePeriod != manager.config.MaxPodGracePeriodSeconds { | ||||
| 		t.Errorf("Manager chose to kill pod with incorrect grace period.  Expected: %d, actual: %d", manager.config.MaxPodGracePeriodSeconds, observedGracePeriod) | ||||
| 	} | ||||
| 	// reset state | ||||
| 	podKiller.pod = nil | ||||
| 	podKiller.gracePeriodOverride = nil | ||||
|  | ||||
| 	// remove disk pressure | ||||
| 	fakeClock.Step(20 * time.Minute) | ||||
| 	summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats) | ||||
| 	manager.synchronize(diskInfoProvider, activePodsFunc) | ||||
|  | ||||
| 	// we should not have disk pressure | ||||
| 	if manager.IsUnderDiskPressure() { | ||||
| 		t.Errorf("Manager should not report disk pressure") | ||||
| 	} | ||||
|  | ||||
| 	// induce disk pressure! | ||||
| 	fakeClock.Step(1 * time.Minute) | ||||
| 	summaryProvider.result = summaryStatsMaker("0.5Mi", "4Mi", podStats) | ||||
| 	manager.synchronize(diskInfoProvider, activePodsFunc) | ||||
|  | ||||
| 	// we should have disk pressure | ||||
| 	if !manager.IsUnderDiskPressure() { | ||||
| 		t.Errorf("Manager should report disk pressure") | ||||
| 	} | ||||
|  | ||||
| 	// check the right pod was killed | ||||
| 	if podKiller.pod != pods[0] { | ||||
| 		t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0]) | ||||
| 	} | ||||
| 	observedGracePeriod = *podKiller.gracePeriodOverride | ||||
| 	if observedGracePeriod != int64(0) { | ||||
| 		t.Errorf("Manager chose to kill pod with incorrect grace period.  Expected: %d, actual: %d", 0, observedGracePeriod) | ||||
| 	} | ||||
|  | ||||
| 	// try to admit our pod (should fail) | ||||
| 	if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit { | ||||
| 		t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit) | ||||
| 	} | ||||
|  | ||||
| 	// reduce disk pressure | ||||
| 	fakeClock.Step(1 * time.Minute) | ||||
| 	summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats) | ||||
| 	podKiller.pod = nil // reset state | ||||
| 	manager.synchronize(diskInfoProvider, activePodsFunc) | ||||
|  | ||||
| 	// we should have disk pressure (because transition period not yet met) | ||||
| 	if !manager.IsUnderDiskPressure() { | ||||
| 		t.Errorf("Manager should report disk pressure") | ||||
| 	} | ||||
|  | ||||
| 	// no pod should have been killed | ||||
| 	if podKiller.pod != nil { | ||||
| 		t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod) | ||||
| 	} | ||||
|  | ||||
| 	// try to admit our pod (should fail) | ||||
| 	if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit { | ||||
| 		t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit) | ||||
| 	} | ||||
|  | ||||
| 	// move the clock past transition period to ensure that we stop reporting pressure | ||||
| 	fakeClock.Step(5 * time.Minute) | ||||
| 	summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats) | ||||
| 	podKiller.pod = nil // reset state | ||||
| 	manager.synchronize(diskInfoProvider, activePodsFunc) | ||||
|  | ||||
| 	// we should not have disk pressure (because transition period met) | ||||
| 	if manager.IsUnderDiskPressure() { | ||||
| 		t.Errorf("Manager should not report disk pressure") | ||||
| 	} | ||||
|  | ||||
| 	// no pod should have been killed | ||||
| 	if podKiller.pod != nil { | ||||
| 		t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod) | ||||
| 	} | ||||
|  | ||||
| 	// try to admit our pod (should succeed) | ||||
| 	if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit { | ||||
| 		t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) | ||||
| 	} | ||||
| } | ||||
|   | ||||
| @@ -41,10 +41,16 @@ const ( | ||||
| 	message = "The node was low on compute resources." | ||||
| 	// disk, in bytes.  internal to this module, used to account for local disk usage. | ||||
| 	resourceDisk api.ResourceName = "disk" | ||||
| 	// inodes, number. internal to this module, used to account for local disk inode consumption. | ||||
| 	resourceInodes api.ResourceName = "inodes" | ||||
| 	// imagefs, in bytes.  internal to this module, used to account for local image filesystem usage. | ||||
| 	resourceImageFs api.ResourceName = "imagefs" | ||||
| 	// imagefs inodes, number.  internal to this module, used to account for local image filesystem inodes. | ||||
| 	resourceImageFsInodes api.ResourceName = "imagefsInodes" | ||||
| 	// nodefs, in bytes.  internal to this module, used to account for local node root filesystem usage. | ||||
| 	resourceNodeFs api.ResourceName = "nodefs" | ||||
| 	// nodefs inodes, number.  internal to this module, used to account for local node root filesystem inodes. | ||||
| 	resourceNodeFsInodes api.ResourceName = "nodefsInodes" | ||||
| ) | ||||
|  | ||||
| var ( | ||||
| @@ -62,12 +68,16 @@ func init() { | ||||
| 	signalToNodeCondition[SignalMemoryAvailable] = api.NodeMemoryPressure | ||||
| 	signalToNodeCondition[SignalImageFsAvailable] = api.NodeDiskPressure | ||||
| 	signalToNodeCondition[SignalNodeFsAvailable] = api.NodeDiskPressure | ||||
| 	signalToNodeCondition[SignalImageFsInodesFree] = api.NodeDiskPressure | ||||
| 	signalToNodeCondition[SignalNodeFsInodesFree] = api.NodeDiskPressure | ||||
|  | ||||
| 	// map signals to resources (and vice-versa) | ||||
| 	signalToResource = map[Signal]api.ResourceName{} | ||||
| 	signalToResource[SignalMemoryAvailable] = api.ResourceMemory | ||||
| 	signalToResource[SignalImageFsAvailable] = resourceImageFs | ||||
| 	signalToResource[SignalImageFsInodesFree] = resourceImageFsInodes | ||||
| 	signalToResource[SignalNodeFsAvailable] = resourceNodeFs | ||||
| 	signalToResource[SignalNodeFsInodesFree] = resourceNodeFsInodes | ||||
| 	resourceToSignal = map[api.ResourceName]Signal{} | ||||
| 	for key, value := range signalToResource { | ||||
| 		resourceToSignal[value] = key | ||||
| @@ -185,22 +195,21 @@ func parseThresholdStatement(statement string) (Threshold, error) { | ||||
| 				Percentage: percentage, | ||||
| 			}, | ||||
| 		}, nil | ||||
| 	} else { | ||||
| 		quantity, err := resource.ParseQuantity(quantityValue) | ||||
| 		if err != nil { | ||||
| 			return Threshold{}, err | ||||
| 		} | ||||
| 		if quantity.Sign() < 0 || quantity.IsZero() { | ||||
| 			return Threshold{}, fmt.Errorf("eviction threshold %v must be positive: %s", signal, &quantity) | ||||
| 		} | ||||
| 		return Threshold{ | ||||
| 			Signal:   signal, | ||||
| 			Operator: operator, | ||||
| 			Value: ThresholdValue{ | ||||
| 				Quantity: &quantity, | ||||
| 			}, | ||||
| 		}, nil | ||||
| 	} | ||||
| 	quantity, err := resource.ParseQuantity(quantityValue) | ||||
| 	if err != nil { | ||||
| 		return Threshold{}, err | ||||
| 	} | ||||
| 	if quantity.Sign() < 0 || quantity.IsZero() { | ||||
| 		return Threshold{}, fmt.Errorf("eviction threshold %v must be positive: %s", signal, &quantity) | ||||
| 	} | ||||
| 	return Threshold{ | ||||
| 		Signal:   signal, | ||||
| 		Operator: operator, | ||||
| 		Value: ThresholdValue{ | ||||
| 			Quantity: &quantity, | ||||
| 		}, | ||||
| 	}, nil | ||||
| } | ||||
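For reference, the statements this parser consumes come straight from the eviction flags, e.g. `nodefs.inodesFree<1Mi`. Below is a toy, dependency-free split of one such statement; the real function additionally validates the signal name and parses the value with `resource.ParseQuantity` (or as a percentage), as shown above.

```go
package main

import (
	"fmt"
	"strings"
)

// parseStatement is a toy version of parseThresholdStatement: it only splits
// the "<signal><operator><value>" grammar used by the eviction flags and
// performs none of the signal or quantity validation the real parser does.
func parseStatement(statement string) (signal, operator, value string, err error) {
	const lessThan = "<"
	idx := strings.Index(statement, lessThan)
	if idx < 0 {
		return "", "", "", fmt.Errorf("unsupported operator in %q", statement)
	}
	signal, operator, value = statement[:idx], lessThan, statement[idx+len(lessThan):]
	if signal == "" || value == "" {
		return "", "", "", fmt.Errorf("invalid statement %q", statement)
	}
	return signal, operator, value, nil
}

func main() {
	sig, op, val, err := parseStatement("nodefs.inodesFree<1Mi")
	fmt.Println(sig, op, val, err) // nodefs.inodesFree < 1Mi <nil>
}
```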
|  | ||||
| // parsePercentage parses a string representing a percentage value | ||||
| @@ -287,6 +296,18 @@ func diskUsage(fsStats *statsapi.FsStats) *resource.Quantity { | ||||
| 	return resource.NewQuantity(usage, resource.BinarySI) | ||||
| } | ||||
|  | ||||
| // inodeUsage converts inodes consumed into a resource quantity. | ||||
| func inodeUsage(fsStats *statsapi.FsStats) *resource.Quantity { | ||||
| 	// TODO: cadvisor needs to support inodes used per container | ||||
| 	// right now, cadvisor reports total inodes and inodes free per filesystem. | ||||
| 	// this is insufficient to know how many inodes are consumed by the container. | ||||
| 	// for example, with the overlay driver, the rootfs and each container filesystem | ||||
| 	// will report the same total inode and inode free values but no way of knowing | ||||
| 	// how many inodes consumed in that filesystem are charged to this container. | ||||
| 	// for now, we report 0 as inode usage pending support in cadvisor. | ||||
| 	return resource.NewQuantity(int64(0), resource.BinarySI) | ||||
| } | ||||
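If cadvisor later reports per-container inode consumption (the issue linked in the docs above), this helper could return a real value instead of zero. A hypothetical sketch follows, using a local stand-in type with an assumed `InodesUsed` field; that field does not exist in the current stats API.

```go
package main

import "fmt"

// fsStats is a local stand-in for statsapi.FsStats; InodesUsed is an
// assumed future per-container counter, not a field that exists today.
type fsStats struct {
	InodesUsed *uint64
}

// inodeUsage returns the inodes charged to this container's filesystem,
// falling back to 0 when the counter is unavailable (today's behavior).
func inodeUsage(fs *fsStats) uint64 {
	if fs == nil || fs.InodesUsed == nil {
		return 0
	}
	return *fs.InodesUsed
}

func main() {
	used := uint64(4096)
	fmt.Println(inodeUsage(&fsStats{InodesUsed: &used})) // 4096
	fmt.Println(inodeUsage(nil))                         // 0
}
```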
|  | ||||
| // memoryUsage converts working set into a resource quantity. | ||||
| func memoryUsage(memStats *statsapi.MemoryStats) *resource.Quantity { | ||||
| 	if memStats == nil || memStats.WorkingSetBytes == nil { | ||||
| @@ -311,15 +332,18 @@ func localVolumeNames(pod *api.Pod) []string { | ||||
| 	return result | ||||
| } | ||||
|  | ||||
| // podDiskUsage aggregates pod disk usage for the specified stats to measure. | ||||
| // podDiskUsage aggregates pod disk usage and inode consumption for the specified stats to measure. | ||||
| func podDiskUsage(podStats statsapi.PodStats, pod *api.Pod, statsToMeasure []fsStatsType) (api.ResourceList, error) { | ||||
| 	disk := resource.Quantity{Format: resource.BinarySI} | ||||
| 	inodes := resource.Quantity{Format: resource.BinarySI} | ||||
| 	for _, container := range podStats.Containers { | ||||
| 		if hasFsStatsType(statsToMeasure, fsStatsRoot) { | ||||
| 			disk.Add(*diskUsage(container.Rootfs)) | ||||
| 			inodes.Add(*inodeUsage(container.Rootfs)) | ||||
| 		} | ||||
| 		if hasFsStatsType(statsToMeasure, fsStatsLogs) { | ||||
| 			disk.Add(*diskUsage(container.Logs)) | ||||
| 			inodes.Add(*inodeUsage(container.Logs)) | ||||
| 		} | ||||
| 	} | ||||
| 	if hasFsStatsType(statsToMeasure, fsStatsLocalVolumeSource) { | ||||
| @@ -328,13 +352,15 @@ func podDiskUsage(podStats statsapi.PodStats, pod *api.Pod, statsToMeasure []fsS | ||||
| 			for _, volumeStats := range podStats.VolumeStats { | ||||
| 				if volumeStats.Name == volumeName { | ||||
| 					disk.Add(*diskUsage(&volumeStats.FsStats)) | ||||
| 					inodes.Add(*inodeUsage(&volumeStats.FsStats)) | ||||
| 					break | ||||
| 				} | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 	return api.ResourceList{ | ||||
| 		resourceDisk: disk, | ||||
| 		resourceDisk:   disk, | ||||
| 		resourceInodes: inodes, | ||||
| 	}, nil | ||||
| } | ||||
|  | ||||
| @@ -502,8 +528,8 @@ func memory(stats statsFunc) cmpFunc { | ||||
| 	} | ||||
| } | ||||
|  | ||||
| // disk compares pods by largest consumer of disk relative to request. | ||||
| func disk(stats statsFunc, fsStatsToMeasure []fsStatsType) cmpFunc { | ||||
| // disk compares pods by largest consumer of disk relative to request for the specified disk resource. | ||||
| func disk(stats statsFunc, fsStatsToMeasure []fsStatsType, diskResource api.ResourceName) cmpFunc { | ||||
| 	return func(p1, p2 *api.Pod) int { | ||||
| 		p1Stats, found := stats(p1) | ||||
| 		// if we have no usage stats for p1, we want p2 first | ||||
| @@ -528,8 +554,8 @@ func disk(stats statsFunc, fsStatsToMeasure []fsStatsType) cmpFunc { | ||||
|  | ||||
| 		// disk is best effort, so we don't measure relative to a request. | ||||
| 		// TODO: add disk as a guaranteed resource | ||||
| 		p1Disk := p1Usage[resourceDisk] | ||||
| 		p2Disk := p2Usage[resourceDisk] | ||||
| 		p1Disk := p1Usage[diskResource] | ||||
| 		p2Disk := p2Usage[diskResource] | ||||
| 		// if p2 is using more than p1, we want p2 first | ||||
| 		return p2Disk.Cmp(p1Disk) | ||||
| 	} | ||||
| @@ -541,9 +567,9 @@ func rankMemoryPressure(pods []*api.Pod, stats statsFunc) { | ||||
| } | ||||
|  | ||||
| // rankDiskPressureFunc returns a rankFunc that measures the specified fs stats. | ||||
| func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType) rankFunc { | ||||
| func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType, diskResource api.ResourceName) rankFunc { | ||||
| 	return func(pods []*api.Pod, stats statsFunc) { | ||||
| 		orderedBy(qosComparator, disk(stats, fsStatsToMeasure)).Sort(pods) | ||||
| 		orderedBy(qosComparator, disk(stats, fsStatsToMeasure, diskResource)).Sort(pods) | ||||
| 	} | ||||
| } | ||||
|  | ||||
| @@ -564,6 +590,7 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObserv | ||||
| 	if err != nil { | ||||
| 		return nil, nil, err | ||||
| 	} | ||||
|  | ||||
| 	// build the function to work against for pod stats | ||||
| 	statsFunc := cachedStatsFunc(summary.Pods) | ||||
| 	// build an evaluation context for current eviction signals | ||||
| @@ -575,17 +602,33 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObserv | ||||
| 			capacity:  resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI), | ||||
| 		} | ||||
| 	} | ||||
| 	if nodeFs := summary.Node.Fs; nodeFs != nil && nodeFs.AvailableBytes != nil && nodeFs.CapacityBytes != nil { | ||||
| 		result[SignalNodeFsAvailable] = signalObservation{ | ||||
| 			available: resource.NewQuantity(int64(*nodeFs.AvailableBytes), resource.BinarySI), | ||||
| 			capacity:  resource.NewQuantity(int64(*nodeFs.CapacityBytes), resource.BinarySI), | ||||
| 	if nodeFs := summary.Node.Fs; nodeFs != nil { | ||||
| 		if nodeFs.AvailableBytes != nil && nodeFs.CapacityBytes != nil { | ||||
| 			result[SignalNodeFsAvailable] = signalObservation{ | ||||
| 				available: resource.NewQuantity(int64(*nodeFs.AvailableBytes), resource.BinarySI), | ||||
| 				capacity:  resource.NewQuantity(int64(*nodeFs.CapacityBytes), resource.BinarySI), | ||||
| 			} | ||||
| 		} | ||||
| 		if nodeFs.InodesFree != nil && nodeFs.Inodes != nil { | ||||
| 			result[SignalNodeFsInodesFree] = signalObservation{ | ||||
| 				available: resource.NewQuantity(int64(*nodeFs.InodesFree), resource.BinarySI), | ||||
| 				capacity:  resource.NewQuantity(int64(*nodeFs.Inodes), resource.BinarySI), | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 	if summary.Node.Runtime != nil { | ||||
| 		if imageFs := summary.Node.Runtime.ImageFs; imageFs != nil && imageFs.AvailableBytes != nil && imageFs.CapacityBytes != nil { | ||||
| 			result[SignalImageFsAvailable] = signalObservation{ | ||||
| 				available: resource.NewQuantity(int64(*imageFs.AvailableBytes), resource.BinarySI), | ||||
| 				capacity:  resource.NewQuantity(int64(*imageFs.CapacityBytes), resource.BinarySI), | ||||
| 		if imageFs := summary.Node.Runtime.ImageFs; imageFs != nil { | ||||
| 			if imageFs.AvailableBytes != nil && imageFs.CapacityBytes != nil { | ||||
| 				result[SignalImageFsAvailable] = signalObservation{ | ||||
| 					available: resource.NewQuantity(int64(*imageFs.AvailableBytes), resource.BinarySI), | ||||
| 					capacity:  resource.NewQuantity(int64(*imageFs.CapacityBytes), resource.BinarySI), | ||||
| 				} | ||||
| 				if imageFs.InodesFree != nil && imageFs.Inodes != nil { | ||||
| 					result[SignalImageFsInodesFree] = signalObservation{ | ||||
| 						available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.BinarySI), | ||||
| 						capacity:  resource.NewQuantity(int64(*imageFs.Inodes), resource.BinarySI), | ||||
| 					} | ||||
| 				} | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| @@ -785,16 +828,20 @@ func buildResourceToRankFunc(withImageFs bool) map[api.ResourceName]rankFunc { | ||||
| 	// usage of an imagefs is optional | ||||
| 	if withImageFs { | ||||
| 		// with an imagefs, nodefs pod rank func for eviction only includes logs and local volumes | ||||
| 		resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}) | ||||
| 		resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk) | ||||
| 		resourceToRankFunc[resourceNodeFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes) | ||||
| 		// with an imagefs, imagefs pod rank func for eviction only includes rootfs | ||||
| 		resourceToRankFunc[resourceImageFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}) | ||||
| 		resourceToRankFunc[resourceImageFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceDisk) | ||||
| 		resourceToRankFunc[resourceImageFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceInodes) | ||||
| 	} else { | ||||
| 		// without an imagefs, nodefs pod rank func for eviction looks at all fs stats | ||||
| 		resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}) | ||||
| 		resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk) | ||||
| 		resourceToRankFunc[resourceNodeFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes) | ||||
| 	} | ||||
| 	return resourceToRankFunc | ||||
| } | ||||
|  | ||||
| // PodIsEvicted returns true if the reported pod status is due to an eviction. | ||||
| func PodIsEvicted(podStatus api.PodStatus) bool { | ||||
| 	return podStatus.Phase == api.PodFailed && podStatus.Reason == reason | ||||
| } | ||||
| @@ -806,11 +853,14 @@ func buildResourceToNodeReclaimFuncs(imageGC ImageGC, withImageFs bool) map[api. | ||||
| 	if withImageFs { | ||||
| 		// with an imagefs, nodefs pressure should just delete logs | ||||
| 		resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs()} | ||||
| 		resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{deleteLogs()} | ||||
| 		// with an imagefs, imagefs pressure should delete unused images | ||||
| 		resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteImages(imageGC)} | ||||
| 		resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteImages(imageGC, true)} | ||||
| 		resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{deleteImages(imageGC, false)} | ||||
| 	} else { | ||||
| 		// without an imagefs, nodefs pressure should delete logs, and unused images | ||||
| 		resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC)} | ||||
| 		resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, true)} | ||||
| 		resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, false)} | ||||
| 	} | ||||
| 	return resourceToReclaimFunc | ||||
| } | ||||
| @@ -824,13 +874,17 @@ func deleteLogs() nodeReclaimFunc { | ||||
| } | ||||
|  | ||||
| // deleteImages will delete unused images to free up disk pressure. | ||||
| func deleteImages(imageGC ImageGC) nodeReclaimFunc { | ||||
| func deleteImages(imageGC ImageGC, reportBytesFreed bool) nodeReclaimFunc { | ||||
| 	return func() (*resource.Quantity, error) { | ||||
| 		glog.Infof("eviction manager: attempting to delete unused images") | ||||
| 		reclaimed, err := imageGC.DeleteUnusedImages() | ||||
| 		bytesFreed, err := imageGC.DeleteUnusedImages() | ||||
| 		if err != nil { | ||||
| 			return nil, err | ||||
| 		} | ||||
| 		reclaimed := int64(0) | ||||
| 		if reportBytesFreed { | ||||
| 			reclaimed = bytesFreed | ||||
| 		} | ||||
| 		return resource.NewQuantity(reclaimed, resource.BinarySI), nil | ||||
| 	} | ||||
| } | ||||
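The new reportBytesFreed flag exists because reclaim progress is accounted in the units of the threshold's resource: bytes freed by image GC count toward a disk-space threshold, but say nothing about how many inodes were released, so the inode variants report zero and let the next stats observation decide whether pressure persists. A standalone sketch of that behavior follows; the `imageGC` interface here is only inferred from the `DeleteUnusedImages` call in this diff, not the kubelet's actual definition.

```go
package main

import "fmt"

// imageGC is inferred from the DeleteUnusedImages call in this diff; it is
// not the kubelet's actual interface definition.
type imageGC interface {
	DeleteUnusedImages() (int64, error)
}

type fakeImageGC struct{ freed int64 }

func (f *fakeImageGC) DeleteUnusedImages() (int64, error) { return f.freed, nil }

// deleteImages mirrors the shape of the reclaim function: image GC always
// runs, but the freed bytes are only reported as reclaimed resource when
// the threshold being serviced is measured in bytes.
func deleteImages(gc imageGC, reportBytesFreed bool) (int64, error) {
	bytesFreed, err := gc.DeleteUnusedImages()
	if err != nil {
		return 0, err
	}
	if !reportBytesFreed {
		return 0, nil // inode thresholds: freed bytes are not meaningful progress
	}
	return bytesFreed, nil
}

func main() {
	gc := &fakeImageGC{freed: 1 << 30}
	forDiskSpace, _ := deleteImages(gc, true)
	forInodes, _ := deleteImages(gc, false)
	fmt.Println(forDiskSpace, forInodes) // 1073741824 0
}
```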
|   | ||||
| @@ -191,6 +191,49 @@ func TestParseThresholdConfig(t *testing.T) { | ||||
| 				}, | ||||
| 			}, | ||||
| 		}, | ||||
| 		"inode flag values": { | ||||
| 			evictionHard:            "imagefs.inodesFree<150Mi,nodefs.inodesFree<100Mi", | ||||
| 			evictionSoft:            "imagefs.inodesFree<300Mi,nodefs.inodesFree<200Mi", | ||||
| 			evictionSoftGracePeriod: "imagefs.inodesFree=30s,nodefs.inodesFree=30s", | ||||
| 			evictionMinReclaim:      "imagefs.inodesFree=2Gi,nodefs.inodesFree=1Gi", | ||||
| 			expectErr:               false, | ||||
| 			expectThresholds: []Threshold{ | ||||
| 				{ | ||||
| 					Signal:   SignalImageFsInodesFree, | ||||
| 					Operator: OpLessThan, | ||||
| 					Value: ThresholdValue{ | ||||
| 						Quantity: quantityMustParse("150Mi"), | ||||
| 					}, | ||||
| 					MinReclaim: quantityMustParse("2Gi"), | ||||
| 				}, | ||||
| 				{ | ||||
| 					Signal:   SignalNodeFsInodesFree, | ||||
| 					Operator: OpLessThan, | ||||
| 					Value: ThresholdValue{ | ||||
| 						Quantity: quantityMustParse("100Mi"), | ||||
| 					}, | ||||
| 					MinReclaim: quantityMustParse("1Gi"), | ||||
| 				}, | ||||
| 				{ | ||||
| 					Signal:   SignalImageFsInodesFree, | ||||
| 					Operator: OpLessThan, | ||||
| 					Value: ThresholdValue{ | ||||
| 						Quantity: quantityMustParse("300Mi"), | ||||
| 					}, | ||||
| 					GracePeriod: gracePeriod, | ||||
| 					MinReclaim:  quantityMustParse("2Gi"), | ||||
| 				}, | ||||
| 				{ | ||||
| 					Signal:   SignalNodeFsInodesFree, | ||||
| 					Operator: OpLessThan, | ||||
| 					Value: ThresholdValue{ | ||||
| 						Quantity: quantityMustParse("200Mi"), | ||||
| 					}, | ||||
| 					GracePeriod: gracePeriod, | ||||
| 					MinReclaim:  quantityMustParse("1Gi"), | ||||
| 				}, | ||||
| 			}, | ||||
| 		}, | ||||
| 		"invalid-signal": { | ||||
| 			evictionHard:            "mem.available<150Mi", | ||||
| 			evictionSoft:            "", | ||||
| @@ -400,7 +443,7 @@ func TestOrderedByDisk(t *testing.T) { | ||||
| 		return result, found | ||||
| 	} | ||||
| 	pods := []*api.Pod{pod1, pod2, pod3, pod4, pod5, pod6} | ||||
| 	orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource})).Sort(pods) | ||||
| 	orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)).Sort(pods) | ||||
| 	expected := []*api.Pod{pod6, pod5, pod4, pod3, pod2, pod1} | ||||
| 	for i := range expected { | ||||
| 		if pods[i] != expected[i] { | ||||
| @@ -466,7 +509,7 @@ func TestOrderedByQoSDisk(t *testing.T) { | ||||
| 		return result, found | ||||
| 	} | ||||
| 	pods := []*api.Pod{pod1, pod2, pod3, pod4, pod5, pod6} | ||||
| 	orderedBy(qosComparator, disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource})).Sort(pods) | ||||
| 	orderedBy(qosComparator, disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)).Sort(pods) | ||||
| 	expected := []*api.Pod{pod2, pod1, pod4, pod3, pod6, pod5} | ||||
| 	for i := range expected { | ||||
| 		if pods[i] != expected[i] { | ||||
| @@ -608,6 +651,10 @@ func TestMakeSignalObservations(t *testing.T) { | ||||
| 	imageFsCapacityBytes := uint64(1024 * 1024 * 2) | ||||
| 	nodeFsAvailableBytes := uint64(1024) | ||||
| 	nodeFsCapacityBytes := uint64(1024 * 2) | ||||
| 	imageFsInodesFree := uint64(1024) | ||||
| 	imageFsInodes := uint64(1024 * 1024) | ||||
| 	nodeFsInodesFree := uint64(1024) | ||||
| 	nodeFsInodes := uint64(1024 * 1024) | ||||
| 	fakeStats := &statsapi.Summary{ | ||||
| 		Node: statsapi.NodeStats{ | ||||
| 			Memory: &statsapi.MemoryStats{ | ||||
| @@ -618,11 +665,15 @@ func TestMakeSignalObservations(t *testing.T) { | ||||
| 				ImageFs: &statsapi.FsStats{ | ||||
| 					AvailableBytes: &imageFsAvailableBytes, | ||||
| 					CapacityBytes:  &imageFsCapacityBytes, | ||||
| 					InodesFree:     &imageFsInodesFree, | ||||
| 					Inodes:         &imageFsInodes, | ||||
| 				}, | ||||
| 			}, | ||||
| 			Fs: &statsapi.FsStats{ | ||||
| 				AvailableBytes: &nodeFsAvailableBytes, | ||||
| 				CapacityBytes:  &nodeFsCapacityBytes, | ||||
| 				InodesFree:     &nodeFsInodesFree, | ||||
| 				Inodes:         &nodeFsInodes, | ||||
| 			}, | ||||
| 		}, | ||||
| 		Pods: []statsapi.PodStats{}, | ||||
| @@ -664,6 +715,16 @@ func TestMakeSignalObservations(t *testing.T) { | ||||
| 	if expectedBytes := int64(nodeFsCapacityBytes); nodeFsQuantity.capacity.Value() != expectedBytes { | ||||
| 		t.Errorf("Expected %v, actual: %v", expectedBytes, nodeFsQuantity.capacity.Value()) | ||||
| 	} | ||||
| 	nodeFsInodesQuantity, found := actualObservations[SignalNodeFsInodesFree] | ||||
| 	if !found { | ||||
| 		t.Errorf("Expected inodes free nodefs observation: %v", err) | ||||
| 	} | ||||
| 	if expected := int64(nodeFsInodesFree); nodeFsInodesQuantity.available.Value() != expected { | ||||
| 		t.Errorf("Expected %v, actual: %v", expected, nodeFsInodesQuantity.available.Value()) | ||||
| 	} | ||||
| 	if expected := int64(nodeFsInodes); nodeFsInodesQuantity.capacity.Value() != expected { | ||||
| 		t.Errorf("Expected %v, actual: %v", expected, nodeFsInodesQuantity.capacity.Value()) | ||||
| 	} | ||||
| 	imageFsQuantity, found := actualObservations[SignalImageFsAvailable] | ||||
| 	if !found { | ||||
| 		t.Errorf("Expected available imagefs observation: %v", err) | ||||
| @@ -674,6 +735,16 @@ func TestMakeSignalObservations(t *testing.T) { | ||||
| 	if expectedBytes := int64(imageFsCapacityBytes); imageFsQuantity.capacity.Value() != expectedBytes { | ||||
| 		t.Errorf("Expected %v, actual: %v", expectedBytes, imageFsQuantity.capacity.Value()) | ||||
| 	} | ||||
| 	imageFsInodesQuantity, found := actualObservations[SignalImageFsInodesFree] | ||||
| 	if !found { | ||||
| 		t.Errorf("Expected inodes free imagefs observation: %v", err) | ||||
| 	} | ||||
| 	if expected := int64(imageFsInodesFree); imageFsInodesQuantity.available.Value() != expected { | ||||
| 		t.Errorf("Expected %v, actual: %v", expected, imageFsInodesQuantity.available.Value()) | ||||
| 	} | ||||
| 	if expected := int64(imageFsInodes); imageFsInodesQuantity.capacity.Value() != expected { | ||||
| 		t.Errorf("Expected %v, actual: %v", expected, imageFsInodesQuantity.capacity.Value()) | ||||
| 	} | ||||
| 	for _, pod := range pods { | ||||
| 		podStats, found := statsFunc(pod) | ||||
| 		if !found { | ||||
| @@ -1204,6 +1275,22 @@ func testCompareThresholdValue(t *testing.T) { | ||||
| 	} | ||||
| } | ||||
|  | ||||
| // newPodInodeStats returns stats with specified usage amounts. | ||||
| // TODO: in future, this should take a value for inodesUsed per container. | ||||
| func newPodInodeStats(pod *api.Pod) statsapi.PodStats { | ||||
| 	result := statsapi.PodStats{ | ||||
| 		PodRef: statsapi.PodReference{ | ||||
| 			Name: pod.Name, Namespace: pod.Namespace, UID: string(pod.UID), | ||||
| 		}, | ||||
| 	} | ||||
| 	for range pod.Spec.Containers { | ||||
| 		result.Containers = append(result.Containers, statsapi.ContainerStats{ | ||||
| 			Rootfs: &statsapi.FsStats{}, | ||||
| 		}) | ||||
| 	} | ||||
| 	return result | ||||
| } | ||||
|  | ||||
| // newPodDiskStats returns stats with specified usage amounts. | ||||
| func newPodDiskStats(pod *api.Pod, rootFsUsed, logsUsed, perLocalVolumeUsed resource.Quantity) statsapi.PodStats { | ||||
| 	result := statsapi.PodStats{ | ||||
|   | ||||
| @@ -32,8 +32,12 @@ const ( | ||||
| 	SignalMemoryAvailable Signal = "memory.available" | ||||
| 	// SignalNodeFsAvailable is amount of storage available on filesystem that kubelet uses for volumes, daemon logs, etc. | ||||
| 	SignalNodeFsAvailable Signal = "nodefs.available" | ||||
| 	// SignalNodeFsInodesFree is amount of inodes available on filesystem that kubelet uses for volumes, daemon logs, etc. | ||||
| 	SignalNodeFsInodesFree Signal = "nodefs.inodesFree" | ||||
| 	// SignalImageFsAvailable is amount of storage available on filesystem that container runtime uses for storing images and container writable layers. | ||||
| 	SignalImageFsAvailable Signal = "imagefs.available" | ||||
| 	// SignalImageFsInodesFree is amount of inodes available on filesystem that container runtime uses for storing images and container writable layers. | ||||
| 	SignalImageFsInodesFree Signal = "imagefs.inodesFree" | ||||
| ) | ||||
|  | ||||
| // fsStatsType defines the types of filesystem stats to collect. | ||||
|   | ||||