vendor: bump runc to v1.2.1

For one thing, this release decouples device management from libcontainer/cgroups. The result is visible here as the dropped cilium/ebpf dependency (which is only needed for device management). NOTE that due to an issue with go mod / go list, github.com/opencontainers/runc had to be added to hack/unwanted-dependencies.json under x/exp. This entry is bogus because opencontainers/runc does not use x/exp directly, only via the cilium/ebpf dependency (which is not vendored here).

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2025-12-07 08:35:35 +00:00 · 2024-11-01 17:26:09 -07:00
parent 6bc0768c62
commit 1d4f88b1ee
187 changed files with 2948 additions and 25845 deletions
--- a/vendor/github.com/opencontainers/runc/NOTICE
+++ b/vendor/github.com/opencontainers/runc/NOTICE
@@ -8,9 +8,9 @@ The following is courtesy of our legal counsel:


 Use and transfer of Docker may be subject to certain restrictions by the
-United States and other governments.  
+United States and other governments.
 It is your responsibility to ensure that your use and/or transfer does not
-violate applicable laws. 
+violate applicable laws.

 For more information, please see http://www.bis.doc.gov

--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go
@@ -1,9 +1,30 @@
 package cgroups

 import (
+	"errors"
+
 	"github.com/opencontainers/runc/libcontainer/configs"
 )

+var (
+	// ErrDevicesUnsupported is an error returned when a cgroup manager
+	// is not configured to set device rules.
+	ErrDevicesUnsupported = errors.New("cgroup manager is not configured to set device rules")
+
+	// ErrRootless is returned by [Manager.Apply] when there is an error
+	// creating cgroup directory, and cgroup.Rootless is set. In general,
+	// this error is to be ignored.
+	ErrRootless = errors.New("cgroup manager can not access cgroup (rootless container)")
+
+	// DevicesSetV1 and DevicesSetV2 are functions to set devices for
+	// cgroup v1 and v2, respectively. Unless
+	// [github.com/opencontainers/runc/libcontainer/cgroups/devices]
+	// package is imported, it is set to nil, so cgroup managers can't
+	// manage devices.
+	DevicesSetV1 func(path string, r *configs.Resources) error
+	DevicesSetV2 func(path string, r *configs.Resources) error
+)
+
 type Manager interface {
 	// Apply creates a cgroup, if not yet created, and adds a process
 	// with the specified pid into that cgroup.  A special value of -1
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices/devices_emulator.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices/devices_emulator.go
@@ -1,386 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-/*
- * Copyright (C) 2020 Aleksa Sarai <cyphar@cyphar.com>
- * Copyright (C) 2020 SUSE LLC
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package devices
-
-import (
-	"bufio"
-	"fmt"
-	"io"
-	"sort"
-	"strconv"
-	"strings"
-
-	"github.com/opencontainers/runc/libcontainer/devices"
-)
-
-// deviceMeta is a Rule without the Allow or Permissions fields, and no
-// wildcard-type support. It's effectively the "match" portion of a metadata
-// rule, for the purposes of our emulation.
-type deviceMeta struct {
-	node  devices.Type
-	major int64
-	minor int64
-}
-
-// deviceRule is effectively the tuple (deviceMeta, Permissions).
-type deviceRule struct {
-	meta  deviceMeta
-	perms devices.Permissions
-}
-
-// deviceRules is a mapping of device metadata rules to the associated
-// permissions in the ruleset.
-type deviceRules map[deviceMeta]devices.Permissions
-
-func (r deviceRules) orderedEntries() []deviceRule {
-	var rules []deviceRule
-	for meta, perms := range r {
-		rules = append(rules, deviceRule{meta: meta, perms: perms})
-	}
-	sort.Slice(rules, func(i, j int) bool {
-		// Sort by (major, minor, type).
-		a, b := rules[i].meta, rules[j].meta
-		return a.major < b.major ||
-			(a.major == b.major && a.minor < b.minor) ||
-			(a.major == b.major && a.minor == b.minor && a.node < b.node)
-	})
-	return rules
-}
-
-type Emulator struct {
-	defaultAllow bool
-	rules        deviceRules
-}
-
-func (e *Emulator) IsBlacklist() bool {
-	return e.defaultAllow
-}
-
-func (e *Emulator) IsAllowAll() bool {
-	return e.IsBlacklist() && len(e.rules) == 0
-}
-
-func parseLine(line string) (*deviceRule, error) {
-	// Input: node major:minor perms.
-	fields := strings.FieldsFunc(line, func(r rune) bool {
-		return r == ' ' || r == ':'
-	})
-	if len(fields) != 4 {
-		return nil, fmt.Errorf("malformed devices.list rule %s", line)
-	}
-
-	var (
-		rule  deviceRule
-		node  = fields[0]
-		major = fields[1]
-		minor = fields[2]
-		perms = fields[3]
-	)
-
-	// Parse the node type.
-	switch node {
-	case "a":
-		// Super-special case -- "a" always means every device with every
-		// access mode. In fact, for devices.list this actually indicates that
-		// the cgroup is in black-list mode.
-		// TODO: Double-check that the entire file is "a *:* rwm".
-		return nil, nil
-	case "b":
-		rule.meta.node = devices.BlockDevice
-	case "c":
-		rule.meta.node = devices.CharDevice
-	default:
-		return nil, fmt.Errorf("unknown device type %q", node)
-	}
-
-	// Parse the major number.
-	if major == "*" {
-		rule.meta.major = devices.Wildcard
-	} else {
-		val, err := strconv.ParseUint(major, 10, 32)
-		if err != nil {
-			return nil, fmt.Errorf("invalid major number: %w", err)
-		}
-		rule.meta.major = int64(val)
-	}
-
-	// Parse the minor number.
-	if minor == "*" {
-		rule.meta.minor = devices.Wildcard
-	} else {
-		val, err := strconv.ParseUint(minor, 10, 32)
-		if err != nil {
-			return nil, fmt.Errorf("invalid minor number: %w", err)
-		}
-		rule.meta.minor = int64(val)
-	}
-
-	// Parse the access permissions.
-	rule.perms = devices.Permissions(perms)
-	if !rule.perms.IsValid() || rule.perms.IsEmpty() {
-		return nil, fmt.Errorf("parse access mode: contained unknown modes or is empty: %q", perms)
-	}
-	return &rule, nil
-}
-
-func (e *Emulator) addRule(rule deviceRule) error { //nolint:unparam
-	if e.rules == nil {
-		e.rules = make(map[deviceMeta]devices.Permissions)
-	}
-
-	// Merge with any pre-existing permissions.
-	oldPerms := e.rules[rule.meta]
-	newPerms := rule.perms.Union(oldPerms)
-	e.rules[rule.meta] = newPerms
-	return nil
-}
-
-func (e *Emulator) rmRule(rule deviceRule) error {
-	// Give an error if any of the permissions requested to be removed are
-	// present in a partially-matching wildcard rule, because such rules will
-	// be ignored by cgroupv1.
-	//
-	// This is a diversion from cgroupv1, but is necessary to avoid leading
-	// users into a false sense of security. cgroupv1 will silently(!) ignore
-	// requests to remove partial exceptions, but we really shouldn't do that.
-	//
-	// It may seem like we could just "split" wildcard rules which hit this
-	// issue, but unfortunately there are 2^32 possible major and minor
-	// numbers, which would exhaust kernel memory quickly if we did this. Not
-	// to mention it'd be really slow (the kernel side is implemented as a
-	// linked-list of exceptions).
-	for _, partialMeta := range []deviceMeta{
-		{node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor},
-		{node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard},
-		{node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard},
-	} {
-		// This wildcard rule is equivalent to the requested rule, so skip it.
-		if rule.meta == partialMeta {
-			continue
-		}
-		// Only give an error if the set of permissions overlap.
-		partialPerms := e.rules[partialMeta]
-		if !partialPerms.Intersection(rule.perms).IsEmpty() {
-			return fmt.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms)
-		}
-	}
-
-	// Subtract all of the permissions listed from the full match rule. If the
-	// rule didn't exist, all of this is a no-op.
-	newPerms := e.rules[rule.meta].Difference(rule.perms)
-	if newPerms.IsEmpty() {
-		delete(e.rules, rule.meta)
-	} else {
-		e.rules[rule.meta] = newPerms
-	}
-	// TODO: The actual cgroup code doesn't care if an exception didn't exist
-	//       during removal, so not erroring out here is /accurate/ but quite
-	//       worrying. Maybe we should do additional validation, but again we
-	//       have to worry about backwards-compatibility.
-	return nil
-}
-
-func (e *Emulator) allow(rule *deviceRule) error {
-	// This cgroup is configured as a black-list. Reset the entire emulator,
-	// and put is into black-list mode.
-	if rule == nil || rule.meta.node == devices.WildcardDevice {
-		*e = Emulator{
-			defaultAllow: true,
-			rules:        nil,
-		}
-		return nil
-	}
-
-	var err error
-	if e.defaultAllow {
-		err = wrapErr(e.rmRule(*rule), "unable to remove 'deny' exception")
-	} else {
-		err = wrapErr(e.addRule(*rule), "unable to add 'allow' exception")
-	}
-	return err
-}
-
-func (e *Emulator) deny(rule *deviceRule) error {
-	// This cgroup is configured as a white-list. Reset the entire emulator,
-	// and put is into white-list mode.
-	if rule == nil || rule.meta.node == devices.WildcardDevice {
-		*e = Emulator{
-			defaultAllow: false,
-			rules:        nil,
-		}
-		return nil
-	}
-
-	var err error
-	if e.defaultAllow {
-		err = wrapErr(e.addRule(*rule), "unable to add 'deny' exception")
-	} else {
-		err = wrapErr(e.rmRule(*rule), "unable to remove 'allow' exception")
-	}
-	return err
-}
-
-func (e *Emulator) Apply(rule devices.Rule) error {
-	if !rule.Type.CanCgroup() {
-		return fmt.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type)
-	}
-
-	innerRule := &deviceRule{
-		meta: deviceMeta{
-			node:  rule.Type,
-			major: rule.Major,
-			minor: rule.Minor,
-		},
-		perms: rule.Permissions,
-	}
-	if innerRule.meta.node == devices.WildcardDevice {
-		innerRule = nil
-	}
-
-	if rule.Allow {
-		return e.allow(innerRule)
-	}
-
-	return e.deny(innerRule)
-}
-
-// EmulatorFromList takes a reader to a "devices.list"-like source, and returns
-// a new Emulator that represents the state of the devices cgroup. Note that
-// black-list devices cgroups cannot be fully reconstructed, due to limitations
-// in the devices cgroup API. Instead, such cgroups are always treated as
-// "allow all" cgroups.
-func EmulatorFromList(list io.Reader) (*Emulator, error) {
-	// Normally cgroups are in black-list mode by default, but the way we
-	// figure out the current mode is whether or not devices.list has an
-	// allow-all rule. So we default to a white-list, and the existence of an
-	// "a *:* rwm" entry will tell us otherwise.
-	e := &Emulator{
-		defaultAllow: false,
-	}
-
-	// Parse the "devices.list".
-	s := bufio.NewScanner(list)
-	for s.Scan() {
-		line := s.Text()
-		deviceRule, err := parseLine(line)
-		if err != nil {
-			return nil, fmt.Errorf("error parsing line %q: %w", line, err)
-		}
-		// "devices.list" is an allow list. Note that this means that in
-		// black-list mode, we have no idea what rules are in play. As a
-		// result, we need to be very careful in Transition().
-		if err := e.allow(deviceRule); err != nil {
-			return nil, fmt.Errorf("error adding devices.list rule: %w", err)
-		}
-	}
-	if err := s.Err(); err != nil {
-		return nil, fmt.Errorf("error reading devices.list lines: %w", err)
-	}
-	return e, nil
-}
-
-// Transition calculates what is the minimally-disruptive set of rules need to
-// be applied to a devices cgroup in order to transition to the given target.
-// This means that any already-existing rules will not be applied, and
-// disruptive rules (like denying all device access) will only be applied if
-// necessary.
-//
-// This function is the sole reason for all of Emulator -- to allow us
-// to figure out how to update a containers' cgroups without causing spurious
-// device errors (if possible).
-func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) {
-	var transitionRules []*devices.Rule
-	oldRules := source.rules
-
-	// If the default policy doesn't match, we need to include a "disruptive"
-	// rule (either allow-all or deny-all) in order to switch the cgroup to the
-	// correct default policy.
-	//
-	// However, due to a limitation in "devices.list" we cannot be sure what
-	// deny rules are in place in a black-list cgroup. Thus if the source is a
-	// black-list we also have to include a disruptive rule.
-	if source.IsBlacklist() || source.defaultAllow != target.defaultAllow {
-		transitionRules = append(transitionRules, &devices.Rule{
-			Type:        'a',
-			Major:       -1,
-			Minor:       -1,
-			Permissions: devices.Permissions("rwm"),
-			Allow:       target.defaultAllow,
-		})
-		// The old rules are only relevant if we aren't starting out with a
-		// disruptive rule.
-		oldRules = nil
-	}
-
-	// NOTE: We traverse through the rules in a sorted order so we always write
-	//       the same set of rules (this is to aid testing).
-
-	// First, we create inverse rules for any old rules not in the new set.
-	// This includes partial-inverse rules for specific permissions. This is a
-	// no-op if we added a disruptive rule, since oldRules will be empty.
-	for _, rule := range oldRules.orderedEntries() {
-		meta, oldPerms := rule.meta, rule.perms
-		newPerms := target.rules[meta]
-		droppedPerms := oldPerms.Difference(newPerms)
-		if !droppedPerms.IsEmpty() {
-			transitionRules = append(transitionRules, &devices.Rule{
-				Type:        meta.node,
-				Major:       meta.major,
-				Minor:       meta.minor,
-				Permissions: droppedPerms,
-				Allow:       target.defaultAllow,
-			})
-		}
-	}
-
-	// Add any additional rules which weren't in the old set. We happen to
-	// filter out rules which are present in both sets, though this isn't
-	// strictly necessary.
-	for _, rule := range target.rules.orderedEntries() {
-		meta, newPerms := rule.meta, rule.perms
-		oldPerms := oldRules[meta]
-		gainedPerms := newPerms.Difference(oldPerms)
-		if !gainedPerms.IsEmpty() {
-			transitionRules = append(transitionRules, &devices.Rule{
-				Type:        meta.node,
-				Major:       meta.major,
-				Minor:       meta.minor,
-				Permissions: gainedPerms,
-				Allow:       !target.defaultAllow,
-			})
-		}
-	}
-	return transitionRules, nil
-}
-
-// Rules returns the minimum set of rules necessary to convert a *deny-all*
-// cgroup to the emulated filter state (note that this is not the same as a
-// default cgroupv1 cgroup -- which is allow-all). This is effectively just a
-// wrapper around Transition() with the source emulator being an empty cgroup.
-func (e *Emulator) Rules() ([]*devices.Rule, error) {
-	defaultCgroup := &Emulator{defaultAllow: false}
-	return defaultCgroup.Transition(e)
-}
-
-func wrapErr(err error, text string) error {
-	if err == nil {
-		return nil
-	}
-	return fmt.Errorf(text+": %w", err)
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
@@ -1,208 +0,0 @@
-// Package devicefilter contains eBPF device filter program
-//
-// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
-//
-// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano)
-// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397
-package devicefilter
-
-import (
-	"errors"
-	"fmt"
-	"math"
-	"strconv"
-
-	"github.com/cilium/ebpf/asm"
-	devicesemulator "github.com/opencontainers/runc/libcontainer/cgroups/devices"
-	"github.com/opencontainers/runc/libcontainer/devices"
-	"golang.org/x/sys/unix"
-)
-
-const (
-	// license string format is same as kernel MODULE_LICENSE macro
-	license = "Apache"
-)
-
-// DeviceFilter returns eBPF device filter program and its license string
-func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) {
-	// Generate the minimum ruleset for the device rules we are given. While we
-	// don't care about minimum transitions in cgroupv2, using the emulator
-	// gives us a guarantee that the behaviour of devices filtering is the same
-	// as cgroupv1, including security hardenings to avoid misconfiguration
-	// (such as punching holes in wildcard rules).
-	emu := new(devicesemulator.Emulator)
-	for _, rule := range rules {
-		if err := emu.Apply(*rule); err != nil {
-			return nil, "", err
-		}
-	}
-	cleanRules, err := emu.Rules()
-	if err != nil {
-		return nil, "", err
-	}
-
-	p := &program{
-		defaultAllow: emu.IsBlacklist(),
-	}
-	p.init()
-
-	for idx, rule := range cleanRules {
-		if rule.Type == devices.WildcardDevice {
-			// We can safely skip over wildcard entries because there should
-			// only be one (at most) at the very start to instruct cgroupv1 to
-			// go into allow-list mode. However we do double-check this here.
-			if idx != 0 || rule.Allow != emu.IsBlacklist() {
-				return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString())
-			}
-			continue
-		}
-		if rule.Allow == p.defaultAllow {
-			// There should be no rules which have an action equal to the
-			// default action, the emulator removes those.
-			return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString())
-		}
-		if err := p.appendRule(rule); err != nil {
-			return nil, "", err
-		}
-	}
-	return p.finalize(), license, nil
-}
-
-type program struct {
-	insts        asm.Instructions
-	defaultAllow bool
-	blockID      int
-}
-
-func (p *program) init() {
-	// struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423
-	/*
-		u32 access_type
-		u32 major
-		u32 minor
-	*/
-	// R2 <- type (lower 16 bit of u32 access_type at R1[0])
-	p.insts = append(p.insts,
-		asm.LoadMem(asm.R2, asm.R1, 0, asm.Word),
-		asm.And.Imm32(asm.R2, 0xFFFF))
-
-	// R3 <- access (upper 16 bit of u32 access_type at R1[0])
-	p.insts = append(p.insts,
-		asm.LoadMem(asm.R3, asm.R1, 0, asm.Word),
-		// RSh: bitwise shift right
-		asm.RSh.Imm32(asm.R3, 16))
-
-	// R4 <- major (u32 major at R1[4])
-	p.insts = append(p.insts,
-		asm.LoadMem(asm.R4, asm.R1, 4, asm.Word))
-
-	// R5 <- minor (u32 minor at R1[8])
-	p.insts = append(p.insts,
-		asm.LoadMem(asm.R5, asm.R1, 8, asm.Word))
-}
-
-// appendRule rule converts an OCI rule to the relevant eBPF block and adds it
-// to the in-progress filter program. In order to operate properly, it must be
-// called with a "clean" rule list (generated by devices.Emulator.Rules() --
-// with any "a" rules removed).
-func (p *program) appendRule(rule *devices.Rule) error {
-	if p.blockID < 0 {
-		return errors.New("the program is finalized")
-	}
-
-	var bpfType int32
-	switch rule.Type {
-	case devices.CharDevice:
-		bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
-	case devices.BlockDevice:
-		bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
-	default:
-		// We do not permit 'a', nor any other types we don't know about.
-		return fmt.Errorf("invalid type %q", string(rule.Type))
-	}
-	if rule.Major > math.MaxUint32 {
-		return fmt.Errorf("invalid major %d", rule.Major)
-	}
-	if rule.Minor > math.MaxUint32 {
-		return fmt.Errorf("invalid minor %d", rule.Major)
-	}
-	hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1
-	hasMinor := rule.Minor >= 0
-	bpfAccess := int32(0)
-	for _, r := range rule.Permissions {
-		switch r {
-		case 'r':
-			bpfAccess |= unix.BPF_DEVCG_ACC_READ
-		case 'w':
-			bpfAccess |= unix.BPF_DEVCG_ACC_WRITE
-		case 'm':
-			bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
-		default:
-			return fmt.Errorf("unknown device access %v", r)
-		}
-	}
-	// If the access is rwm, skip the check.
-	hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)
-
-	var (
-		blockSym         = "block-" + strconv.Itoa(p.blockID)
-		nextBlockSym     = "block-" + strconv.Itoa(p.blockID+1)
-		prevBlockLastIdx = len(p.insts) - 1
-	)
-	p.insts = append(p.insts,
-		// if (R2 != bpfType) goto next
-		asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
-	)
-	if hasAccess {
-		p.insts = append(p.insts,
-			// if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next
-			asm.Mov.Reg32(asm.R1, asm.R3),
-			asm.And.Imm32(asm.R1, bpfAccess),
-			asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym),
-		)
-	}
-	if hasMajor {
-		p.insts = append(p.insts,
-			// if (R4 != major) goto next
-			asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym),
-		)
-	}
-	if hasMinor {
-		p.insts = append(p.insts,
-			// if (R5 != minor) goto next
-			asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym),
-		)
-	}
-	p.insts = append(p.insts, acceptBlock(rule.Allow)...)
-	// set blockSym to the first instruction we added in this iteration
-	p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym)
-	p.blockID++
-	return nil
-}
-
-func (p *program) finalize() asm.Instructions {
-	var v int32
-	if p.defaultAllow {
-		v = 1
-	}
-	blockSym := "block-" + strconv.Itoa(p.blockID)
-	p.insts = append(p.insts,
-		// R0 <- v
-		asm.Mov.Imm32(asm.R0, v).Sym(blockSym),
-		asm.Return(),
-	)
-	p.blockID = -1
-	return p.insts
-}
-
-func acceptBlock(accept bool) asm.Instructions {
-	var v int32
-	if accept {
-		v = 1
-	}
-	return []asm.Instruction{
-		// R0 <- v
-		asm.Mov.Imm32(asm.R0, v),
-		asm.Return(),
-	}
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf_linux.go
@@ -1,253 +0,0 @@
-package ebpf
-
-import (
-	"errors"
-	"fmt"
-	"os"
-	"runtime"
-	"sync"
-	"unsafe"
-
-	"github.com/cilium/ebpf"
-	"github.com/cilium/ebpf/asm"
-	"github.com/cilium/ebpf/link"
-	"github.com/sirupsen/logrus"
-	"golang.org/x/sys/unix"
-)
-
-func nilCloser() error {
-	return nil
-}
-
-func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) {
-	type bpfAttrQuery struct {
-		TargetFd    uint32
-		AttachType  uint32
-		QueryType   uint32
-		AttachFlags uint32
-		ProgIds     uint64 // __aligned_u64
-		ProgCnt     uint32
-	}
-
-	// Currently you can only have 64 eBPF programs attached to a cgroup.
-	size := 64
-	retries := 0
-	for retries < 10 {
-		progIds := make([]uint32, size)
-		query := bpfAttrQuery{
-			TargetFd:   uint32(dirFd),
-			AttachType: uint32(unix.BPF_CGROUP_DEVICE),
-			ProgIds:    uint64(uintptr(unsafe.Pointer(&progIds[0]))),
-			ProgCnt:    uint32(len(progIds)),
-		}
-
-		// Fetch the list of program ids.
-		_, _, errno := unix.Syscall(unix.SYS_BPF,
-			uintptr(unix.BPF_PROG_QUERY),
-			uintptr(unsafe.Pointer(&query)),
-			unsafe.Sizeof(query))
-		size = int(query.ProgCnt)
-		runtime.KeepAlive(query)
-		if errno != 0 {
-			// On ENOSPC we get the correct number of programs.
-			if errno == unix.ENOSPC {
-				retries++
-				continue
-			}
-			return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno)
-		}
-
-		// Convert the ids to program handles.
-		progIds = progIds[:size]
-		programs := make([]*ebpf.Program, 0, len(progIds))
-		for _, progId := range progIds {
-			program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId))
-			if err != nil {
-				// We skip over programs that give us -EACCES or -EPERM. This
-				// is necessary because there may be BPF programs that have
-				// been attached (such as with --systemd-cgroup) which have an
-				// LSM label that blocks us from interacting with the program.
-				//
-				// Because additional BPF_CGROUP_DEVICE programs only can add
-				// restrictions, there's no real issue with just ignoring these
-				// programs (and stops runc from breaking on distributions with
-				// very strict SELinux policies).
-				if errors.Is(err, os.ErrPermission) {
-					logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err)
-					continue
-				}
-				return nil, fmt.Errorf("cannot fetch program from id: %w", err)
-			}
-			programs = append(programs, program)
-		}
-		runtime.KeepAlive(progIds)
-		return programs, nil
-	}
-
-	return nil, errors.New("could not get complete list of CGROUP_DEVICE programs")
-}
-
-var (
-	haveBpfProgReplaceBool bool
-	haveBpfProgReplaceOnce sync.Once
-)
-
-// Loosely based on the BPF_F_REPLACE support check in
-// https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go.
-//
-// TODO: move this logic to cilium/ebpf
-func haveBpfProgReplace() bool {
-	haveBpfProgReplaceOnce.Do(func() {
-		prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
-			Type:    ebpf.CGroupDevice,
-			License: "MIT",
-			Instructions: asm.Instructions{
-				asm.Mov.Imm(asm.R0, 0),
-				asm.Return(),
-			},
-		})
-		if err != nil {
-			logrus.Debugf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err)
-			return
-		}
-		defer prog.Close()
-
-		devnull, err := os.Open("/dev/null")
-		if err != nil {
-			logrus.Debugf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err)
-			return
-		}
-		defer devnull.Close()
-
-		// We know that we have BPF_PROG_ATTACH since we can load
-		// BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL
-		// we know that the feature isn't present.
-		err = link.RawAttachProgram(link.RawAttachProgramOptions{
-			// We rely on this fd being checked after attachFlags.
-			Target: int(devnull.Fd()),
-			// Attempt to "replace" bad fds with this program.
-			Program: prog,
-			Attach:  ebpf.AttachCGroupDevice,
-			Flags:   unix.BPF_F_ALLOW_MULTI | unix.BPF_F_REPLACE,
-		})
-		if errors.Is(err, unix.EINVAL) {
-			// not supported
-			return
-		}
-		// attach_flags test succeeded.
-		if !errors.Is(err, unix.EBADF) {
-			logrus.Debugf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err)
-		}
-		haveBpfProgReplaceBool = true
-	})
-	return haveBpfProgReplaceBool
-}
-
-// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
-//
-// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
-//
-// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
-func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) {
-	// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167).
-	// This limit is not inherited into the container.
-	memlockLimit := &unix.Rlimit{
-		Cur: unix.RLIM_INFINITY,
-		Max: unix.RLIM_INFINITY,
-	}
-	_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
-
-	// Get the list of existing programs.
-	oldProgs, err := findAttachedCgroupDeviceFilters(dirFd)
-	if err != nil {
-		return nilCloser, err
-	}
-	useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1
-
-	// Generate new program.
-	spec := &ebpf.ProgramSpec{
-		Type:         ebpf.CGroupDevice,
-		Instructions: insts,
-		License:      license,
-	}
-	prog, err := ebpf.NewProgram(spec)
-	if err != nil {
-		return nilCloser, err
-	}
-
-	// If there is only one old program, we can just replace it directly.
-	var (
-		replaceProg *ebpf.Program
-		attachFlags uint32 = unix.BPF_F_ALLOW_MULTI
-	)
-	if useReplaceProg {
-		replaceProg = oldProgs[0]
-		attachFlags |= unix.BPF_F_REPLACE
-	}
-	err = link.RawAttachProgram(link.RawAttachProgramOptions{
-		Target:  dirFd,
-		Program: prog,
-		Replace: replaceProg,
-		Attach:  ebpf.AttachCGroupDevice,
-		Flags:   attachFlags,
-	})
-	if err != nil {
-		return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err)
-	}
-	closer := func() error {
-		err = link.RawDetachProgram(link.RawDetachProgramOptions{
-			Target:  dirFd,
-			Program: prog,
-			Attach:  ebpf.AttachCGroupDevice,
-		})
-		if err != nil {
-			return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err)
-		}
-		// TODO: Should we attach the old filters back in this case? Otherwise
-		//       we fail-open on a security feature, which is a bit scary.
-		return nil
-	}
-	if !useReplaceProg {
-		logLevel := logrus.DebugLevel
-		// If there was more than one old program, give a warning (since this
-		// really shouldn't happen with runc-managed cgroups) and then detach
-		// all the old programs.
-		if len(oldProgs) > 1 {
-			// NOTE: Ideally this should be a warning but it turns out that
-			//       systemd-managed cgroups trigger this warning (apparently
-			//       systemd doesn't delete old non-systemd programs when
-			//       setting properties).
-			logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs))
-			logLevel = logrus.InfoLevel
-		}
-		for idx, oldProg := range oldProgs {
-			// Output some extra debug info.
-			if info, err := oldProg.Info(); err == nil {
-				fields := logrus.Fields{
-					"type": info.Type.String(),
-					"tag":  info.Tag,
-					"name": info.Name,
-				}
-				if id, ok := info.ID(); ok {
-					fields["id"] = id
-				}
-				if runCount, ok := info.RunCount(); ok {
-					fields["run_count"] = runCount
-				}
-				if runtime, ok := info.Runtime(); ok {
-					fields["runtime"] = runtime.String()
-				}
-				logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx)
-			}
-			err = link.RawDetachProgram(link.RawDetachProgramOptions{
-				Target:  dirFd,
-				Program: oldProg,
-				Attach:  ebpf.AttachCGroupDevice,
-			})
-			if err != nil {
-				return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err)
-			}
-		}
-	}
-	return closer, nil
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go
@@ -50,22 +50,45 @@ func WriteFile(dir, file, data string) error {
 		return err
 	}
 	defer fd.Close()
-	if err := retryingWriteFile(fd, data); err != nil {
+	if _, err := fd.WriteString(data); err != nil {
 		// Having data in the error message helps in debugging.
 		return fmt.Errorf("failed to write %q: %w", data, err)
 	}
 	return nil
 }

-func retryingWriteFile(fd *os.File, data string) error {
-	for {
-		_, err := fd.Write([]byte(data))
-		if errors.Is(err, unix.EINTR) {
-			logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
-			continue
-		}
+// WriteFileByLine is the same as WriteFile, except if data contains newlines,
+// it is written line by line.
+func WriteFileByLine(dir, file, data string) error {
+	i := strings.Index(data, "\n")
+	if i == -1 {
+		return WriteFile(dir, file, data)
+	}
+
+	fd, err := OpenFile(dir, file, unix.O_WRONLY)
+	if err != nil {
 		return err
 	}
+	defer fd.Close()
+	start := 0
+	for {
+		var line string
+		if i == -1 {
+			line = data[start:]
+		} else {
+			line = data[start : start+i+1]
+		}
+		_, err := fd.WriteString(line)
+		if err != nil {
+			return fmt.Errorf("failed to write %q: %w", line, err)
+		}
+		if i == -1 {
+			break
+		}
+		start += i + 1
+		i = strings.Index(data[start:], "\n")
+	}
+	return nil
 }

 const (
@@ -90,7 +113,7 @@ func prepareOpenat2() error {
 		})
 		if err != nil {
 			prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
-			if err != unix.ENOSYS { //nolint:errorlint // unix errors are bare
+			if err != unix.ENOSYS {
 				logrus.Warnf("falling back to securejoin: %s", prepErr)
 			} else {
 				logrus.Debug("openat2 not available, falling back to securejoin")
@@ -148,8 +171,9 @@ func openFile(dir, file string, flags int) (*os.File, error) {
 		//
 		// TODO: if such usage will ever be common, amend this
 		// to reopen cgroupRootHandle and retry openat2.
-		fdStr := strconv.Itoa(int(cgroupRootHandle.Fd()))
-		fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr)
+		fdPath, closer := utils.ProcThreadSelf("fd/" + strconv.Itoa(int(cgroupRootHandle.Fd())))
+		defer closer()
+		fdDest, _ := os.Readlink(fdPath)
 		if fdDest != cgroupfsDir {
 			// Wrap the error so it is clear that cgroupRootHandle
 			// is opened to an unexpected/wrong directory.
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go
@@ -100,6 +100,30 @@ func (s *CpuGroup) Set(path string, r *configs.Resources) error {
 			period = ""
 		}
 	}
+
+	var burst string
+	if r.CpuBurst != nil {
+		burst = strconv.FormatUint(*r.CpuBurst, 10)
+		if err := cgroups.WriteFile(path, "cpu.cfs_burst_us", burst); err != nil {
+			if errors.Is(err, unix.ENOENT) {
+				// If CPU burst knob is not available (e.g.
+				// older kernel), ignore it.
+				burst = ""
+			} else {
+				// Sometimes when the burst to be set is larger
+				// than the current one, it is rejected by the kernel
+				// (EINVAL) as old_quota/new_burst exceeds the parent
+				// cgroup quota limit. If this happens and the quota is
+				// going to be set, ignore the error for now and retry
+				// after setting the quota.
+				if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 {
+					return err
+				}
+			}
+		} else {
+			burst = ""
+		}
+	}
 	if r.CpuQuota != 0 {
 		if err := cgroups.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil {
 			return err
@@ -109,7 +133,20 @@ func (s *CpuGroup) Set(path string, r *configs.Resources) error {
 				return err
 			}
 		}
+		if burst != "" {
+			if err := cgroups.WriteFile(path, "cpu.cfs_burst_us", burst); err != nil {
+				return err
+			}
+		}
 	}
+
+	if r.CPUIdle != nil {
+		idle := strconv.FormatInt(*r.CPUIdle, 10)
+		if err := cgroups.WriteFile(path, "cpu.idle", idle); err != nil {
+			return err
+		}
+	}
+
 	return s.SetRtSched(path, r)
 }

--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go
@@ -91,7 +91,7 @@ func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
 	if err != nil {
 		return 0, 0, err
 	}
-	// TODO: use strings.SplitN instead.
+
 	fields := strings.Fields(data)
 	if len(fields) < 4 || fields[0] != userField || fields[2] != systemField {
 		return 0, 0, malformedLine(path, file, data)
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go
@@ -195,7 +195,7 @@ func cpusetEnsureParent(current string) error {
 	}
 	// Treat non-existing directory as cgroupfs as it will be created,
 	// and the root cpuset directory obviously exists.
-	if err != nil && err != unix.ENOENT { //nolint:errorlint // unix errors are bare
+	if err != nil && err != unix.ENOENT {
 		return &os.PathError{Op: "statfs", Path: parent, Err: err}
 	}

--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go
@@ -1,20 +1,11 @@
 package fs

 import (
-	"bytes"
-	"errors"
-	"reflect"
-
 	"github.com/opencontainers/runc/libcontainer/cgroups"
-	cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
 	"github.com/opencontainers/runc/libcontainer/configs"
-	"github.com/opencontainers/runc/libcontainer/devices"
-	"github.com/opencontainers/runc/libcontainer/userns"
 )

-type DevicesGroup struct {
-	TestingSkipFinalCheck bool
-}
+type DevicesGroup struct{}

 func (s *DevicesGroup) Name() string {
 	return "devices"
@@ -33,75 +24,14 @@ func (s *DevicesGroup) Apply(path string, r *configs.Resources, pid int) error {
 	return apply(path, pid)
 }

-func loadEmulator(path string) (*cgroupdevices.Emulator, error) {
-	list, err := cgroups.ReadFile(path, "devices.list")
-	if err != nil {
-		return nil, err
-	}
-	return cgroupdevices.EmulatorFromList(bytes.NewBufferString(list))
-}
-
-func buildEmulator(rules []*devices.Rule) (*cgroupdevices.Emulator, error) {
-	// This defaults to a white-list -- which is what we want!
-	emu := &cgroupdevices.Emulator{}
-	for _, rule := range rules {
-		if err := emu.Apply(*rule); err != nil {
-			return nil, err
-		}
-	}
-	return emu, nil
-}
-
 func (s *DevicesGroup) Set(path string, r *configs.Resources) error {
-	if userns.RunningInUserNS() || r.SkipDevices {
-		return nil
-	}
-
-	// Generate two emulators, one for the current state of the cgroup and one
-	// for the requested state by the user.
-	current, err := loadEmulator(path)
-	if err != nil {
-		return err
-	}
-	target, err := buildEmulator(r.Devices)
-	if err != nil {
-		return err
-	}
-
-	// Compute the minimal set of transition rules needed to achieve the
-	// requested state.
-	transitionRules, err := current.Transition(target)
-	if err != nil {
-		return err
-	}
-	for _, rule := range transitionRules {
-		file := "devices.deny"
-		if rule.Allow {
-			file = "devices.allow"
-		}
-		if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil {
-			return err
+	if cgroups.DevicesSetV1 == nil {
+		if len(r.Devices) == 0 {
+			return nil
 		}
+		return cgroups.ErrDevicesUnsupported
 	}
-
-	// Final safety check -- ensure that the resulting state is what was
-	// requested. This is only really correct for white-lists, but for
-	// black-lists we can at least check that the cgroup is in the right mode.
-	//
-	// This safety-check is skipped for the unit tests because we cannot
-	// currently mock devices.list correctly.
-	if !s.TestingSkipFinalCheck {
-		currentAfter, err := loadEmulator(path)
-		if err != nil {
-			return err
-		}
-		if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) {
-			return errors.New("resulting devices cgroup doesn't precisely match target")
-		} else if target.IsBlacklist() != currentAfter.IsBlacklist() {
-			return errors.New("resulting devices cgroup doesn't match target mode")
-		}
-	}
-	return nil
+	return cgroups.DevicesSetV1(path, r)
 }

 func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error {
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go
@@ -54,13 +54,13 @@ type subsystem interface {
 	Set(path string, r *configs.Resources) error
 }

-type manager struct {
+type Manager struct {
 	mu      sync.Mutex
 	cgroups *configs.Cgroup
 	paths   map[string]string
 }

-func NewManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) {
+func NewManager(cg *configs.Cgroup, paths map[string]string) (*Manager, error) {
 	// Some v1 controllers (cpu, cpuset, and devices) expect
 	// cgroups.Resources to not be nil in Apply.
 	if cg.Resources == nil {
@@ -78,7 +78,7 @@ func NewManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, e
 		}
 	}

-	return &manager{
+	return &Manager{
 		cgroups: cg,
 		paths:   paths,
 	}, nil
@@ -105,7 +105,7 @@ func isIgnorableError(rootless bool, err error) bool {
 	return false
 }

-func (m *manager) Apply(pid int) (err error) {
+func (m *Manager) Apply(pid int) (retErr error) {
 	m.mu.Lock()
 	defer m.mu.Unlock()

@@ -129,6 +129,7 @@ func (m *manager) Apply(pid int) (err error) {
 			// later by Set, which fails with a friendly error (see
 			// if path == "" in Set).
 			if isIgnorableError(c.Rootless, err) && c.Path == "" {
+				retErr = cgroups.ErrRootless
 				delete(m.paths, name)
 				continue
 			}
@@ -136,22 +137,22 @@ func (m *manager) Apply(pid int) (err error) {
 		}

 	}
-	return nil
+	return retErr
 }

-func (m *manager) Destroy() error {
+func (m *Manager) Destroy() error {
 	m.mu.Lock()
 	defer m.mu.Unlock()
 	return cgroups.RemovePaths(m.paths)
 }

-func (m *manager) Path(subsys string) string {
+func (m *Manager) Path(subsys string) string {
 	m.mu.Lock()
 	defer m.mu.Unlock()
 	return m.paths[subsys]
 }

-func (m *manager) GetStats() (*cgroups.Stats, error) {
+func (m *Manager) GetStats() (*cgroups.Stats, error) {
 	m.mu.Lock()
 	defer m.mu.Unlock()
 	stats := cgroups.NewStats()
@@ -167,7 +168,7 @@ func (m *manager) GetStats() (*cgroups.Stats, error) {
 	return stats, nil
 }

-func (m *manager) Set(r *configs.Resources) error {
+func (m *Manager) Set(r *configs.Resources) error {
 	if r == nil {
 		return nil
 	}
@@ -183,7 +184,7 @@ func (m *manager) Set(r *configs.Resources) error {
 		if err := sys.Set(path, r); err != nil {
 			// When rootless is true, errors from the device subsystem
 			// are ignored, as it is really not expected to work.
-			if m.cgroups.Rootless && sys.Name() == "devices" {
+			if m.cgroups.Rootless && sys.Name() == "devices" && !errors.Is(err, cgroups.ErrDevicesUnsupported) {
 				continue
 			}
 			// However, errors from other subsystems are not ignored.
@@ -202,7 +203,7 @@ func (m *manager) Set(r *configs.Resources) error {

 // Freeze toggles the container's freezer cgroup depending on the state
 // provided
-func (m *manager) Freeze(state configs.FreezerState) error {
+func (m *Manager) Freeze(state configs.FreezerState) error {
 	path := m.Path("freezer")
 	if path == "" {
 		return errors.New("cannot toggle freezer: cgroups not configured for container")
@@ -218,25 +219,25 @@ func (m *manager) Freeze(state configs.FreezerState) error {
 	return nil
 }

-func (m *manager) GetPids() ([]int, error) {
+func (m *Manager) GetPids() ([]int, error) {
 	return cgroups.GetPids(m.Path("devices"))
 }

-func (m *manager) GetAllPids() ([]int, error) {
+func (m *Manager) GetAllPids() ([]int, error) {
 	return cgroups.GetAllPids(m.Path("devices"))
 }

-func (m *manager) GetPaths() map[string]string {
+func (m *Manager) GetPaths() map[string]string {
 	m.mu.Lock()
 	defer m.mu.Unlock()
 	return m.paths
 }

-func (m *manager) GetCgroups() (*configs.Cgroup, error) {
+func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
 	return m.cgroups, nil
 }

-func (m *manager) GetFreezerState() (configs.FreezerState, error) {
+func (m *Manager) GetFreezerState() (configs.FreezerState, error) {
 	dir := m.Path("freezer")
 	// If the container doesn't have the freezer cgroup, say it's undefined.
 	if dir == "" {
@@ -246,7 +247,7 @@ func (m *manager) GetFreezerState() (configs.FreezerState, error) {
 	return freezer.GetState(dir)
 }

-func (m *manager) Exists() bool {
+func (m *Manager) Exists() bool {
 	return cgroups.PathExists(m.Path("devices"))
 }

@@ -254,7 +255,7 @@ func OOMKillCount(path string) (uint64, error) {
 	return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill")
 }

-func (m *manager) OOMKillCount() (uint64, error) {
+func (m *Manager) OOMKillCount() (uint64, error) {
 	c, err := OOMKillCount(m.Path("memory"))
 	// Ignore ENOENT when rootless as it couldn't create cgroup.
 	if err != nil && m.cgroups.Rootless && os.IsNotExist(err) {
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go
@@ -282,11 +282,11 @@ func getPageUsageByNUMA(path string) (cgroups.PageUsageByNUMA, error) {
 		line := scanner.Text()
 		columns := strings.SplitN(line, " ", maxColumns)
 		for i, column := range columns {
-			byNode := strings.SplitN(column, "=", 2)
+			key, val, ok := strings.Cut(column, "=")
 			// Some custom kernels have non-standard fields, like
 			//   numa_locality 0 0 0 0 0 0 0 0 0 0
 			//   numa_exectime 0
-			if len(byNode) < 2 {
+			if !ok {
 				if i == 0 {
 					// Ignore/skip those.
 					break
@@ -296,7 +296,6 @@ func getPageUsageByNUMA(path string) (cgroups.PageUsageByNUMA, error) {
 					return stats, malformedLine(path, file, line)
 				}
 			}
-			key, val := byNode[0], byNode[1]
 			if i == 0 { // First column: key is name, val is total.
 				field = getNUMAField(&stats, key)
 				if field == nil { // unknown field (new kernel?)
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go
@@ -165,9 +165,8 @@ func subsysPath(root, inner, subsystem string) (string, error) {
 		return filepath.Join(root, filepath.Base(mnt), inner), nil
 	}

-	// Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
-	// process could in container and shared pid namespace with host, and
-	// /proc/1/cgroup could point to whole other world of cgroups.
+	// Use GetOwnCgroupPath for dind-like cases, when cgroupns is not
+	// available. This is ugly.
 	parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
 	if err != nil {
 		return "", err
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpu.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpu.go
@@ -2,16 +2,19 @@ package fs2

 import (
 	"bufio"
+	"errors"
 	"os"
 	"strconv"

+	"golang.org/x/sys/unix"
+
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
 	"github.com/opencontainers/runc/libcontainer/configs"
 )

 func isCpuSet(r *configs.Resources) bool {
-	return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0
+	return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0 || r.CPUIdle != nil || r.CpuBurst != nil
 }

 func setCpu(dirPath string, r *configs.Resources) error {
@@ -19,6 +22,12 @@ func setCpu(dirPath string, r *configs.Resources) error {
 		return nil
 	}

+	if r.CPUIdle != nil {
+		if err := cgroups.WriteFile(dirPath, "cpu.idle", strconv.FormatInt(*r.CPUIdle, 10)); err != nil {
+			return err
+		}
+	}
+
 	// NOTE: .CpuShares is not used here. Conversion is the caller's responsibility.
 	if r.CpuWeight != 0 {
 		if err := cgroups.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(r.CpuWeight, 10)); err != nil {
@@ -26,6 +35,23 @@ func setCpu(dirPath string, r *configs.Resources) error {
 		}
 	}

+	var burst string
+	if r.CpuBurst != nil {
+		burst = strconv.FormatUint(*r.CpuBurst, 10)
+		if err := cgroups.WriteFile(dirPath, "cpu.max.burst", burst); err != nil {
+			// Sometimes when the burst to be set is larger
+			// than the current one, it is rejected by the kernel
+			// (EINVAL) as old_quota/new_burst exceeds the parent
+			// cgroup quota limit. If this happens and the quota is
+			// going to be set, ignore the error for now and retry
+			// after setting the quota.
+			if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 {
+				return err
+			}
+		} else {
+			burst = ""
+		}
+	}
 	if r.CpuQuota != 0 || r.CpuPeriod != 0 {
 		str := "max"
 		if r.CpuQuota > 0 {
@@ -41,6 +67,11 @@ func setCpu(dirPath string, r *configs.Resources) error {
 		if err := cgroups.WriteFile(dirPath, "cpu.max", str); err != nil {
 			return err
 		}
+		if burst != "" {
+			if err := cgroups.WriteFile(dirPath, "cpu.max.burst", burst); err != nil {
+				return err
+			}
+		}
 	}

 	return nil
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/defaultpath.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/defaultpath.go
@@ -55,6 +55,9 @@ func _defaultDirPath(root, cgPath, cgParent, cgName string) (string, error) {
 		return filepath.Join(root, innerPath), nil
 	}

+	// We don't need to use /proc/thread-self here because runc always runs
+	// with every thread in the same cgroup. This lets us avoid having to do
+	// runtime.LockOSThread.
 	ownCgroup, err := parseCgroupFile("/proc/self/cgroup")
 	if err != nil {
 		return "", err
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/devices.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/devices.go
@@ -1,75 +0,0 @@
-package fs2
-
-import (
-	"fmt"
-
-	"golang.org/x/sys/unix"
-
-	"github.com/opencontainers/runc/libcontainer/cgroups/ebpf"
-	"github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter"
-	"github.com/opencontainers/runc/libcontainer/configs"
-	"github.com/opencontainers/runc/libcontainer/devices"
-	"github.com/opencontainers/runc/libcontainer/userns"
-)
-
-func isRWM(perms devices.Permissions) bool {
-	var r, w, m bool
-	for _, perm := range perms {
-		switch perm {
-		case 'r':
-			r = true
-		case 'w':
-			w = true
-		case 'm':
-			m = true
-		}
-	}
-	return r && w && m
-}
-
-// This is similar to the logic applied in crun for handling errors from bpf(2)
-// <https://github.com/containers/crun/blob/0.17/src/libcrun/cgroup.c#L2438-L2470>.
-func canSkipEBPFError(r *configs.Resources) bool {
-	// If we're running in a user namespace we can ignore eBPF rules because we
-	// usually cannot use bpf(2), as well as rootless containers usually don't
-	// have the necessary privileges to mknod(2) device inodes or access
-	// host-level instances (though ideally we would be blocking device access
-	// for rootless containers anyway).
-	if userns.RunningInUserNS() {
-		return true
-	}
-
-	// We cannot ignore an eBPF load error if any rule if is a block rule or it
-	// doesn't permit all access modes.
-	//
-	// NOTE: This will sometimes trigger in cases where access modes are split
-	//       between different rules but to handle this correctly would require
-	//       using ".../libcontainer/cgroup/devices".Emulator.
-	for _, dev := range r.Devices {
-		if !dev.Allow || !isRWM(dev.Permissions) {
-			return false
-		}
-	}
-	return true
-}
-
-func setDevices(dirPath string, r *configs.Resources) error {
-	if r.SkipDevices {
-		return nil
-	}
-	insts, license, err := devicefilter.DeviceFilter(r.Devices)
-	if err != nil {
-		return err
-	}
-	dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0o600)
-	if err != nil {
-		return fmt.Errorf("cannot get dir FD for %s", dirPath)
-	}
-	defer unix.Close(dirFD)
-	if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
-		if !canSkipEBPFError(r) {
-			return err
-		}
-	}
-	return nil
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/fs2.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/fs2.go
@@ -13,7 +13,7 @@ import (

 type parseError = fscommon.ParseError

-type manager struct {
+type Manager struct {
 	config *configs.Cgroup
 	// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
 	dirPath string
@@ -25,7 +25,7 @@ type manager struct {
 // NewManager creates a manager for cgroup v2 unified hierarchy.
 // dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope".
 // If dirPath is empty, it is automatically set using config.
-func NewManager(config *configs.Cgroup, dirPath string) (cgroups.Manager, error) {
+func NewManager(config *configs.Cgroup, dirPath string) (*Manager, error) {
 	if dirPath == "" {
 		var err error
 		dirPath, err = defaultDirPath(config)
@@ -34,14 +34,14 @@ func NewManager(config *configs.Cgroup, dirPath string) (cgroups.Manager, error)
 		}
 	}

-	m := &manager{
+	m := &Manager{
 		config:  config,
 		dirPath: dirPath,
 	}
 	return m, nil
 }

-func (m *manager) getControllers() error {
+func (m *Manager) getControllers() error {
 	if m.controllers != nil {
 		return nil
 	}
@@ -62,7 +62,7 @@ func (m *manager) getControllers() error {
 	return nil
 }

-func (m *manager) Apply(pid int) error {
+func (m *Manager) Apply(pid int) error {
 	if err := CreateCgroupPath(m.dirPath, m.config); err != nil {
 		// Related tests:
 		// - "runc create (no limits + no cgrouppath + no permission) succeeds"
@@ -71,7 +71,7 @@ func (m *manager) Apply(pid int) error {
 		if m.config.Rootless {
 			if m.config.Path == "" {
 				if blNeed, nErr := needAnyControllers(m.config.Resources); nErr == nil && !blNeed {
-					return nil
+					return cgroups.ErrRootless
 				}
 				return fmt.Errorf("rootless needs no limits + no cgrouppath when no permission is granted for cgroups: %w", err)
 			}
@@ -84,15 +84,15 @@ func (m *manager) Apply(pid int) error {
 	return nil
 }

-func (m *manager) GetPids() ([]int, error) {
+func (m *Manager) GetPids() ([]int, error) {
 	return cgroups.GetPids(m.dirPath)
 }

-func (m *manager) GetAllPids() ([]int, error) {
+func (m *Manager) GetAllPids() ([]int, error) {
 	return cgroups.GetAllPids(m.dirPath)
 }

-func (m *manager) GetStats() (*cgroups.Stats, error) {
+func (m *Manager) GetStats() (*cgroups.Stats, error) {
 	var errs []error

 	st := cgroups.NewStats()
@@ -114,6 +114,17 @@ func (m *manager) GetStats() (*cgroups.Stats, error) {
 	if err := statCpu(m.dirPath, st); err != nil && !os.IsNotExist(err) {
 		errs = append(errs, err)
 	}
+	// PSI (since kernel 4.20).
+	var err error
+	if st.CpuStats.PSI, err = statPSI(m.dirPath, "cpu.pressure"); err != nil {
+		errs = append(errs, err)
+	}
+	if st.MemoryStats.PSI, err = statPSI(m.dirPath, "memory.pressure"); err != nil {
+		errs = append(errs, err)
+	}
+	if st.BlkioStats.PSI, err = statPSI(m.dirPath, "io.pressure"); err != nil {
+		errs = append(errs, err)
+	}
 	// hugetlb (since kernel 5.6)
 	if err := statHugeTlb(m.dirPath, st); err != nil && !os.IsNotExist(err) {
 		errs = append(errs, err)
@@ -122,13 +133,17 @@ func (m *manager) GetStats() (*cgroups.Stats, error) {
 	if err := fscommon.RdmaGetStats(m.dirPath, st); err != nil && !os.IsNotExist(err) {
 		errs = append(errs, err)
 	}
+	// misc (since kernel 5.13)
+	if err := statMisc(m.dirPath, st); err != nil && !os.IsNotExist(err) {
+		errs = append(errs, err)
+	}
 	if len(errs) > 0 && !m.config.Rootless {
 		return st, fmt.Errorf("error while statting cgroup v2: %+v", errs)
 	}
 	return st, nil
 }

-func (m *manager) Freeze(state configs.FreezerState) error {
+func (m *Manager) Freeze(state configs.FreezerState) error {
 	if m.config.Resources == nil {
 		return errors.New("cannot toggle freezer: cgroups not configured for container")
 	}
@@ -139,15 +154,15 @@ func (m *manager) Freeze(state configs.FreezerState) error {
 	return nil
 }

-func (m *manager) Destroy() error {
+func (m *Manager) Destroy() error {
 	return cgroups.RemovePath(m.dirPath)
 }

-func (m *manager) Path(_ string) string {
+func (m *Manager) Path(_ string) string {
 	return m.dirPath
 }

-func (m *manager) Set(r *configs.Resources) error {
+func (m *Manager) Set(r *configs.Resources) error {
 	if r == nil {
 		return nil
 	}
@@ -175,8 +190,10 @@ func (m *manager) Set(r *configs.Resources) error {
 	// When rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
 	// However, errors from other subsystems are not ignored.
 	// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
-	if err := setDevices(m.dirPath, r); err != nil && !m.config.Rootless {
-		return err
+	if err := setDevices(m.dirPath, r); err != nil {
+		if !m.config.Rootless || errors.Is(err, cgroups.ErrDevicesUnsupported) {
+			return err
+		}
 	}
 	// cpuset (since kernel 5.0)
 	if err := setCpuset(m.dirPath, r); err != nil {
@@ -201,12 +218,22 @@ func (m *manager) Set(r *configs.Resources) error {
 	return nil
 }

-func (m *manager) setUnified(res map[string]string) error {
+func setDevices(dirPath string, r *configs.Resources) error {
+	switch {
+	case cgroups.DevicesSetV2 != nil: // A device setter is set; delegate to it.
+		return cgroups.DevicesSetV2(dirPath, r)
+	case len(r.Devices) > 0: // Rules were given, but nothing can apply them.
+		return cgroups.ErrDevicesUnsupported
+	}
+	return nil
+}
+
+func (m *Manager) setUnified(res map[string]string) error {
 	for k, v := range res {
 		if strings.Contains(k, "/") {
 			return fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
 		}
-		if err := cgroups.WriteFile(m.dirPath, k, v); err != nil {
+		if err := cgroups.WriteFileByLine(m.dirPath, k, v); err != nil {
 			// Check for both EPERM and ENOENT since O_CREAT is used by WriteFile.
 			if errors.Is(err, os.ErrPermission) || errors.Is(err, os.ErrNotExist) {
 				// Check if a controller is available,
@@ -227,21 +254,21 @@ func (m *manager) setUnified(res map[string]string) error {
 	return nil
 }

-func (m *manager) GetPaths() map[string]string {
+func (m *Manager) GetPaths() map[string]string {
 	paths := make(map[string]string, 1)
 	paths[""] = m.dirPath
 	return paths
 }

-func (m *manager) GetCgroups() (*configs.Cgroup, error) {
+func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
 	return m.config, nil
 }

-func (m *manager) GetFreezerState() (configs.FreezerState, error) {
+func (m *Manager) GetFreezerState() (configs.FreezerState, error) {
 	return getFreezer(m.dirPath)
 }

-func (m *manager) Exists() bool {
+func (m *Manager) Exists() bool {
 	return cgroups.PathExists(m.dirPath)
 }

@@ -249,7 +276,7 @@ func OOMKillCount(path string) (uint64, error) {
 	return fscommon.GetValueByKey(path, "memory.events", "oom_kill")
 }

-func (m *manager) OOMKillCount() (uint64, error) {
+func (m *Manager) OOMKillCount() (uint64, error) {
 	c, err := OOMKillCount(m.dirPath)
 	if err != nil && m.config.Rootless && os.IsNotExist(err) {
 		err = nil
@@ -257,3 +284,35 @@ func (m *manager) OOMKillCount() (uint64, error) {

 	return c, err
 }
+
+// CheckMemoryUsage returns an error if a positive r.Memory or r.MemorySwap
+// limit does not exceed the current memory usage (memory.current) of the
+// cgroup at dirPath. The check is skipped, and nil is returned, unless
+// r.MemoryCheckBeforeUpdate is set and at least one of the limits is positive.
+func CheckMemoryUsage(dirPath string, r *configs.Resources) error {
+	if !r.MemoryCheckBeforeUpdate {
+		return nil
+	}
+	wantMem, wantSwap := r.Memory > 0, r.MemorySwap > 0
+	if !wantMem && !wantSwap {
+		return nil
+	}
+
+	usage, err := fscommon.GetCgroupParamUint(dirPath, "memory.current")
+	if err != nil {
+		// This check is on best-effort basis, so if we can't read the
+		// current usage (cgroup not yet created, or any other error),
+		// we should not fail.
+		return nil
+	}
+
+	// The memory+swap limit is checked first, then the memory limit.
+	switch {
+	case wantSwap && uint64(r.MemorySwap) <= usage:
+		return fmt.Errorf("rejecting memory+swap limit %d <= usage %d", r.MemorySwap, usage)
+	case wantMem && uint64(r.Memory) <= usage:
+		return fmt.Errorf("rejecting memory limit %d <= usage %d", r.Memory, usage)
+	}
+
+	return nil
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go
@@ -40,6 +40,11 @@ func setMemory(dirPath string, r *configs.Resources) error {
 	if !isMemorySet(r) {
 		return nil
 	}
+
+	if err := CheckMemoryUsage(dirPath, r); err != nil {
+		return err
+	}
+
 	swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
 	if err != nil {
 		return err
@@ -52,7 +57,10 @@ func setMemory(dirPath string, r *configs.Resources) error {
 	// never write empty string to `memory.swap.max`, it means set to 0.
 	if swapStr != "" {
 		if err := cgroups.WriteFile(dirPath, "memory.swap.max", swapStr); err != nil {
-			return err
+			// If swap is not enabled, silently ignore setting to max or disabling it.
+			if !(errors.Is(err, os.ErrNotExist) && (swapStr == "max" || swapStr == "0")) {
+				return err
+			}
 		}
 	}

--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/misc.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/misc.go
@@ -0,0 +1,52 @@
+package fs2
+
+import (
+	"bufio"
+	"os"
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+)
+
+// statMisc fills stats.MiscStats from the misc.current and misc.events files
+// in dirPath: values read from the former are stored as Usage, and values
+// read from the latter as Events.
+func statMisc(dirPath string, stats *cgroups.Stats) error {
+	for _, file := range []string{"current", "events"} {
+		fd, err := cgroups.OpenFile(dirPath, "misc."+file, os.O_RDONLY)
+		if err != nil {
+			return err
+		}
+
+		sc := bufio.NewScanner(fd)
+		for sc.Scan() {
+			name, value, err := fscommon.ParseKeyValue(sc.Text())
+			if err != nil {
+				fd.Close()
+				return err
+			}
+
+			// Drop a trailing ".max", if any, so that the map is keyed
+			// by the bare name.
+			name = strings.TrimSuffix(name, ".max")
+
+			// Indexing a missing key yields the zero value, so no
+			// existence check is needed before updating the entry.
+			entry := stats.MiscStats[name]
+			if file == "current" {
+				entry.Usage = value
+			} else {
+				entry.Events = value
+			}
+			stats.MiscStats[name] = entry
+		}
+		fd.Close()
+
+		if err := sc.Err(); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/psi.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/psi.go
@@ -0,0 +1,89 @@
+package fs2
+
+import (
+	"bufio"
+	"errors"
+	"fmt"
+	"os"
+	"strconv"
+	"strings"
+
+	"golang.org/x/sys/unix"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+)
+
+// statPSI parses the PSI file (such as "cpu.pressure") found in dirPath. It
+// returns nil stats and a nil error when the file is missing or unsupported.
+func statPSI(dirPath string, file string) (*cgroups.PSIStats, error) {
+	f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY)
+	if err != nil {
+		if errors.Is(err, os.ErrNotExist) {
+			// Kernel < 4.20, or CONFIG_PSI is not set, or PSI stats are turned
+			// off for the cgroup ("echo 0 > cgroup.pressure", kernel >= 6.1).
+			return nil, nil
+		}
+		return nil, err
+	}
+	defer f.Close()
+	var psistats cgroups.PSIStats
+	sc := bufio.NewScanner(f)
+	for sc.Scan() {
+		parts := strings.Fields(sc.Text())
+		var pv *cgroups.PSIData
+		switch {
+		case len(parts) == 0: // Blank line: skip it, parts[0] would panic.
+		case parts[0] == "some":
+			pv = &psistats.Some
+		case parts[0] == "full":
+			pv = &psistats.Full
+		}
+		if pv != nil {
+			if *pv, err = parsePSIData(parts[1:]); err != nil {
+				return nil, &parseError{Path: dirPath, File: file, Err: err}
+			}
+		}
+	}
+	if err := sc.Err(); err != nil {
+		if errors.Is(err, unix.ENOTSUP) {
+			// Some kernels (e.g. CS9) may return ENOTSUP on read
+			// if psi=1 kernel cmdline parameter is required.
+			return nil, nil
+		}
+		return nil, &parseError{Path: dirPath, File: file, Err: err}
+	}
+	return &psistats, nil
+}
+
+func parsePSIData(psi []string) (cgroups.PSIData, error) {
+	var data cgroups.PSIData
+	for _, field := range psi {
+		key, val, ok := strings.Cut(field, "=") // Each field is "key=value".
+		if !ok {
+			return data, fmt.Errorf("invalid psi data: %q", field)
+		}
+		var avg *float64 // Stays nil unless key names a float field.
+		switch key {
+		case "avg10":
+			avg = &data.Avg10
+		case "avg60":
+			avg = &data.Avg60
+		case "avg300":
+			avg = &data.Avg300
+		case "total": // The only unsigned integer field; parse it right here.
+			total, err := strconv.ParseUint(val, 10, 64)
+			if err != nil {
+				return data, fmt.Errorf("invalid %s PSI value: %w", key, err)
+			}
+			data.Total = total
+		}
+		if avg != nil {
+			num, err := strconv.ParseFloat(val, 64)
+			if err != nil {
+				return data, fmt.Errorf("invalid %s PSI value: %w", key, err)
+			}
+			*avg = num
+		}
+	}
+	return data, nil
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/manager/new.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/manager/new.go
@@ -55,10 +55,10 @@ func NewWithPaths(config *configs.Cgroup, paths map[string]string) (cgroups.Mana
 	return fs.NewManager(config, paths)
 }

-// getUnifiedPath is an implementation detail of libcontainer factory.
-// Historically, it saves cgroup paths as per-subsystem path map (as returned
-// by cm.GetPaths(""), but with v2 we only have one single unified path
-// (with "" as a key).
+// getUnifiedPath is an implementation detail of libcontainer.
+// Historically, libcontainer.Create saves cgroup paths as a per-subsystem
+// path map (as returned by cm.GetPaths()), but with v2 we only have one
+// single unified path (with "" as a key).
 //
 // This function converts from that map to string (using "" as a key),
 // and also checks that the map itself is sane.
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go
@@ -32,9 +32,22 @@ type CpuUsage struct {
 	UsageInUsermode uint64 `json:"usage_in_usermode"`
 }

+type PSIData struct {
+	Avg10  float64 `json:"avg10"`
+	Avg60  float64 `json:"avg60"`
+	Avg300 float64 `json:"avg300"`
+	Total  uint64  `json:"total"`
+}
+
+type PSIStats struct {
+	Some PSIData `json:"some,omitempty"`
+	Full PSIData `json:"full,omitempty"`
+}
+
 type CpuStats struct {
 	CpuUsage       CpuUsage       `json:"cpu_usage,omitempty"`
 	ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
+	PSI            *PSIStats      `json:"psi,omitempty"`
 }

 type CPUSetStats struct {
@@ -91,6 +104,7 @@ type MemoryStats struct {
 	UseHierarchy bool `json:"use_hierarchy"`

 	Stats map[string]uint64 `json:"stats,omitempty"`
+	PSI   *PSIStats         `json:"psi,omitempty"`
 }

 type PageUsageByNUMA struct {
@@ -135,6 +149,7 @@ type BlkioStats struct {
 	IoMergedRecursive       []BlkioStatEntry `json:"io_merged_recursive,omitempty"`
 	IoTimeRecursive         []BlkioStatEntry `json:"io_time_recursive,omitempty"`
 	SectorsRecursive        []BlkioStatEntry `json:"sectors_recursive,omitempty"`
+	PSI                     *PSIStats        `json:"psi,omitempty"`
 }

 type HugetlbStats struct {
@@ -157,6 +172,13 @@ type RdmaStats struct {
 	RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"`
 }

+type MiscStats struct {
+	// current resource usage for a key in misc
+	Usage uint64 `json:"usage,omitempty"`
+	// number of times the resource usage was about to go over the max boundary
+	Events uint64 `json:"events,omitempty"`
+}
+
 type Stats struct {
 	CpuStats    CpuStats    `json:"cpu_stats,omitempty"`
 	CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"`
@@ -166,10 +188,13 @@ type Stats struct {
 	// the map is in the format "size of hugepage: stats of the hugepage"
 	HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
 	RdmaStats    RdmaStats               `json:"rdma_stats,omitempty"`
+	// the map is in the format "misc resource name: stats of the key"
+	MiscStats map[string]MiscStats `json:"misc_stats,omitempty"`
 }

 func NewStats() *Stats {
 	memoryStats := MemoryStats{Stats: make(map[string]uint64)}
 	hugetlbStats := make(map[string]HugetlbStats)
-	return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats}
+	miscStats := make(map[string]MiscStats)
+	return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats, MiscStats: miscStats}
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go
@@ -1,13 +1,11 @@
 package systemd

 import (
-	"bufio"
 	"context"
 	"errors"
 	"fmt"
 	"math"
 	"os"
-	"regexp"
 	"strconv"
 	"strings"
 	"sync"
@@ -17,9 +15,8 @@ import (
 	dbus "github.com/godbus/dbus/v5"
 	"github.com/sirupsen/logrus"

-	cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
+	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/configs"
-	"github.com/opencontainers/runc/libcontainer/devices"
 )

 const (
@@ -35,6 +32,13 @@ var (

 	isRunningSystemdOnce sync.Once
 	isRunningSystemd     bool
+
+	// GenerateDeviceProps is a function to generate systemd device
+	// properties, used by Set methods. Unless
+	// [github.com/opencontainers/runc/libcontainer/cgroups/devices]
+	// package is imported, it is set to nil, so cgroup managers can't
+	// configure devices.
+	GenerateDeviceProps func(r *configs.Resources, sdVer int) ([]systemdDbus.Property, error)
 )

 // NOTE: This function comes from package github.com/coreos/go-systemd/util
@@ -86,228 +90,6 @@ func ExpandSlice(slice string) (string, error) {
 	return path, nil
 }

-func groupPrefix(ruleType devices.Type) (string, error) {
-	switch ruleType {
-	case devices.BlockDevice:
-		return "block-", nil
-	case devices.CharDevice:
-		return "char-", nil
-	default:
-		return "", fmt.Errorf("device type %v has no group prefix", ruleType)
-	}
-}
-
-// findDeviceGroup tries to find the device group name (as listed in
-// /proc/devices) with the type prefixed as required for DeviceAllow, for a
-// given (type, major) combination. If more than one device group exists, an
-// arbitrary one is chosen.
-func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) {
-	fh, err := os.Open("/proc/devices")
-	if err != nil {
-		return "", err
-	}
-	defer fh.Close()
-
-	prefix, err := groupPrefix(ruleType)
-	if err != nil {
-		return "", err
-	}
-
-	scanner := bufio.NewScanner(fh)
-	var currentType devices.Type
-	for scanner.Scan() {
-		// We need to strip spaces because the first number is column-aligned.
-		line := strings.TrimSpace(scanner.Text())
-
-		// Handle the "header" lines.
-		switch line {
-		case "Block devices:":
-			currentType = devices.BlockDevice
-			continue
-		case "Character devices:":
-			currentType = devices.CharDevice
-			continue
-		case "":
-			continue
-		}
-
-		// Skip lines unrelated to our type.
-		if currentType != ruleType {
-			continue
-		}
-
-		// Parse out the (major, name).
-		var (
-			currMajor int64
-			currName  string
-		)
-		if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 {
-			if err == nil {
-				err = errors.New("wrong number of fields")
-			}
-			return "", fmt.Errorf("scan /proc/devices line %q: %w", line, err)
-		}
-
-		if currMajor == ruleMajor {
-			return prefix + currName, nil
-		}
-	}
-	if err := scanner.Err(); err != nil {
-		return "", fmt.Errorf("reading /proc/devices: %w", err)
-	}
-	// Couldn't find the device group.
-	return "", nil
-}
-
-// DeviceAllow is the dbus type "a(ss)" which means we need a struct
-// to represent it in Go.
-type deviceAllowEntry struct {
-	Path  string
-	Perms string
-}
-
-func allowAllDevices() []systemdDbus.Property {
-	// Setting mode to auto and removing all DeviceAllow rules
-	// results in allowing access to all devices.
-	return []systemdDbus.Property{
-		newProp("DevicePolicy", "auto"),
-		newProp("DeviceAllow", []deviceAllowEntry{}),
-	}
-}
-
-// generateDeviceProperties takes the configured device rules and generates a
-// corresponding set of systemd properties to configure the devices correctly.
-func generateDeviceProperties(r *configs.Resources, sdVer int) ([]systemdDbus.Property, error) {
-	if r.SkipDevices {
-		return nil, nil
-	}
-
-	properties := []systemdDbus.Property{
-		// Always run in the strictest white-list mode.
-		newProp("DevicePolicy", "strict"),
-		// Empty the DeviceAllow array before filling it.
-		newProp("DeviceAllow", []deviceAllowEntry{}),
-	}
-
-	// Figure out the set of rules.
-	configEmu := &cgroupdevices.Emulator{}
-	for _, rule := range r.Devices {
-		if err := configEmu.Apply(*rule); err != nil {
-			return nil, fmt.Errorf("unable to apply rule for systemd: %w", err)
-		}
-	}
-	// systemd doesn't support blacklists. So we log a warning, and tell
-	// systemd to act as a deny-all whitelist. This ruleset will be replaced
-	// with our normal fallback code. This may result in spurious errors, but
-	// the only other option is to error out here.
-	if configEmu.IsBlacklist() {
-		// However, if we're dealing with an allow-all rule then we can do it.
-		if configEmu.IsAllowAll() {
-			return allowAllDevices(), nil
-		}
-		logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule")
-		return properties, nil
-	}
-
-	// Now generate the set of rules we actually need to apply. Unlike the
-	// normal devices cgroup, in "strict" mode systemd defaults to a deny-all
-	// whitelist which is the default for devices.Emulator.
-	finalRules, err := configEmu.Rules()
-	if err != nil {
-		return nil, fmt.Errorf("unable to get simplified rules for systemd: %w", err)
-	}
-	var deviceAllowList []deviceAllowEntry
-	for _, rule := range finalRules {
-		if !rule.Allow {
-			// Should never happen.
-			return nil, fmt.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule)
-		}
-		switch rule.Type {
-		case devices.BlockDevice, devices.CharDevice:
-		default:
-			// Should never happen.
-			return nil, fmt.Errorf("invalid device type for DeviceAllow: %v", rule.Type)
-		}
-
-		entry := deviceAllowEntry{
-			Perms: string(rule.Permissions),
-		}
-
-		// systemd has a fairly odd (though understandable) syntax here, and
-		// because of the OCI configuration format we have to do quite a bit of
-		// trickery to convert things:
-		//
-		//  * Concrete rules with non-wildcard major/minor numbers have to use
-		//    /dev/{block,char}/MAJOR:minor paths. Before v240, systemd uses
-		//    stat(2) on such paths to look up device properties, meaning we
-		//    cannot add whitelist rules for devices that don't exist. Since v240,
-		//    device properties are parsed from the path string.
-		//
-		//    However, path globbing is not support for path-based rules so we
-		//    need to handle wildcards in some other manner.
-		//
-		//  * Wildcard-minor rules have to specify a "device group name" (the
-		//    second column in /proc/devices).
-		//
-		//  * Wildcard (major and minor) rules can just specify a glob with the
-		//    type ("char-*" or "block-*").
-		//
-		// The only type of rule we can't handle is wildcard-major rules, and
-		// so we'll give a warning in that case (note that the fallback code
-		// will insert any rules systemd couldn't handle). What amazing fun.
-
-		if rule.Major == devices.Wildcard {
-			// "_ *:n _" rules aren't supported by systemd.
-			if rule.Minor != devices.Wildcard {
-				logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule)
-				continue
-			}
-
-			// "_ *:* _" rules just wildcard everything.
-			prefix, err := groupPrefix(rule.Type)
-			if err != nil {
-				return nil, err
-			}
-			entry.Path = prefix + "*"
-		} else if rule.Minor == devices.Wildcard {
-			// "_ n:* _" rules require a device group from /proc/devices.
-			group, err := findDeviceGroup(rule.Type, rule.Major)
-			if err != nil {
-				return nil, fmt.Errorf("unable to find device '%v/%d': %w", rule.Type, rule.Major, err)
-			}
-			if group == "" {
-				// Couldn't find a group.
-				logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule)
-				continue
-			}
-			entry.Path = group
-		} else {
-			// "_ n:m _" rules are just a path in /dev/{block,char}/.
-			switch rule.Type {
-			case devices.BlockDevice:
-				entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor)
-			case devices.CharDevice:
-				entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor)
-			}
-			if sdVer < 240 {
-				// Old systemd versions use stat(2) on path to find out device major:minor
-				// numbers and type. If the path doesn't exist, it will not add the rule,
-				// emitting a warning instead.
-				// Since all of this logic is best-effort anyway (we manually set these
-				// rules separately to systemd) we can safely skip entries that don't
-				// have a corresponding path.
-				if _, err := os.Stat(entry.Path); err != nil {
-					continue
-				}
-			}
-		}
-		deviceAllowList = append(deviceAllowList, entry)
-	}
-
-	properties = append(properties, newProp("DeviceAllow", deviceAllowList))
-	return properties, nil
-}
-
 func newProp(name string, units interface{}) systemdDbus.Property {
 	return systemdDbus.Property{
 		Name:  name,
@@ -477,18 +259,22 @@ func systemdVersion(cm *dbusConnManager) int {
 	return version
 }

-func systemdVersionAtoi(verStr string) (int, error) {
-	// verStr should be of the form:
-	// "v245.4-1.fc32", "245", "v245-1.fc32", "245-1.fc32" (without quotes).
-	// The result for all of the above should be 245.
-	// Thus, we unconditionally remove the "v" prefix
-	// and then match on the first integer we can grab.
-	re := regexp.MustCompile(`v?([0-9]+)`)
-	matches := re.FindStringSubmatch(verStr)
-	if len(matches) < 2 {
-		return 0, fmt.Errorf("can't parse version %s: incorrect number of matches %v", verStr, matches)
+// systemdVersionAtoi extracts a numeric systemd version from the argument.
+// The argument should be of the form: "v245.4-1.fc32", "245", "v245-1.fc32",
+// "245-1.fc32" (with or without quotes). The result for all of the above
+// should be 245.
+func systemdVersionAtoi(str string) (int, error) {
+	// Unconditionally strip any leading '"' and 'v' characters.
+	str = strings.TrimLeft(str, `"v`)
+	// Match on the first integer we can grab.
+	for i := 0; i < len(str); i++ {
+		if str[i] < '0' || str[i] > '9' {
+			// First non-digit: cut the tail.
+			str = str[:i]
+			break
+		}
 	}
-	ver, err := strconv.Atoi(matches[1])
+	ver, err := strconv.Atoi(str)
 	if err != nil {
 		return -1, fmt.Errorf("can't parse version: %w", err)
 	}
@@ -562,3 +348,16 @@ func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems st
 	}
 	return nil
 }
+
+// generateDeviceProperties takes the configured device rules and generates a
+// corresponding set of systemd properties to configure the devices correctly.
+func generateDeviceProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
+	if GenerateDeviceProps == nil {
+		if len(r.Devices) > 0 {
+			return nil, cgroups.ErrDevicesUnsupported
+		}
+		return nil, nil
+	}
+
+	return GenerateDeviceProps(r, systemdVersion(cm))
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/cpuset.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/cpuset.go
@@ -21,13 +21,13 @@ func RangeToBits(str string) ([]byte, error) {
 		if r == "" {
 			continue
 		}
-		ranges := strings.SplitN(r, "-", 2)
-		if len(ranges) > 1 {
-			start, err := strconv.ParseUint(ranges[0], 10, 32)
+		startr, endr, ok := strings.Cut(r, "-")
+		if ok {
+			start, err := strconv.ParseUint(startr, 10, 32)
 			if err != nil {
 				return nil, err
 			}
-			end, err := strconv.ParseUint(ranges[1], 10, 32)
+			end, err := strconv.ParseUint(endr, 10, 32)
 			if err != nil {
 				return nil, err
 			}
@@ -38,7 +38,7 @@ func RangeToBits(str string) ([]byte, error) {
 				bits.SetBit(bits, int(i), 1)
 			}
 		} else {
-			val, err := strconv.ParseUint(ranges[0], 10, 32)
+			val, err := strconv.ParseUint(startr, 10, 32)
 			if err != nil {
 				return nil, err
 			}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/devices.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/devices.go
@@ -0,0 +1,74 @@
+package systemd
+
+import (
+	"reflect"
+
+	dbus "github.com/godbus/dbus/v5"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// freezeBeforeSet answers whether there is a need to freeze the cgroup before
+// applying its systemd unit properties, and thaw after, while avoiding
+// unnecessary freezer state changes.
+//
+// The reason why we have to freeze is that systemd's application of device
+// rules is done disruptively, resulting in spurious errors to common devices
+// (unlike our fs driver, they will happily write deny-all rules to running
+// containers). So we have to freeze the container to keep it from getting
+// an occasional "permission denied" error.
+func (m *LegacyManager) freezeBeforeSet(unitName string, r *configs.Resources) (needsFreeze, needsThaw bool, err error) {
+	// Special case for SkipDevices (as used by Kubernetes to create pod
+	// cgroups with allow-all device policy).
+	if r.SkipDevices {
+		if r.SkipFreezeOnSet {
+			// Both needsFreeze and needsThaw are false.
+			return
+		}
+
+		// No need to freeze if SkipDevices is set, and either
+		// (1) systemd unit does not (yet) exist, or
+		// (2) it has DevicePolicy=auto and empty DeviceAllow list.
+		//
+		// Interestingly, (1) and (2) are the same here because
+		// a non-existent unit returns default properties,
+		// and settings in (2) are the defaults.
+		//
+		// Do not return errors from getUnitTypeProperty, as they alone
+		// should not prevent Set from working.
+
+		unitType := getUnitType(unitName)
+
+		devPolicy, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DevicePolicy")
+		if e == nil && devPolicy.Value == dbus.MakeVariant("auto") {
+			devAllow, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DeviceAllow")
+			if e == nil {
+				if rv := reflect.ValueOf(devAllow.Value.Value()); rv.Kind() == reflect.Slice && rv.Len() == 0 {
+					needsFreeze = false
+					needsThaw = false
+					return
+				}
+			}
+		}
+	}
+
+	needsFreeze = true
+	needsThaw = true
+
+	// Check the current freezer state.
+	freezerState, err := m.GetFreezerState()
+	if err != nil {
+		return
+	}
+	if freezerState == configs.Frozen {
+		// Already frozen, and should stay frozen.
+		needsFreeze = false
+		needsThaw = false
+	}
+
+	if r.Freezer == configs.Frozen {
+		// Will be frozen anyway -- no need to thaw.
+		needsThaw = false
+	}
+	return
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/user.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/user.go
@@ -13,8 +13,7 @@ import (

 	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
 	dbus "github.com/godbus/dbus/v5"
-
-	"github.com/opencontainers/runc/libcontainer/userns"
+	"github.com/moby/sys/userns"
 )

 // newUserSystemdDbus creates a connection for systemd user-instance.
@@ -77,9 +76,8 @@ func DetectUID() (int, error) {
 	return -1, errors.New("could not detect the OwnerUID")
 }

-// DetectUserDbusSessionBusAddress returns $DBUS_SESSION_BUS_ADDRESS if set.
-// Otherwise returns "unix:path=$XDG_RUNTIME_DIR/bus" if $XDG_RUNTIME_DIR/bus exists.
-// Otherwise parses the value from `systemctl --user show-environment` .
+// DetectUserDbusSessionBusAddress returns $DBUS_SESSION_BUS_ADDRESS, if set.
+// Otherwise it returns "unix:path=$XDG_RUNTIME_DIR/bus", if $XDG_RUNTIME_DIR/bus exists.
 func DetectUserDbusSessionBusAddress() (string, error) {
 	if env := os.Getenv("DBUS_SESSION_BUS_ADDRESS"); env != "" {
 		return env, nil
@@ -87,20 +85,9 @@ func DetectUserDbusSessionBusAddress() (string, error) {
 	if xdr := os.Getenv("XDG_RUNTIME_DIR"); xdr != "" {
 		busPath := filepath.Join(xdr, "bus")
 		if _, err := os.Stat(busPath); err == nil {
-			busAddress := "unix:path=" + busPath
+			busAddress := "unix:path=" + dbus.EscapeBusAddressValue(busPath)
 			return busAddress, nil
 		}
 	}
-	b, err := exec.Command("systemctl", "--user", "--no-pager", "show-environment").CombinedOutput()
-	if err != nil {
-		return "", fmt.Errorf("could not execute `systemctl --user --no-pager show-environment` (output=%q): %w", string(b), err)
-	}
-	scanner := bufio.NewScanner(bytes.NewReader(b))
-	for scanner.Scan() {
-		s := strings.TrimSpace(scanner.Text())
-		if strings.HasPrefix(s, "DBUS_SESSION_BUS_ADDRESS=") {
-			return strings.TrimPrefix(s, "DBUS_SESSION_BUS_ADDRESS="), nil
-		}
-	}
-	return "", errors.New("could not detect DBUS_SESSION_BUS_ADDRESS from `systemctl --user --no-pager show-environment`. Make sure you have installed the dbus-user-session or dbus-daemon package and then run: `systemctl --user start dbus`")
+	return "", errors.New("could not detect DBUS_SESSION_BUS_ADDRESS from the environment; make sure you have installed the dbus-user-session or dbus-daemon package; note you may need to re-login")
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v1.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v1.go
@@ -4,12 +4,10 @@ import (
 	"errors"
 	"os"
 	"path/filepath"
-	"reflect"
 	"strings"
 	"sync"

 	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
-	"github.com/godbus/dbus/v5"
 	"github.com/sirupsen/logrus"

 	"github.com/opencontainers/runc/libcontainer/cgroups"
@@ -17,14 +15,14 @@ import (
 	"github.com/opencontainers/runc/libcontainer/configs"
 )

-type legacyManager struct {
+type LegacyManager struct {
 	mu      sync.Mutex
 	cgroups *configs.Cgroup
 	paths   map[string]string
 	dbus    *dbusConnManager
 }

-func NewLegacyManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) {
+func NewLegacyManager(cg *configs.Cgroup, paths map[string]string) (*LegacyManager, error) {
 	if cg.Rootless {
 		return nil, errors.New("cannot use rootless systemd cgroups manager on cgroup v1")
 	}
@@ -38,7 +36,7 @@ func NewLegacyManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Mana
 			return nil, err
 		}
 	}
-	return &legacyManager{
+	return &LegacyManager{
 		cgroups: cg,
 		paths:   paths,
 		dbus:    newDbusConnManager(false),
@@ -48,7 +46,7 @@ func NewLegacyManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Mana
 type subsystem interface {
 	// Name returns the name of the subsystem.
 	Name() string
-	// Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
+	// GetStats returns the stats, as 'stats', corresponding to the cgroup under 'path'.
 	GetStats(path string, stats *cgroups.Stats) error
 	// Set sets cgroup resource limits.
 	Set(path string, r *configs.Resources) error
@@ -77,7 +75,7 @@ var legacySubsystems = []subsystem{
 func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
 	var properties []systemdDbus.Property

-	deviceProperties, err := generateDeviceProperties(r, systemdVersion(cm))
+	deviceProperties, err := generateDeviceProperties(r, cm)
 	if err != nil {
 		return nil, err
 	}
@@ -160,7 +158,7 @@ func initPaths(c *configs.Cgroup) (map[string]string, error) {
 	return paths, nil
 }

-func (m *legacyManager) Apply(pid int) error {
+func (m *LegacyManager) Apply(pid int) error {
 	var (
 		c          = m.cgroups
 		unitName   = getUnitName(c)
@@ -218,7 +216,7 @@ func (m *legacyManager) Apply(pid int) error {
 	return nil
 }

-func (m *legacyManager) Destroy() error {
+func (m *LegacyManager) Destroy() error {
 	m.mu.Lock()
 	defer m.mu.Unlock()

@@ -234,13 +232,13 @@ func (m *legacyManager) Destroy() error {
 	return stopErr
 }

-func (m *legacyManager) Path(subsys string) string {
+func (m *LegacyManager) Path(subsys string) string {
 	m.mu.Lock()
 	defer m.mu.Unlock()
 	return m.paths[subsys]
 }

-func (m *legacyManager) joinCgroups(pid int) error {
+func (m *LegacyManager) joinCgroups(pid int) error {
 	for _, sys := range legacySubsystems {
 		name := sys.Name()
 		switch name {
@@ -277,7 +275,7 @@ func getSubsystemPath(slice, unit, subsystem string) (string, error) {
 	return filepath.Join(mountpoint, slice, unit), nil
 }

-func (m *legacyManager) Freeze(state configs.FreezerState) error {
+func (m *LegacyManager) Freeze(state configs.FreezerState) error {
 	err := m.doFreeze(state)
 	if err == nil {
 		m.cgroups.Resources.Freezer = state
@@ -287,7 +285,7 @@ func (m *legacyManager) Freeze(state configs.FreezerState) error {

 // doFreeze is the same as Freeze but without
 // changing the m.cgroups.Resources.Frozen field.
-func (m *legacyManager) doFreeze(state configs.FreezerState) error {
+func (m *LegacyManager) doFreeze(state configs.FreezerState) error {
 	path, ok := m.paths["freezer"]
 	if !ok {
 		return errSubsystemDoesNotExist
@@ -297,7 +295,7 @@ func (m *legacyManager) doFreeze(state configs.FreezerState) error {
 	return freezer.Set(path, resources)
 }

-func (m *legacyManager) GetPids() ([]int, error) {
+func (m *LegacyManager) GetPids() ([]int, error) {
 	path, ok := m.paths["devices"]
 	if !ok {
 		return nil, errSubsystemDoesNotExist
@@ -305,7 +303,7 @@ func (m *legacyManager) GetPids() ([]int, error) {
 	return cgroups.GetPids(path)
 }

-func (m *legacyManager) GetAllPids() ([]int, error) {
+func (m *LegacyManager) GetAllPids() ([]int, error) {
 	path, ok := m.paths["devices"]
 	if !ok {
 		return nil, errSubsystemDoesNotExist
@@ -313,7 +311,7 @@ func (m *legacyManager) GetAllPids() ([]int, error) {
 	return cgroups.GetAllPids(path)
 }

-func (m *legacyManager) GetStats() (*cgroups.Stats, error) {
+func (m *LegacyManager) GetStats() (*cgroups.Stats, error) {
 	m.mu.Lock()
 	defer m.mu.Unlock()
 	stats := cgroups.NewStats()
@@ -330,72 +328,7 @@ func (m *legacyManager) GetStats() (*cgroups.Stats, error) {
 	return stats, nil
 }

-// freezeBeforeSet answers whether there is a need to freeze the cgroup before
-// applying its systemd unit properties, and thaw after, while avoiding
-// unnecessary freezer state changes.
-//
-// The reason why we have to freeze is that systemd's application of device
-// rules is done disruptively, resulting in spurious errors to common devices
-// (unlike our fs driver, they will happily write deny-all rules to running
-// containers). So we have to freeze the container to avoid the container get
-// an occasional "permission denied" error.
-func (m *legacyManager) freezeBeforeSet(unitName string, r *configs.Resources) (needsFreeze, needsThaw bool, err error) {
-	// Special case for SkipDevices, as used by Kubernetes to create pod
-	// cgroups with allow-all device policy).
-	if r.SkipDevices {
-		if r.SkipFreezeOnSet {
-			// Both needsFreeze and needsThaw are false.
-			return
-		}
-
-		// No need to freeze if SkipDevices is set, and either
-		// (1) systemd unit does not (yet) exist, or
-		// (2) it has DevicePolicy=auto and empty DeviceAllow list.
-		//
-		// Interestingly, (1) and (2) are the same here because
-		// a non-existent unit returns default properties,
-		// and settings in (2) are the defaults.
-		//
-		// Do not return errors from getUnitTypeProperty, as they alone
-		// should not prevent Set from working.
-
-		unitType := getUnitType(unitName)
-
-		devPolicy, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DevicePolicy")
-		if e == nil && devPolicy.Value == dbus.MakeVariant("auto") {
-			devAllow, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DeviceAllow")
-			if e == nil {
-				if rv := reflect.ValueOf(devAllow.Value.Value()); rv.Kind() == reflect.Slice && rv.Len() == 0 {
-					needsFreeze = false
-					needsThaw = false
-					return
-				}
-			}
-		}
-	}
-
-	needsFreeze = true
-	needsThaw = true
-
-	// Check the current freezer state.
-	freezerState, err := m.GetFreezerState()
-	if err != nil {
-		return
-	}
-	if freezerState == configs.Frozen {
-		// Already frozen, and should stay frozen.
-		needsFreeze = false
-		needsThaw = false
-	}
-
-	if r.Freezer == configs.Frozen {
-		// Will be frozen anyway -- no need to thaw.
-		needsThaw = false
-	}
-	return
-}
-
-func (m *legacyManager) Set(r *configs.Resources) error {
+func (m *LegacyManager) Set(r *configs.Resources) error {
 	if r == nil {
 		return nil
 	}
@@ -452,17 +385,17 @@ func (m *legacyManager) Set(r *configs.Resources) error {
 	return nil
 }

-func (m *legacyManager) GetPaths() map[string]string {
+func (m *LegacyManager) GetPaths() map[string]string {
 	m.mu.Lock()
 	defer m.mu.Unlock()
 	return m.paths
 }

-func (m *legacyManager) GetCgroups() (*configs.Cgroup, error) {
+func (m *LegacyManager) GetCgroups() (*configs.Cgroup, error) {
 	return m.cgroups, nil
 }

-func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) {
+func (m *LegacyManager) GetFreezerState() (configs.FreezerState, error) {
 	path, ok := m.paths["freezer"]
 	if !ok {
 		return configs.Undefined, nil
@@ -471,10 +404,10 @@ func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) {
 	return freezer.GetState(path)
 }

-func (m *legacyManager) Exists() bool {
+func (m *LegacyManager) Exists() bool {
 	return cgroups.PathExists(m.Path("devices"))
 }

-func (m *legacyManager) OOMKillCount() (uint64, error) {
+func (m *LegacyManager) OOMKillCount() (uint64, error) {
 	return fs.OOMKillCount(m.Path("memory"))
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go
@@ -20,7 +20,11 @@ import (
 	"github.com/opencontainers/runc/libcontainer/configs"
 )

-type unifiedManager struct {
+const (
+	cpuIdleSupportedVersion = 252
+)
+
+type UnifiedManager struct {
 	mu      sync.Mutex
 	cgroups *configs.Cgroup
 	// path is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
@@ -29,8 +33,8 @@ type unifiedManager struct {
 	fsMgr cgroups.Manager
 }

-func NewUnifiedManager(config *configs.Cgroup, path string) (cgroups.Manager, error) {
-	m := &unifiedManager{
+func NewUnifiedManager(config *configs.Cgroup, path string) (*UnifiedManager, error) {
+	m := &UnifiedManager{
 		cgroups: config,
 		path:    path,
 		dbus:    newDbusConnManager(config.Rootless),
@@ -48,6 +52,14 @@ func NewUnifiedManager(config *configs.Cgroup, path string) (cgroups.Manager, er
 	return m, nil
 }

+func shouldSetCPUIdle(cm *dbusConnManager, v string) bool {
+	// The only valid values for cpu.idle are 0 and 1. As it is
+	// not possible to directly set cpu.idle to 0 via systemd,
+	// ignore 0. Ignore other values as we'll error out later
+	// in Set() while calling fsMgr.Set().
+	return v == "1" && systemdVersion(cm) >= cpuIdleSupportedVersion
+}
+
 // unifiedResToSystemdProps tries to convert from Cgroup.Resources.Unified
 // key/value map (where key is cgroupfs file name) to systemd unit properties.
 // This is on a best-effort basis, so the properties that are not known
@@ -64,8 +76,7 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props
 		if strings.Contains(k, "/") {
 			return nil, fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
 		}
-		sk := strings.SplitN(k, ".", 2)
-		if len(sk) != 2 {
+		if strings.IndexByte(k, '.') <= 0 {
 			return nil, fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k)
 		}
 		// Kernel is quite forgiving to extra whitespace
@@ -73,6 +84,14 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props
 		v = strings.TrimSpace(v)
 		// Please keep cases in alphabetical order.
 		switch k {
+		case "cpu.idle":
+			if shouldSetCPUIdle(cm, v) {
+				// Setting CPUWeight to 0 tells systemd
+				// to set cpu.idle to 1.
+				props = append(props,
+					newProp("CPUWeight", uint64(0)))
+			}
+
 		case "cpu.max":
 			// value: quota [period]
 			quota := int64(0) // 0 means "unlimited" for addCpuQuota, if period is set
@@ -98,6 +117,12 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props
 			addCpuQuota(cm, &props, quota, period)

 		case "cpu.weight":
+			if shouldSetCPUIdle(cm, strings.TrimSpace(res["cpu.idle"])) {
+				// Do not add duplicate CPUWeight property
+				// (see case "cpu.idle" above).
+				logrus.Warn("unable to apply both cpu.weight and cpu.idle to systemd, ignoring cpu.weight")
+				continue
+			}
 			num, err := strconv.ParseUint(v, 10, 64)
 			if err != nil {
 				return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
@@ -174,7 +199,14 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props
 	return props, nil
 }

-func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
+func genV2ResourcesProperties(dirPath string, r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
+	// We need this check before setting systemd properties, otherwise
+	// the container is OOM-killed and the systemd unit is removed
+	// before we get to fsMgr.Set().
+	if err := fs2.CheckMemoryUsage(dirPath, r); err != nil {
+		return nil, err
+	}
+
 	var properties []systemdDbus.Property

 	// NOTE: This is of questionable correctness because we insert our own
@@ -182,7 +214,7 @@ func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]syst
 	//       aren't the end of the world, but it is a bit concerning. However
 	//       it's unclear if systemd removes all eBPF programs attached when
 	//       doing SetUnitProperties...
-	deviceProperties, err := generateDeviceProperties(r, systemdVersion(cm))
+	deviceProperties, err := generateDeviceProperties(r, cm)
 	if err != nil {
 		return nil, err
 	}
@@ -206,9 +238,21 @@ func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]syst
 			newProp("MemorySwapMax", uint64(swap)))
 	}

-	if r.CpuWeight != 0 {
+	idleSet := false
+	// The logic here is the same as in shouldSetCPUIdle.
+	if r.CPUIdle != nil && *r.CPUIdle == 1 && systemdVersion(cm) >= cpuIdleSupportedVersion {
 		properties = append(properties,
-			newProp("CPUWeight", r.CpuWeight))
+			newProp("CPUWeight", uint64(0)))
+		idleSet = true
+	}
+	if r.CpuWeight != 0 {
+		if idleSet {
+			// Ignore CpuWeight if CPUIdle is already set.
+			logrus.Warn("unable to apply both CPUWeight and CpuIdle to systemd, ignoring CPUWeight")
+		} else {
+			properties = append(properties,
+				newProp("CPUWeight", r.CpuWeight))
+		}
 	}

 	addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod)
@@ -237,7 +281,7 @@ func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]syst
 	return properties, nil
 }

-func (m *unifiedManager) Apply(pid int) error {
+func (m *UnifiedManager) Apply(pid int) error {
 	var (
 		c          = m.cgroups
 		unitName   = getUnitName(c)
@@ -340,7 +384,7 @@ func cgroupFilesToChown() ([]string, error) {
 	return filesToChown, nil
 }

-func (m *unifiedManager) Destroy() error {
+func (m *UnifiedManager) Destroy() error {
 	m.mu.Lock()
 	defer m.mu.Unlock()

@@ -359,13 +403,13 @@ func (m *unifiedManager) Destroy() error {
 	return nil
 }

-func (m *unifiedManager) Path(_ string) string {
+func (m *UnifiedManager) Path(_ string) string {
 	return m.path
 }

 // getSliceFull value is used in initPath.
 // The value is incompatible with systemdDbus.PropSlice.
-func (m *unifiedManager) getSliceFull() (string, error) {
+func (m *UnifiedManager) getSliceFull() (string, error) {
 	c := m.cgroups
 	slice := "system.slice"
 	if c.Rootless {
@@ -393,7 +437,7 @@ func (m *unifiedManager) getSliceFull() (string, error) {
 	return slice, nil
 }

-func (m *unifiedManager) initPath() error {
+func (m *UnifiedManager) initPath() error {
 	if m.path != "" {
 		return nil
 	}
@@ -417,27 +461,27 @@ func (m *unifiedManager) initPath() error {
 	return nil
 }

-func (m *unifiedManager) Freeze(state configs.FreezerState) error {
+func (m *UnifiedManager) Freeze(state configs.FreezerState) error {
 	return m.fsMgr.Freeze(state)
 }

-func (m *unifiedManager) GetPids() ([]int, error) {
+func (m *UnifiedManager) GetPids() ([]int, error) {
 	return cgroups.GetPids(m.path)
 }

-func (m *unifiedManager) GetAllPids() ([]int, error) {
+func (m *UnifiedManager) GetAllPids() ([]int, error) {
 	return cgroups.GetAllPids(m.path)
 }

-func (m *unifiedManager) GetStats() (*cgroups.Stats, error) {
+func (m *UnifiedManager) GetStats() (*cgroups.Stats, error) {
 	return m.fsMgr.GetStats()
 }

-func (m *unifiedManager) Set(r *configs.Resources) error {
+func (m *UnifiedManager) Set(r *configs.Resources) error {
 	if r == nil {
 		return nil
 	}
-	properties, err := genV2ResourcesProperties(r, m.dbus)
+	properties, err := genV2ResourcesProperties(m.fsMgr.Path(""), r, m.dbus)
 	if err != nil {
 		return err
 	}
@@ -449,24 +493,24 @@ func (m *unifiedManager) Set(r *configs.Resources) error {
 	return m.fsMgr.Set(r)
 }

-func (m *unifiedManager) GetPaths() map[string]string {
+func (m *UnifiedManager) GetPaths() map[string]string {
 	paths := make(map[string]string, 1)
 	paths[""] = m.path
 	return paths
 }

-func (m *unifiedManager) GetCgroups() (*configs.Cgroup, error) {
+func (m *UnifiedManager) GetCgroups() (*configs.Cgroup, error) {
 	return m.cgroups, nil
 }

-func (m *unifiedManager) GetFreezerState() (configs.FreezerState, error) {
+func (m *UnifiedManager) GetFreezerState() (configs.FreezerState, error) {
 	return m.fsMgr.GetFreezerState()
 }

-func (m *unifiedManager) Exists() bool {
+func (m *UnifiedManager) Exists() bool {
 	return cgroups.PathExists(m.path)
 }

-func (m *unifiedManager) OOMKillCount() (uint64, error) {
+func (m *UnifiedManager) OOMKillCount() (uint64, error) {
 	return m.fsMgr.OOMKillCount()
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go
@@ -12,7 +12,7 @@ import (
 	"sync"
 	"time"

-	"github.com/opencontainers/runc/libcontainer/userns"
+	"github.com/moby/sys/userns"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
@@ -36,13 +36,13 @@ func IsCgroup2UnifiedMode() bool {
 		var st unix.Statfs_t
 		err := unix.Statfs(unifiedMountpoint, &st)
 		if err != nil {
+			level := logrus.WarnLevel
 			if os.IsNotExist(err) && userns.RunningInUserNS() {
-				// ignore the "not found" error if running in userns
-				logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
-				isUnified = false
-				return
+				// For rootless containers, sweep it under the rug.
+				level = logrus.DebugLevel
 			}
-			panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
+			logrus.StandardLogger().Logf(level,
+				"statfs %s: %v; assuming cgroup v1", unifiedMountpoint, err)
 		}
 		isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
 	})
@@ -136,18 +136,18 @@ func GetAllSubsystems() ([]string, error) {
 	return subsystems, nil
 }

-func readProcsFile(dir string) ([]int, error) {
-	f, err := OpenFile(dir, CgroupProcesses, os.O_RDONLY)
+func readProcsFile(dir string) (out []int, _ error) {
+	file := CgroupProcesses
+	retry := true
+
+again:
+	f, err := OpenFile(dir, file, os.O_RDONLY)
 	if err != nil {
 		return nil, err
 	}
 	defer f.Close()

-	var (
-		s   = bufio.NewScanner(f)
-		out = []int{}
-	)
-
+	s := bufio.NewScanner(f)
 	for s.Scan() {
 		if t := s.Text(); t != "" {
 			pid, err := strconv.Atoi(t)
@@ -157,6 +157,13 @@ func readProcsFile(dir string) ([]int, error) {
 			out = append(out, pid)
 		}
 	}
+	if errors.Is(s.Err(), unix.ENOTSUP) && retry {
+		// For a threaded cgroup, read returns ENOTSUP, and we should
+		// read from cgroup.threads instead.
+		file = "cgroup.threads"
+		retry = false
+		goto again
+	}
 	return out, s.Err()
 }

@@ -217,21 +224,26 @@ func PathExists(path string) bool {
 	return true
 }

-func EnterPid(cgroupPaths map[string]string, pid int) error {
-	for _, path := range cgroupPaths {
-		if PathExists(path) {
-			if err := WriteCgroupProc(path, pid); err != nil {
-				return err
-			}
-		}
-	}
-	return nil
-}
+// rmdir tries to remove a directory, optionally retrying on EBUSY.
+func rmdir(path string, retry bool) error {
+	delay := time.Millisecond
+	tries := 10

-func rmdir(path string) error {
+again:
 	err := unix.Rmdir(path)
-	if err == nil || err == unix.ENOENT { //nolint:errorlint // unix errors are bare
+	switch err { // nolint:errorlint // unix errors are bare
+	case nil, unix.ENOENT:
 		return nil
+	case unix.EINTR:
+		goto again
+	case unix.EBUSY:
+		if retry && tries > 0 {
+			time.Sleep(delay)
+			delay *= 2
+			tries--
+			goto again
+
+		}
 	}
 	return &os.PathError{Op: "rmdir", Path: path, Err: err}
 }
@@ -239,68 +251,40 @@ func rmdir(path string) error {
 // RemovePath aims to remove cgroup path. It does so recursively,
 // by removing any subdirectories (sub-cgroups) first.
 func RemovePath(path string) error {
-	// try the fast path first
-	if err := rmdir(path); err == nil {
+	// Try the fast path first.
+	if err := rmdir(path, false); err == nil {
 		return nil
 	}

 	infos, err := os.ReadDir(path)
-	if err != nil {
-		if os.IsNotExist(err) {
-			err = nil
-		}
+	if err != nil && !os.IsNotExist(err) {
 		return err
 	}
 	for _, info := range infos {
 		if info.IsDir() {
-			// We should remove subcgroups dir first
+			// We should remove subcgroup first.
 			if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
 				break
 			}
 		}
 	}
 	if err == nil {
-		err = rmdir(path)
+		err = rmdir(path, true)
 	}
 	return err
 }

 // RemovePaths iterates over the provided paths removing them.
-// We trying to remove all paths five times with increasing delay between tries.
-// If after all there are not removed cgroups - appropriate error will be
-// returned.
 func RemovePaths(paths map[string]string) (err error) {
-	const retries = 5
-	delay := 10 * time.Millisecond
-	for i := 0; i < retries; i++ {
-		if i != 0 {
-			time.Sleep(delay)
-			delay *= 2
-		}
-		for s, p := range paths {
-			if err := RemovePath(p); err != nil {
-				// do not log intermediate iterations
-				switch i {
-				case 0:
-					logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
-				case retries - 1:
-					logrus.WithError(err).Error("Failed to remove cgroup")
-				}
-			}
-			_, err := os.Stat(p)
-			// We need this strange way of checking cgroups existence because
-			// RemoveAll almost always returns error, even on already removed
-			// cgroups
-			if os.IsNotExist(err) {
-				delete(paths, s)
-			}
-		}
-		if len(paths) == 0 {
-			//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
-			paths = make(map[string]string)
-			return nil
+	for s, p := range paths {
+		if err := RemovePath(p); err == nil {
+			delete(paths, s)
 		}
 	}
+	if len(paths) == 0 {
+		clear(paths)
+		return nil
+	}
 	return fmt.Errorf("Failed to remove paths: %v", paths)
 }

@@ -431,26 +415,29 @@ func ConvertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 {

 // ConvertMemorySwapToCgroupV2Value converts MemorySwap value from OCI spec
 // for use by cgroup v2 drivers. A conversion is needed since Resources.MemorySwap
-// is defined as memory+swap combined, while in cgroup v2 swap is a separate value.
+// is defined as memory+swap combined, while in cgroup v2 swap is a separate value,
+// so we need to subtract memory from it where it makes sense.
 func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
-	// for compatibility with cgroup1 controller, set swap to unlimited in
-	// case the memory is set to unlimited, and swap is not explicitly set,
-	// treating the request as "set both memory and swap to unlimited".
-	if memory == -1 && memorySwap == 0 {
+	switch {
+	case memory == -1 && memorySwap == 0:
+		// For compatibility with cgroup1 controller, set swap to unlimited in
+		// case the memory is set to unlimited and the swap is not explicitly set,
+		// treating the request as "set both memory and swap to unlimited".
 		return -1, nil
-	}
-	if memorySwap == -1 || memorySwap == 0 {
-		// -1 is "max", 0 is "unset", so treat as is
+	case memorySwap == -1, memorySwap == 0:
+		// Treat -1 ("max") and 0 ("unset") swap as is.
 		return memorySwap, nil
-	}
-	// sanity checks
-	if memory == 0 || memory == -1 {
+	case memory == -1:
+		// Unlimited memory, so treat swap as is.
+		return memorySwap, nil
+	case memory == 0:
+		// Unset or unknown memory, can't calculate swap.
 		return 0, errors.New("unable to set swap limit without memory limit")
-	}
-	if memory < 0 {
+	case memory < 0:
+		// Does not make sense to subtract a negative value.
 		return 0, fmt.Errorf("invalid memory value: %d", memory)
-	}
-	if memorySwap < memory {
+	case memorySwap < memory:
+		// Sanity check.
 		return 0, errors.New("memory+swap limit should be >= memory limit")
 	}

--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go
@@ -99,11 +99,12 @@ func tryDefaultPath(cgroupPath, subsystem string) string {
 // expensive), so it is assumed that cgroup mounts are not being changed.
 func readCgroupMountinfo() ([]*mountinfo.Info, error) {
 	readMountinfoOnce.Do(func() {
+		// mountinfo.GetMounts uses /proc/thread-self, so we can use it without
+		// issues.
 		cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts(
 			mountinfo.FSTypeFilter("cgroup"),
 		)
 	})
-
 	return cgroupMountinfo, readMountinfoErr
 }

@@ -196,6 +197,9 @@ func getCgroupMountsV1(all bool) ([]Mount, error) {
 		return nil, err
 	}

+	// We don't need to use /proc/thread-self here because runc always runs
+	// with every thread in the same cgroup. This lets us avoid having to do
+	// runtime.LockOSThread.
 	allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
 	if err != nil {
 		return nil, err
@@ -214,6 +218,10 @@ func GetOwnCgroup(subsystem string) (string, error) {
 	if IsCgroup2UnifiedMode() {
 		return "", errUnified
 	}
+
+	// We don't need to use /proc/thread-self here because runc always runs
+	// with every thread in the same cgroup. This lets us avoid having to do
+	// runtime.LockOSThread.
 	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
 	if err != nil {
 		return "", err
@@ -236,27 +244,6 @@ func GetOwnCgroupPath(subsystem string) (string, error) {
 	return getCgroupPathHelper(subsystem, cgroup)
 }

-func GetInitCgroup(subsystem string) (string, error) {
-	if IsCgroup2UnifiedMode() {
-		return "", errUnified
-	}
-	cgroups, err := ParseCgroupFile("/proc/1/cgroup")
-	if err != nil {
-		return "", err
-	}
-
-	return getControllerPath(subsystem, cgroups)
-}
-
-func GetInitCgroupPath(subsystem string) (string, error) {
-	cgroup, err := GetInitCgroup(subsystem)
-	if err != nil {
-		return "", err
-	}
-
-	return getCgroupPathHelper(subsystem, cgroup)
-}
-
 func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
 	mnt, root, err := FindCgroupMountpointAndRoot("", subsystem)
 	if err != nil {
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go
@@ -2,8 +2,8 @@ package configs

 import "fmt"

-// blockIODevice holds major:minor format supported in blkio cgroup
-type blockIODevice struct {
+// BlockIODevice holds major:minor format supported in blkio cgroup.
+type BlockIODevice struct {
 	// Major is the device's major number
 	Major int64 `json:"major"`
 	// Minor is the device's minor number
@@ -12,7 +12,7 @@ type blockIODevice struct {

 // WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair
 type WeightDevice struct {
-	blockIODevice
+	BlockIODevice
 	// Weight is the bandwidth rate for the device, range is from 10 to 1000
 	Weight uint16 `json:"weight"`
 	// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
@@ -41,7 +41,7 @@ func (wd *WeightDevice) LeafWeightString() string {

 // ThrottleDevice struct holds a `major:minor rate_per_second` pair
 type ThrottleDevice struct {
-	blockIODevice
+	BlockIODevice
 	// Rate is the IO rate limit per cgroup per device
 	Rate uint64 `json:"rate"`
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go
@@ -69,6 +69,9 @@ type Resources struct {
 	// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
 	CpuQuota int64 `json:"cpu_quota"`

+	// CPU hardcap burst limit (in usecs). Allowed accumulated cpu time additionally for burst in a given period.
+	CpuBurst *uint64 `json:"cpu_burst"` //nolint:revive
+
 	// CPU period to be used for hardcapping (in usecs). 0 to use system default.
 	CpuPeriod uint64 `json:"cpu_period"`

@@ -84,6 +87,9 @@ type Resources struct {
 	// MEM to use
 	CpusetMems string `json:"cpuset_mems"`

+	// cgroup SCHED_IDLE
+	CPUIdle *int64 `json:"cpu_idle,omitempty"`
+
 	// Process limit; set <= `0' to disable limit.
 	PidsLimit int64 `json:"pids_limit"`

@@ -155,4 +161,9 @@ type Resources struct {
 	// during Set() to figure out whether the freeze is required. Those
 	// methods may be relatively slow, thus this flag.
 	SkipFreezeOnSet bool `json:"-"`
+
+	// MemoryCheckBeforeUpdate is a flag for cgroup v2 managers to check
+	// if the new memory limits (Memory and MemorySwap) being set are lower
+	// than the current memory usage, and reject if so.
+	MemoryCheckBeforeUpdate bool `json:"memory_check_before_update"`
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go
@@ -1,5 +1,4 @@
 //go:build !linux
-// +build !linux

 package configs

--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go
@@ -8,6 +8,7 @@ import (
 	"time"

 	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"

 	"github.com/opencontainers/runc/libcontainer/devices"
 	"github.com/opencontainers/runtime-spec/specs-go"
@@ -31,12 +32,13 @@ type IDMap struct {
 // for syscalls. Additional architectures can be added by specifying them in
 // Architectures.
 type Seccomp struct {
-	DefaultAction    Action     `json:"default_action"`
-	Architectures    []string   `json:"architectures"`
-	Syscalls         []*Syscall `json:"syscalls"`
-	DefaultErrnoRet  *uint      `json:"default_errno_ret"`
-	ListenerPath     string     `json:"listener_path,omitempty"`
-	ListenerMetadata string     `json:"listener_metadata,omitempty"`
+	DefaultAction    Action                   `json:"default_action"`
+	Architectures    []string                 `json:"architectures"`
+	Flags            []specs.LinuxSeccompFlag `json:"flags"`
+	Syscalls         []*Syscall               `json:"syscalls"`
+	DefaultErrnoRet  *uint                    `json:"default_errno_ret"`
+	ListenerPath     string                   `json:"listener_path,omitempty"`
+	ListenerMetadata string                   `json:"listener_metadata,omitempty"`
 }

 // Action is taken upon rule match in Seccomp
@@ -83,9 +85,6 @@ type Syscall struct {
 	Args     []*Arg `json:"args"`
 }

-// TODO Windows. Many of these fields should be factored out into those parts
-// which are common across platforms, and those which are platform specific.
-
 // Config defines configuration options for executing a process inside a contained environment.
 type Config struct {
 	// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs
@@ -121,6 +120,9 @@ type Config struct {
 	// Hostname optionally sets the container's hostname if provided
 	Hostname string `json:"hostname"`

+	// Domainname optionally sets the container's domainname if provided
+	Domainname string `json:"domainname"`
+
 	// Namespaces specifies the container's namespaces that it should setup when cloning the init process
 	// If a namespace is not provided that namespace is shared from the container's parent process
 	Namespaces Namespaces `json:"namespaces"`
@@ -158,11 +160,11 @@ type Config struct {
 	// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
 	OomScoreAdj *int `json:"oom_score_adj,omitempty"`

-	// UidMappings is an array of User ID mappings for User Namespaces
-	UidMappings []IDMap `json:"uid_mappings"`
+	// UIDMappings is an array of User ID mappings for User Namespaces
+	UIDMappings []IDMap `json:"uid_mappings"`

-	// GidMappings is an array of Group ID mappings for User Namespaces
-	GidMappings []IDMap `json:"gid_mappings"`
+	// GIDMappings is an array of Group ID mappings for User Namespaces
+	GIDMappings []IDMap `json:"gid_mappings"`

 	// MaskPaths specifies paths within the container's rootfs to mask over with a bind
 	// mount pointing to /dev/null as to prevent reads of the file.
@@ -211,8 +213,87 @@ type Config struct {
 	// RootlessCgroups is set when unlikely to have the full access to cgroups.
 	// When RootlessCgroups is set, cgroups errors are ignored.
 	RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
+
+	// TimeOffsets specifies the offset for supporting time namespaces.
+	TimeOffsets map[string]specs.LinuxTimeOffset `json:"time_offsets,omitempty"`
+
+	// Scheduler represents the scheduling attributes for a process.
+	Scheduler *Scheduler `json:"scheduler,omitempty"`
+
+	// Personality contains configuration for the Linux personality syscall.
+	Personality *LinuxPersonality `json:"personality,omitempty"`
+
+	// IOPriority is the container's I/O priority.
+	IOPriority *IOPriority `json:"io_priority,omitempty"`
 }

+// Scheduler is based on the Linux sched_setattr(2) syscall.
+type Scheduler = specs.Scheduler
+
+// ToSchedAttr is to convert *configs.Scheduler to *unix.SchedAttr.
+func ToSchedAttr(scheduler *Scheduler) (*unix.SchedAttr, error) {
+	var policy uint32
+	switch scheduler.Policy {
+	case specs.SchedOther:
+		policy = 0
+	case specs.SchedFIFO:
+		policy = 1
+	case specs.SchedRR:
+		policy = 2
+	case specs.SchedBatch:
+		policy = 3
+	case specs.SchedISO:
+		policy = 4
+	case specs.SchedIdle:
+		policy = 5
+	case specs.SchedDeadline:
+		policy = 6
+	default:
+		return nil, fmt.Errorf("invalid scheduler policy: %s", scheduler.Policy)
+	}
+
+	var flags uint64
+	for _, flag := range scheduler.Flags {
+		switch flag {
+		case specs.SchedFlagResetOnFork:
+			flags |= 0x01
+		case specs.SchedFlagReclaim:
+			flags |= 0x02
+		case specs.SchedFlagDLOverrun:
+			flags |= 0x04
+		case specs.SchedFlagKeepPolicy:
+			flags |= 0x08
+		case specs.SchedFlagKeepParams:
+			flags |= 0x10
+		case specs.SchedFlagUtilClampMin:
+			flags |= 0x20
+		case specs.SchedFlagUtilClampMax:
+			flags |= 0x40
+		default:
+			return nil, fmt.Errorf("invalid scheduler flag: %s", flag)
+		}
+	}
+
+	return &unix.SchedAttr{
+		Size:     unix.SizeofSchedAttr,
+		Policy:   policy,
+		Flags:    flags,
+		Nice:     scheduler.Nice,
+		Priority: uint32(scheduler.Priority),
+		Runtime:  scheduler.Runtime,
+		Deadline: scheduler.Deadline,
+		Period:   scheduler.Period,
+	}, nil
+}
+
+var IOPrioClassMapping = map[specs.IOPriorityClass]int{
+	specs.IOPRIO_CLASS_RT:   1,
+	specs.IOPRIO_CLASS_BE:   2,
+	specs.IOPRIO_CLASS_IDLE: 3,
+}
+
+type IOPriority = specs.LinuxIOPriority
+
 type (
 	HookName string
 	HookList []Hook
@@ -277,6 +358,7 @@ type Capabilities struct {
 	Ambient []string
 }

+// Deprecated: use (Hooks).Run instead.
 func (hooks HookList) RunHooks(state *specs.State) error {
 	for i, h := range hooks {
 		if err := h.Run(state); err != nil {
@@ -333,6 +415,18 @@ func (hooks *Hooks) MarshalJSON() ([]byte, error) {
 	})
 }

+// Run executes all hooks for the given hook name.
+func (hooks Hooks) Run(name HookName, state *specs.State) error {
+	list := hooks[name]
+	for i, h := range list {
+		if err := h.Run(state); err != nil {
+			return fmt.Errorf("error running %s hook #%d: %w", name, i, err)
+		}
+	}
+
+	return nil
+}
+
 type Hook interface {
 	// Run executes the hook with the provided state.
 	Run(*specs.State) error
@@ -393,7 +487,7 @@ func (c Command) Run(s *specs.State) error {
 	go func() {
 		err := cmd.Wait()
 		if err != nil {
-			err = fmt.Errorf("error running hook: %w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
+			err = fmt.Errorf("%w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
 		}
 		errC <- err
 	}()
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go
@@ -7,22 +7,33 @@ import (
 )

 var (
-	errNoUIDMap   = errors.New("User namespaces enabled, but no uid mappings found.")
-	errNoUserMap  = errors.New("User namespaces enabled, but no user mapping found.")
-	errNoGIDMap   = errors.New("User namespaces enabled, but no gid mappings found.")
-	errNoGroupMap = errors.New("User namespaces enabled, but no group mapping found.")
+	errNoUIDMap = errors.New("user namespaces enabled, but no uid mappings found")
+	errNoGIDMap = errors.New("user namespaces enabled, but no gid mappings found")
 )

+// Please check https://man7.org/linux/man-pages/man2/personality.2.html for const details.
+// https://raw.githubusercontent.com/torvalds/linux/master/include/uapi/linux/personality.h
+const (
+	PerLinux   = 0x0000
+	PerLinux32 = 0x0008
+)
+
+type LinuxPersonality struct {
+	// Domain for the personality
+	// can only contain values "LINUX" and "LINUX32"
+	Domain int `json:"domain"`
+}
+
 // HostUID gets the translated uid for the process on host which could be
 // different when user namespaces are enabled.
 func (c Config) HostUID(containerId int) (int, error) {
 	if c.Namespaces.Contains(NEWUSER) {
-		if c.UidMappings == nil {
+		if len(c.UIDMappings) == 0 {
 			return -1, errNoUIDMap
 		}
-		id, found := c.hostIDFromMapping(int64(containerId), c.UidMappings)
+		id, found := c.hostIDFromMapping(int64(containerId), c.UIDMappings)
 		if !found {
-			return -1, errNoUserMap
+			return -1, fmt.Errorf("user namespaces enabled, but no mapping found for uid %d", containerId)
 		}
 		// If we are a 32-bit binary running on a 64-bit system, it's possible
 		// the mapped user is too large to store in an int, which means we
@@ -47,12 +58,12 @@ func (c Config) HostRootUID() (int, error) {
 // different when user namespaces are enabled.
 func (c Config) HostGID(containerId int) (int, error) {
 	if c.Namespaces.Contains(NEWUSER) {
-		if c.GidMappings == nil {
+		if len(c.GIDMappings) == 0 {
 			return -1, errNoGIDMap
 		}
-		id, found := c.hostIDFromMapping(int64(containerId), c.GidMappings)
+		id, found := c.hostIDFromMapping(int64(containerId), c.GIDMappings)
 		if !found {
-			return -1, errNoGroupMap
+			return -1, fmt.Errorf("user namespaces enabled, but no mapping found for gid %d", containerId)
 		}
 		// If we are a 32-bit binary running on a 64-bit system, it's possible
 		// the mapped user is too large to store in an int, which means we
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go
@@ -1,5 +1,4 @@
 //go:build gofuzz
-// +build gofuzz

 package configs

--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go
@@ -1,48 +1,7 @@
 package configs

-import "golang.org/x/sys/unix"
-
 const (
 	// EXT_COPYUP is a directive to copy up the contents of a directory when
 	// a tmpfs is mounted over it.
-	EXT_COPYUP = 1 << iota //nolint:golint // ignore "don't use ALL_CAPS" warning
+	EXT_COPYUP = 1 << iota //nolint:golint,revive // ignore "don't use ALL_CAPS" warning
 )
-
-type Mount struct {
-	// Source path for the mount.
-	Source string `json:"source"`
-
-	// Destination path for the mount inside the container.
-	Destination string `json:"destination"`
-
-	// Device the mount is for.
-	Device string `json:"device"`
-
-	// Mount flags.
-	Flags int `json:"flags"`
-
-	// Propagation Flags
-	PropagationFlags []int `json:"propagation_flags"`
-
-	// Mount data applied to the mount.
-	Data string `json:"data"`
-
-	// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
-	Relabel string `json:"relabel"`
-
-	// RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2).
-	RecAttr *unix.MountAttr `json:"rec_attr"`
-
-	// Extensions are additional flags that are specific to runc.
-	Extensions int `json:"extensions"`
-
-	// Optional Command to be run before Source is mounted.
-	PremountCmds []Command `json:"premount_cmds"`
-
-	// Optional Command to be run after Source is mounted.
-	PostmountCmds []Command `json:"postmount_cmds"`
-}
-
-func (m *Mount) IsBind() bool {
-	return m.Flags&unix.MS_BIND != 0
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount_linux.go
@@ -0,0 +1,66 @@
+package configs
+
+import "golang.org/x/sys/unix"
+
+type MountIDMapping struct {
+	// Recursive indicates if the mapping needs to be recursive.
+	Recursive bool `json:"recursive"`
+
+	// UserNSPath is a path to a user namespace that indicates the necessary
+	// id-mappings for MOUNT_ATTR_IDMAP. If set to non-"", UIDMappings and
+	// GIDMappings must be set to nil.
+	UserNSPath string `json:"userns_path,omitempty"`
+
+	// UIDMappings is the uid mapping set for this mount, to be used with
+	// MOUNT_ATTR_IDMAP.
+	UIDMappings []IDMap `json:"uid_mappings,omitempty"`
+
+	// GIDMappings is the gid mapping set for this mount, to be used with
+	// MOUNT_ATTR_IDMAP.
+	GIDMappings []IDMap `json:"gid_mappings,omitempty"`
+}
+
+type Mount struct {
+	// Source path for the mount.
+	Source string `json:"source"`
+
+	// Destination path for the mount inside the container.
+	Destination string `json:"destination"`
+
+	// Device the mount is for.
+	Device string `json:"device"`
+
+	// Mount flags.
+	Flags int `json:"flags"`
+
+	// Mount flags that were explicitly cleared in the configuration (meaning
+	// the user explicitly requested that these flags *not* be set).
+	ClearedFlags int `json:"cleared_flags"`
+
+	// Propagation Flags
+	PropagationFlags []int `json:"propagation_flags"`
+
+	// Mount data applied to the mount.
+	Data string `json:"data"`
+
+	// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
+	Relabel string `json:"relabel"`
+
+	// RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2).
+	RecAttr *unix.MountAttr `json:"rec_attr"`
+
+	// Extensions are additional flags that are specific to runc.
+	Extensions int `json:"extensions"`
+
+	// Mapping is the MOUNT_ATTR_IDMAP configuration for the mount. If non-nil,
+	// the mount is configured to use MOUNT_ATTR_IDMAP-style id mappings.
+	IDMapping *MountIDMapping `json:"id_mapping,omitempty"`
+}
+
+func (m *Mount) IsBind() bool {
+	return m.Flags&unix.MS_BIND != 0
+}
+
+func (m *Mount) IsIDMapped() bool {
+	return m.IDMapping != nil
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount_unsupported.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount_unsupported.go
@@ -0,0 +1,9 @@
+//go:build !linux
+
+package configs
+
+type Mount struct{}
+
+func (m *Mount) IsBind() bool {
+	return false
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go
@@ -14,6 +14,7 @@ const (
 	NEWIPC    NamespaceType = "NEWIPC"
 	NEWUSER   NamespaceType = "NEWUSER"
 	NEWCGROUP NamespaceType = "NEWCGROUP"
+	NEWTIME   NamespaceType = "NEWTIME"
 )

 var (
@@ -38,6 +39,8 @@ func NsName(ns NamespaceType) string {
 		return "uts"
 	case NEWCGROUP:
 		return "cgroup"
+	case NEWTIME:
+		return "time"
 	}
 	return ""
 }
@@ -56,6 +59,9 @@ func IsNamespaceSupported(ns NamespaceType) bool {
 	if nsFile == "" {
 		return false
 	}
+	// We don't need to use /proc/thread-self here because the list of
+	// namespace types is unrelated to the thread. This lets us avoid having to
+	// do runtime.LockOSThread.
 	_, err := os.Stat("/proc/self/ns/" + nsFile)
 	// a namespace is supported if it exists and we have permissions to read it
 	supported = err == nil
@@ -72,6 +78,7 @@ func NamespaceTypes() []NamespaceType {
 		NEWPID,
 		NEWNS,
 		NEWCGROUP,
+		NEWTIME,
 	}
 }

--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go
@@ -1,5 +1,4 @@
 //go:build linux
-// +build linux

 package configs

@@ -17,6 +16,7 @@ var namespaceInfo = map[NamespaceType]int{
 	NEWUTS:    unix.CLONE_NEWUTS,
 	NEWPID:    unix.CLONE_NEWPID,
 	NEWCGROUP: unix.CLONE_NEWCGROUP,
+	NEWTIME:   unix.CLONE_NEWTIME,
 }

 // CloneFlags parses the container's Namespaces options to set the correct
@@ -31,3 +31,15 @@ func (n *Namespaces) CloneFlags() uintptr {
 	}
 	return uintptr(flag)
 }
+
+// IsPrivate tells whether the namespace of type t is configured as private
+// (i.e. it exists and is not shared).
+func (n Namespaces) IsPrivate(t NamespaceType) bool {
+	for _, v := range n {
+		if v.Type == t {
+			return v.Path == ""
+		}
+	}
+	// Not found, so implicitly sharing a parent namespace.
+	return false
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go
@@ -1,5 +1,4 @@
 //go:build !linux && !windows
-// +build !linux,!windows

 package configs

--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go
@@ -1,5 +1,4 @@
 //go:build !linux
-// +build !linux

 package configs

--- a/vendor/github.com/opencontainers/runc/libcontainer/devices/device_unix.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/devices/device_unix.go
@@ -1,5 +1,4 @@
 //go:build !windows
-// +build !windows

 package devices

--- a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go
@@ -1,145 +0,0 @@
-//go:build linux
-// +build linux
-
-package system
-
-import (
-	"os"
-	"os/exec"
-	"runtime"
-	"strings"
-	"unsafe"
-
-	"golang.org/x/sys/unix"
-)
-
-type ParentDeathSignal int
-
-func (p ParentDeathSignal) Restore() error {
-	if p == 0 {
-		return nil
-	}
-	current, err := GetParentDeathSignal()
-	if err != nil {
-		return err
-	}
-	if p == current {
-		return nil
-	}
-	return p.Set()
-}
-
-func (p ParentDeathSignal) Set() error {
-	return SetParentDeathSignal(uintptr(p))
-}
-
-// Deprecated: Execv is not used in runc anymore, it will be removed in v1.2.0.
-func Execv(cmd string, args []string, env []string) error {
-	name, err := exec.LookPath(cmd)
-	if err != nil {
-		return err
-	}
-	return Exec(name, args, env)
-}
-
-func Exec(cmd string, args []string, env []string) error {
-	for {
-		err := unix.Exec(cmd, args, env)
-		if err != unix.EINTR { //nolint:errorlint // unix errors are bare
-			return &os.PathError{Op: "exec", Path: cmd, Err: err}
-		}
-	}
-}
-
-func SetParentDeathSignal(sig uintptr) error {
-	if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil {
-		return err
-	}
-	return nil
-}
-
-func GetParentDeathSignal() (ParentDeathSignal, error) {
-	var sig int
-	if err := unix.Prctl(unix.PR_GET_PDEATHSIG, uintptr(unsafe.Pointer(&sig)), 0, 0, 0); err != nil {
-		return -1, err
-	}
-	return ParentDeathSignal(sig), nil
-}
-
-func SetKeepCaps() error {
-	if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 1, 0, 0, 0); err != nil {
-		return err
-	}
-
-	return nil
-}
-
-func ClearKeepCaps() error {
-	if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 0, 0, 0, 0); err != nil {
-		return err
-	}
-
-	return nil
-}
-
-func Setctty() error {
-	if err := unix.IoctlSetInt(0, unix.TIOCSCTTY, 0); err != nil {
-		return err
-	}
-	return nil
-}
-
-// SetSubreaper sets the value i as the subreaper setting for the calling process
-func SetSubreaper(i int) error {
-	return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0)
-}
-
-// GetSubreaper returns the subreaper setting for the calling process
-func GetSubreaper() (int, error) {
-	var i uintptr
-
-	if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil {
-		return -1, err
-	}
-
-	return int(i), nil
-}
-
-func prepareAt(dir *os.File, path string) (int, string) {
-	if dir == nil {
-		return unix.AT_FDCWD, path
-	}
-
-	// Rather than just filepath.Join-ing path here, do it manually so the
-	// error and handle correctly indicate cases like path=".." as being
-	// relative to the correct directory. The handle.Name() might end up being
-	// wrong but because this is (currently) only used in MkdirAllInRoot, that
-	// isn't a problem.
-	dirName := dir.Name()
-	if !strings.HasSuffix(dirName, "/") {
-		dirName += "/"
-	}
-	fullPath := dirName + path
-
-	return int(dir.Fd()), fullPath
-}
-
-func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) {
-	dirFd, fullPath := prepareAt(dir, path)
-	fd, err := unix.Openat(dirFd, path, flags, mode)
-	if err != nil {
-		return nil, &os.PathError{Op: "openat", Path: fullPath, Err: err}
-	}
-	runtime.KeepAlive(dir)
-	return os.NewFile(uintptr(fd), fullPath), nil
-}
-
-func Mkdirat(dir *os.File, path string, mode uint32) error {
-	dirFd, fullPath := prepareAt(dir, path)
-	err := unix.Mkdirat(dirFd, path, mode)
-	if err != nil {
-		err = &os.PathError{Op: "mkdirat", Path: fullPath, Err: err}
-	}
-	runtime.KeepAlive(dir)
-	return err
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go
@@ -1,127 +0,0 @@
-package system
-
-import (
-	"fmt"
-	"os"
-	"path/filepath"
-	"strconv"
-	"strings"
-)
-
-// State is the status of a process.
-type State rune
-
-const ( // Only values for Linux 3.14 and later are listed here
-	Dead        State = 'X'
-	DiskSleep   State = 'D'
-	Running     State = 'R'
-	Sleeping    State = 'S'
-	Stopped     State = 'T'
-	TracingStop State = 't'
-	Zombie      State = 'Z'
-	Parked      State = 'P'
-	Idle        State = 'I'
-)
-
-// String forms of the state from proc(5)'s documentation for
-// /proc/[pid]/status' "State" field.
-func (s State) String() string {
-	switch s {
-	case Dead:
-		return "dead"
-	case DiskSleep:
-		return "disk sleep"
-	case Running:
-		return "running"
-	case Sleeping:
-		return "sleeping"
-	case Stopped:
-		return "stopped"
-	case TracingStop:
-		return "tracing stop"
-	case Zombie:
-		return "zombie"
-	case Parked:
-		return "parked"
-	case Idle:
-		return "idle" // kernel thread
-	default:
-		return fmt.Sprintf("unknown (%c)", s)
-	}
-}
-
-// Stat_t represents the information from /proc/[pid]/stat, as
-// described in proc(5) with names based on the /proc/[pid]/status
-// fields.
-type Stat_t struct {
-	// Name is the command run by the process.
-	Name string
-
-	// State is the state of the process.
-	State State
-
-	// StartTime is the number of clock ticks after system boot (since
-	// Linux 2.6).
-	StartTime uint64
-}
-
-// Stat returns a Stat_t instance for the specified process.
-func Stat(pid int) (stat Stat_t, err error) {
-	bytes, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat"))
-	if err != nil {
-		return stat, err
-	}
-	return parseStat(string(bytes))
-}
-
-func parseStat(data string) (stat Stat_t, err error) {
-	// Example:
-	// 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0
-	// The fields are space-separated, see full description in proc(5).
-	//
-	// We are only interested in:
-	//  * field 2: process name. It is the only field enclosed into
-	//    parenthesis, as it can contain spaces (and parenthesis) inside.
-	//  * field 3: process state, a single character (%c)
-	//  * field 22: process start time, a long unsigned integer (%llu).
-
-	// 1. Look for the first '(' and the last ')' first, what's in between is Name.
-	//    We expect at least 20 fields and a space after the last one.
-
-	const minAfterName = 20*2 + 1 // the min field is '0 '.
-
-	first := strings.IndexByte(data, '(')
-	if first < 0 || first+minAfterName >= len(data) {
-		return stat, fmt.Errorf("invalid stat data (no comm or too short): %q", data)
-	}
-
-	last := strings.LastIndexByte(data, ')')
-	if last <= first || last+minAfterName >= len(data) {
-		return stat, fmt.Errorf("invalid stat data (no comm or too short): %q", data)
-	}
-
-	stat.Name = data[first+1 : last]
-
-	// 2. Remove fields 1 and 2 and a space after. State is right after.
-	data = data[last+2:]
-	stat.State = State(data[0])
-
-	// 3. StartTime is field 22, data is at field 3 now, so we need to skip 19 spaces.
-	skipSpaces := 22 - 3
-	for first = 0; skipSpaces > 0 && first < len(data); first++ {
-		if data[first] == ' ' {
-			skipSpaces--
-		}
-	}
-	// Now first points to StartTime; look for space right after.
-	i := strings.IndexByte(data[first:], ' ')
-	if i < 0 {
-		return stat, fmt.Errorf("invalid stat data (too short): %q", data)
-	}
-	stat.StartTime, err = strconv.ParseUint(data[first:first+i], 10, 64)
-	if err != nil {
-		return stat, fmt.Errorf("invalid stat data (bad start time): %w", err)
-	}
-
-	return stat, nil
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux.go
@@ -1,15 +0,0 @@
-//go:build go1.23
-
-package system
-
-import (
-	"syscall"
-)
-
-// ClearRlimitNofileCache clears go runtime's nofile rlimit cache. The argument
-// is process RLIMIT_NOFILE values. Relies on go.dev/cl/588076.
-func ClearRlimitNofileCache(lim *syscall.Rlimit) {
-	// Ignore the return values since we only need to clean the cache,
-	// the limit is going to be set via unix.Prlimit elsewhere.
-	_ = syscall.Setrlimit(syscall.RLIMIT_NOFILE, lim)
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux_go122.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux_go122.go
@@ -1,27 +0,0 @@
-//go:build go1.19 && !go1.23
-
-// TODO: remove this file once go 1.22 is no longer supported.
-
-package system
-
-import (
-	"sync/atomic"
-	"syscall"
-	_ "unsafe" // Needed for go:linkname to work.
-)
-
-//go:linkname syscallOrigRlimitNofile syscall.origRlimitNofile
-var syscallOrigRlimitNofile atomic.Pointer[syscall.Rlimit]
-
-// ClearRlimitNofileCache clears go runtime's nofile rlimit cache.
-// The argument is process RLIMIT_NOFILE values.
-func ClearRlimitNofileCache(_ *syscall.Rlimit) {
-	// As reported in issue #4195, the new version of go runtime(since 1.19)
-	// will cache rlimit-nofile. Before executing execve, the rlimit-nofile
-	// of the process will be restored with the cache. In runc, this will
-	// cause the rlimit-nofile setting by the parent process for the container
-	// to become invalid. It can be solved by clearing this cache. But
-	// unfortunately, go stdlib doesn't provide such function, so we need to
-	// link to the private var `origRlimitNofile` in package syscall to hack.
-	syscallOrigRlimitNofile.Store(nil)
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_stub.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_stub.go
@@ -1,7 +0,0 @@
-//go:build !go1.19
-
-package system
-
-import "syscall"
-
-func ClearRlimitNofileCache(_ *syscall.Rlimit) {}
--- a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_32.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_32.go
@@ -1,27 +0,0 @@
-//go:build linux && (386 || arm)
-// +build linux
-// +build 386 arm
-
-package system
-
-import (
-	"golang.org/x/sys/unix"
-)
-
-// Setuid sets the uid of the calling thread to the specified uid.
-func Setuid(uid int) (err error) {
-	_, _, e1 := unix.RawSyscall(unix.SYS_SETUID32, uintptr(uid), 0, 0)
-	if e1 != 0 {
-		err = e1
-	}
-	return
-}
-
-// Setgid sets the gid of the calling thread to the specified gid.
-func Setgid(gid int) (err error) {
-	_, _, e1 := unix.RawSyscall(unix.SYS_SETGID32, uintptr(gid), 0, 0)
-	if e1 != 0 {
-		err = e1
-	}
-	return
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go
@@ -1,27 +0,0 @@
-//go:build linux && (arm64 || amd64 || mips || mipsle || mips64 || mips64le || ppc || ppc64 || ppc64le || riscv64 || s390x)
-// +build linux
-// +build arm64 amd64 mips mipsle mips64 mips64le ppc ppc64 ppc64le riscv64 s390x
-
-package system
-
-import (
-	"golang.org/x/sys/unix"
-)
-
-// Setuid sets the uid of the calling thread to the specified uid.
-func Setuid(uid int) (err error) {
-	_, _, e1 := unix.RawSyscall(unix.SYS_SETUID, uintptr(uid), 0, 0)
-	if e1 != 0 {
-		err = e1
-	}
-	return
-}
-
-// Setgid sets the gid of the calling thread to the specified gid.
-func Setgid(gid int) (err error) {
-	_, _, e1 := unix.RawSyscall(unix.SYS_SETGID, uintptr(gid), 0, 0)
-	if e1 != 0 {
-		err = e1
-	}
-	return
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go
@@ -1,157 +0,0 @@
-//go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
-
-package user
-
-import (
-	"io"
-	"os"
-	"strconv"
-
-	"golang.org/x/sys/unix"
-)
-
-// Unix-specific path to the passwd and group formatted files.
-const (
-	unixPasswdPath = "/etc/passwd"
-	unixGroupPath  = "/etc/group"
-)
-
-// LookupUser looks up a user by their username in /etc/passwd. If the user
-// cannot be found (or there is no /etc/passwd file on the filesystem), then
-// LookupUser returns an error.
-func LookupUser(username string) (User, error) {
-	return lookupUserFunc(func(u User) bool {
-		return u.Name == username
-	})
-}
-
-// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
-// be found (or there is no /etc/passwd file on the filesystem), then LookupId
-// returns an error.
-func LookupUid(uid int) (User, error) {
-	return lookupUserFunc(func(u User) bool {
-		return u.Uid == uid
-	})
-}
-
-func lookupUserFunc(filter func(u User) bool) (User, error) {
-	// Get operating system-specific passwd reader-closer.
-	passwd, err := GetPasswd()
-	if err != nil {
-		return User{}, err
-	}
-	defer passwd.Close()
-
-	// Get the users.
-	users, err := ParsePasswdFilter(passwd, filter)
-	if err != nil {
-		return User{}, err
-	}
-
-	// No user entries found.
-	if len(users) == 0 {
-		return User{}, ErrNoPasswdEntries
-	}
-
-	// Assume the first entry is the "correct" one.
-	return users[0], nil
-}
-
-// LookupGroup looks up a group by its name in /etc/group. If the group cannot
-// be found (or there is no /etc/group file on the filesystem), then LookupGroup
-// returns an error.
-func LookupGroup(groupname string) (Group, error) {
-	return lookupGroupFunc(func(g Group) bool {
-		return g.Name == groupname
-	})
-}
-
-// LookupGid looks up a group by its group id in /etc/group. If the group cannot
-// be found (or there is no /etc/group file on the filesystem), then LookupGid
-// returns an error.
-func LookupGid(gid int) (Group, error) {
-	return lookupGroupFunc(func(g Group) bool {
-		return g.Gid == gid
-	})
-}
-
-func lookupGroupFunc(filter func(g Group) bool) (Group, error) {
-	// Get operating system-specific group reader-closer.
-	group, err := GetGroup()
-	if err != nil {
-		return Group{}, err
-	}
-	defer group.Close()
-
-	// Get the users.
-	groups, err := ParseGroupFilter(group, filter)
-	if err != nil {
-		return Group{}, err
-	}
-
-	// No user entries found.
-	if len(groups) == 0 {
-		return Group{}, ErrNoGroupEntries
-	}
-
-	// Assume the first entry is the "correct" one.
-	return groups[0], nil
-}
-
-func GetPasswdPath() (string, error) {
-	return unixPasswdPath, nil
-}
-
-func GetPasswd() (io.ReadCloser, error) {
-	return os.Open(unixPasswdPath)
-}
-
-func GetGroupPath() (string, error) {
-	return unixGroupPath, nil
-}
-
-func GetGroup() (io.ReadCloser, error) {
-	return os.Open(unixGroupPath)
-}
-
-// CurrentUser looks up the current user by their user id in /etc/passwd. If the
-// user cannot be found (or there is no /etc/passwd file on the filesystem),
-// then CurrentUser returns an error.
-func CurrentUser() (User, error) {
-	return LookupUid(unix.Getuid())
-}
-
-// CurrentGroup looks up the current user's group by their primary group id's
-// entry in /etc/passwd. If the group cannot be found (or there is no
-// /etc/group file on the filesystem), then CurrentGroup returns an error.
-func CurrentGroup() (Group, error) {
-	return LookupGid(unix.Getgid())
-}
-
-func currentUserSubIDs(fileName string) ([]SubID, error) {
-	u, err := CurrentUser()
-	if err != nil {
-		return nil, err
-	}
-	filter := func(entry SubID) bool {
-		return entry.Name == u.Name || entry.Name == strconv.Itoa(u.Uid)
-	}
-	return ParseSubIDFileFilter(fileName, filter)
-}
-
-func CurrentUserSubUIDs() ([]SubID, error) {
-	return currentUserSubIDs("/etc/subuid")
-}
-
-func CurrentUserSubGIDs() ([]SubID, error) {
-	return currentUserSubIDs("/etc/subgid")
-}
-
-func CurrentProcessUIDMap() ([]IDMap, error) {
-	return ParseIDMapFile("/proc/self/uid_map")
-}
-
-func CurrentProcessGIDMap() ([]IDMap, error) {
-	return ParseIDMapFile("/proc/self/gid_map")
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go
@@ -1,604 +0,0 @@
-package user
-
-import (
-	"bufio"
-	"bytes"
-	"errors"
-	"fmt"
-	"io"
-	"os"
-	"strconv"
-	"strings"
-)
-
-const (
-	minID = 0
-	maxID = 1<<31 - 1 // for 32-bit systems compatibility
-)
-
-var (
-	// ErrNoPasswdEntries is returned if no matching entries were found in /etc/group.
-	ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
-	// ErrNoGroupEntries is returned if no matching entries were found in /etc/passwd.
-	ErrNoGroupEntries = errors.New("no matching entries in group file")
-	// ErrRange is returned if a UID or GID is outside of the valid range.
-	ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minID, maxID)
-)
-
-type User struct {
-	Name  string
-	Pass  string
-	Uid   int
-	Gid   int
-	Gecos string
-	Home  string
-	Shell string
-}
-
-type Group struct {
-	Name string
-	Pass string
-	Gid  int
-	List []string
-}
-
-// SubID represents an entry in /etc/sub{u,g}id
-type SubID struct {
-	Name  string
-	SubID int64
-	Count int64
-}
-
-// IDMap represents an entry in /proc/PID/{u,g}id_map
-type IDMap struct {
-	ID       int64
-	ParentID int64
-	Count    int64
-}
-
-func parseLine(line []byte, v ...interface{}) {
-	parseParts(bytes.Split(line, []byte(":")), v...)
-}
-
-func parseParts(parts [][]byte, v ...interface{}) {
-	if len(parts) == 0 {
-		return
-	}
-
-	for i, p := range parts {
-		// Ignore cases where we don't have enough fields to populate the arguments.
-		// Some configuration files like to misbehave.
-		if len(v) <= i {
-			break
-		}
-
-		// Use the type of the argument to figure out how to parse it, scanf() style.
-		// This is legit.
-		switch e := v[i].(type) {
-		case *string:
-			*e = string(p)
-		case *int:
-			// "numbers", with conversion errors ignored because of some misbehaving configuration files.
-			*e, _ = strconv.Atoi(string(p))
-		case *int64:
-			*e, _ = strconv.ParseInt(string(p), 10, 64)
-		case *[]string:
-			// Comma-separated lists.
-			if len(p) != 0 {
-				*e = strings.Split(string(p), ",")
-			} else {
-				*e = []string{}
-			}
-		default:
-			// Someone goof'd when writing code using this function. Scream so they can hear us.
-			panic(fmt.Sprintf("parseLine only accepts {*string, *int, *int64, *[]string} as arguments! %#v is not a pointer!", e))
-		}
-	}
-}
-
-func ParsePasswdFile(path string) ([]User, error) {
-	passwd, err := os.Open(path)
-	if err != nil {
-		return nil, err
-	}
-	defer passwd.Close()
-	return ParsePasswd(passwd)
-}
-
-func ParsePasswd(passwd io.Reader) ([]User, error) {
-	return ParsePasswdFilter(passwd, nil)
-}
-
-func ParsePasswdFileFilter(path string, filter func(User) bool) ([]User, error) {
-	passwd, err := os.Open(path)
-	if err != nil {
-		return nil, err
-	}
-	defer passwd.Close()
-	return ParsePasswdFilter(passwd, filter)
-}
-
-func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) {
-	if r == nil {
-		return nil, errors.New("nil source for passwd-formatted data")
-	}
-
-	var (
-		s   = bufio.NewScanner(r)
-		out = []User{}
-	)
-
-	for s.Scan() {
-		line := bytes.TrimSpace(s.Bytes())
-		if len(line) == 0 {
-			continue
-		}
-
-		// see: man 5 passwd
-		//  name:password:UID:GID:GECOS:directory:shell
-		// Name:Pass:Uid:Gid:Gecos:Home:Shell
-		//  root:x:0:0:root:/root:/bin/bash
-		//  adm:x:3:4:adm:/var/adm:/bin/false
-		p := User{}
-		parseLine(line, &p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell)
-
-		if filter == nil || filter(p) {
-			out = append(out, p)
-		}
-	}
-	if err := s.Err(); err != nil {
-		return nil, err
-	}
-
-	return out, nil
-}
-
-func ParseGroupFile(path string) ([]Group, error) {
-	group, err := os.Open(path)
-	if err != nil {
-		return nil, err
-	}
-
-	defer group.Close()
-	return ParseGroup(group)
-}
-
-func ParseGroup(group io.Reader) ([]Group, error) {
-	return ParseGroupFilter(group, nil)
-}
-
-func ParseGroupFileFilter(path string, filter func(Group) bool) ([]Group, error) {
-	group, err := os.Open(path)
-	if err != nil {
-		return nil, err
-	}
-	defer group.Close()
-	return ParseGroupFilter(group, filter)
-}
-
-func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) {
-	if r == nil {
-		return nil, errors.New("nil source for group-formatted data")
-	}
-	rd := bufio.NewReader(r)
-	out := []Group{}
-
-	// Read the file line-by-line.
-	for {
-		var (
-			isPrefix  bool
-			wholeLine []byte
-			err       error
-		)
-
-		// Read the next line. We do so in chunks (as much as reader's
-		// buffer is able to keep), check if we read enough columns
-		// already on each step and store final result in wholeLine.
-		for {
-			var line []byte
-			line, isPrefix, err = rd.ReadLine()
-			if err != nil {
-				// We should return no error if EOF is reached
-				// without a match.
-				if err == io.EOF {
-					err = nil
-				}
-				return out, err
-			}
-
-			// Simple common case: line is short enough to fit in a
-			// single reader's buffer.
-			if !isPrefix && len(wholeLine) == 0 {
-				wholeLine = line
-				break
-			}
-
-			wholeLine = append(wholeLine, line...)
-
-			// Check if we read the whole line already.
-			if !isPrefix {
-				break
-			}
-		}
-
-		// There's no spec for /etc/passwd or /etc/group, but we try to follow
-		// the same rules as the glibc parser, which allows comments and blank
-		// space at the beginning of a line.
-		wholeLine = bytes.TrimSpace(wholeLine)
-		if len(wholeLine) == 0 || wholeLine[0] == '#' {
-			continue
-		}
-
-		// see: man 5 group
-		//  group_name:password:GID:user_list
-		// Name:Pass:Gid:List
-		//  root:x:0:root
-		//  adm:x:4:root,adm,daemon
-		p := Group{}
-		parseLine(wholeLine, &p.Name, &p.Pass, &p.Gid, &p.List)
-
-		if filter == nil || filter(p) {
-			out = append(out, p)
-		}
-	}
-}
-
-type ExecUser struct {
-	Uid   int
-	Gid   int
-	Sgids []int
-	Home  string
-}
-
-// GetExecUserPath is a wrapper for GetExecUser. It reads data from each of the
-// given file paths and uses that data as the arguments to GetExecUser. If the
-// files cannot be opened for any reason, the error is ignored and a nil
-// io.Reader is passed instead.
-func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath string) (*ExecUser, error) {
-	var passwd, group io.Reader
-
-	if passwdFile, err := os.Open(passwdPath); err == nil {
-		passwd = passwdFile
-		defer passwdFile.Close()
-	}
-
-	if groupFile, err := os.Open(groupPath); err == nil {
-		group = groupFile
-		defer groupFile.Close()
-	}
-
-	return GetExecUser(userSpec, defaults, passwd, group)
-}
-
-// GetExecUser parses a user specification string (using the passwd and group
-// readers as sources for /etc/passwd and /etc/group data, respectively). In
-// the case of blank fields or missing data from the sources, the values in
-// defaults is used.
-//
-// GetExecUser will return an error if a user or group literal could not be
-// found in any entry in passwd and group respectively.
-//
-// Examples of valid user specifications are:
-//   - ""
-//   - "user"
-//   - "uid"
-//   - "user:group"
-//   - "uid:gid
-//   - "user:gid"
-//   - "uid:group"
-//
-// It should be noted that if you specify a numeric user or group id, they will
-// not be evaluated as usernames (only the metadata will be filled). So attempting
-// to parse a user with user.Name = "1337" will produce the user with a UID of
-// 1337.
-func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (*ExecUser, error) {
-	if defaults == nil {
-		defaults = new(ExecUser)
-	}
-
-	// Copy over defaults.
-	user := &ExecUser{
-		Uid:   defaults.Uid,
-		Gid:   defaults.Gid,
-		Sgids: defaults.Sgids,
-		Home:  defaults.Home,
-	}
-
-	// Sgids slice *cannot* be nil.
-	if user.Sgids == nil {
-		user.Sgids = []int{}
-	}
-
-	// Allow for userArg to have either "user" syntax, or optionally "user:group" syntax
-	var userArg, groupArg string
-	parseLine([]byte(userSpec), &userArg, &groupArg)
-
-	// Convert userArg and groupArg to be numeric, so we don't have to execute
-	// Atoi *twice* for each iteration over lines.
-	uidArg, uidErr := strconv.Atoi(userArg)
-	gidArg, gidErr := strconv.Atoi(groupArg)
-
-	// Find the matching user.
-	users, err := ParsePasswdFilter(passwd, func(u User) bool {
-		if userArg == "" {
-			// Default to current state of the user.
-			return u.Uid == user.Uid
-		}
-
-		if uidErr == nil {
-			// If the userArg is numeric, always treat it as a UID.
-			return uidArg == u.Uid
-		}
-
-		return u.Name == userArg
-	})
-
-	// If we can't find the user, we have to bail.
-	if err != nil && passwd != nil {
-		if userArg == "" {
-			userArg = strconv.Itoa(user.Uid)
-		}
-		return nil, fmt.Errorf("unable to find user %s: %w", userArg, err)
-	}
-
-	var matchedUserName string
-	if len(users) > 0 {
-		// First match wins, even if there's more than one matching entry.
-		matchedUserName = users[0].Name
-		user.Uid = users[0].Uid
-		user.Gid = users[0].Gid
-		user.Home = users[0].Home
-	} else if userArg != "" {
-		// If we can't find a user with the given username, the only other valid
-		// option is if it's a numeric username with no associated entry in passwd.
-
-		if uidErr != nil {
-			// Not numeric.
-			return nil, fmt.Errorf("unable to find user %s: %w", userArg, ErrNoPasswdEntries)
-		}
-		user.Uid = uidArg
-
-		// Must be inside valid uid range.
-		if user.Uid < minID || user.Uid > maxID {
-			return nil, ErrRange
-		}
-
-		// Okay, so it's numeric. We can just roll with this.
-	}
-
-	// On to the groups. If we matched a username, we need to do this because of
-	// the supplementary group IDs.
-	if groupArg != "" || matchedUserName != "" {
-		groups, err := ParseGroupFilter(group, func(g Group) bool {
-			// If the group argument isn't explicit, we'll just search for it.
-			if groupArg == "" {
-				// Check if user is a member of this group.
-				for _, u := range g.List {
-					if u == matchedUserName {
-						return true
-					}
-				}
-				return false
-			}
-
-			if gidErr == nil {
-				// If the groupArg is numeric, always treat it as a GID.
-				return gidArg == g.Gid
-			}
-
-			return g.Name == groupArg
-		})
-		if err != nil && group != nil {
-			return nil, fmt.Errorf("unable to find groups for spec %v: %w", matchedUserName, err)
-		}
-
-		// Only start modifying user.Gid if it is in explicit form.
-		if groupArg != "" {
-			if len(groups) > 0 {
-				// First match wins, even if there's more than one matching entry.
-				user.Gid = groups[0].Gid
-			} else {
-				// If we can't find a group with the given name, the only other valid
-				// option is if it's a numeric group name with no associated entry in group.
-
-				if gidErr != nil {
-					// Not numeric.
-					return nil, fmt.Errorf("unable to find group %s: %w", groupArg, ErrNoGroupEntries)
-				}
-				user.Gid = gidArg
-
-				// Must be inside valid gid range.
-				if user.Gid < minID || user.Gid > maxID {
-					return nil, ErrRange
-				}
-
-				// Okay, so it's numeric. We can just roll with this.
-			}
-		} else if len(groups) > 0 {
-			// Supplementary group ids only make sense if in the implicit form.
-			user.Sgids = make([]int, len(groups))
-			for i, group := range groups {
-				user.Sgids[i] = group.Gid
-			}
-		}
-	}
-
-	return user, nil
-}
-
-// GetAdditionalGroups looks up a list of groups by name or group id
-// against the given /etc/group formatted data. If a group name cannot
-// be found, an error will be returned. If a group id cannot be found,
-// or the given group data is nil, the id will be returned as-is
-// provided it is in the legal range.
-func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, error) {
-	groups := []Group{}
-	if group != nil {
-		var err error
-		groups, err = ParseGroupFilter(group, func(g Group) bool {
-			for _, ag := range additionalGroups {
-				if g.Name == ag || strconv.Itoa(g.Gid) == ag {
-					return true
-				}
-			}
-			return false
-		})
-		if err != nil {
-			return nil, fmt.Errorf("Unable to find additional groups %v: %w", additionalGroups, err)
-		}
-	}
-
-	gidMap := make(map[int]struct{})
-	for _, ag := range additionalGroups {
-		var found bool
-		for _, g := range groups {
-			// if we found a matched group either by name or gid, take the
-			// first matched as correct
-			if g.Name == ag || strconv.Itoa(g.Gid) == ag {
-				if _, ok := gidMap[g.Gid]; !ok {
-					gidMap[g.Gid] = struct{}{}
-					found = true
-					break
-				}
-			}
-		}
-		// we asked for a group but didn't find it. let's check to see
-		// if we wanted a numeric group
-		if !found {
-			gid, err := strconv.ParseInt(ag, 10, 64)
-			if err != nil {
-				// Not a numeric ID either.
-				return nil, fmt.Errorf("Unable to find group %s: %w", ag, ErrNoGroupEntries)
-			}
-			// Ensure gid is inside gid range.
-			if gid < minID || gid > maxID {
-				return nil, ErrRange
-			}
-			gidMap[int(gid)] = struct{}{}
-		}
-	}
-	gids := []int{}
-	for gid := range gidMap {
-		gids = append(gids, gid)
-	}
-	return gids, nil
-}
-
-// GetAdditionalGroupsPath is a wrapper around GetAdditionalGroups
-// that opens the groupPath given and gives it as an argument to
-// GetAdditionalGroups.
-func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int, error) {
-	var group io.Reader
-
-	if groupFile, err := os.Open(groupPath); err == nil {
-		group = groupFile
-		defer groupFile.Close()
-	}
-	return GetAdditionalGroups(additionalGroups, group)
-}
-
-func ParseSubIDFile(path string) ([]SubID, error) {
-	subid, err := os.Open(path)
-	if err != nil {
-		return nil, err
-	}
-	defer subid.Close()
-	return ParseSubID(subid)
-}
-
-func ParseSubID(subid io.Reader) ([]SubID, error) {
-	return ParseSubIDFilter(subid, nil)
-}
-
-func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) {
-	subid, err := os.Open(path)
-	if err != nil {
-		return nil, err
-	}
-	defer subid.Close()
-	return ParseSubIDFilter(subid, filter)
-}
-
-func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) {
-	if r == nil {
-		return nil, errors.New("nil source for subid-formatted data")
-	}
-
-	var (
-		s   = bufio.NewScanner(r)
-		out = []SubID{}
-	)
-
-	for s.Scan() {
-		line := bytes.TrimSpace(s.Bytes())
-		if len(line) == 0 {
-			continue
-		}
-
-		// see: man 5 subuid
-		p := SubID{}
-		parseLine(line, &p.Name, &p.SubID, &p.Count)
-
-		if filter == nil || filter(p) {
-			out = append(out, p)
-		}
-	}
-	if err := s.Err(); err != nil {
-		return nil, err
-	}
-
-	return out, nil
-}
-
-func ParseIDMapFile(path string) ([]IDMap, error) {
-	r, err := os.Open(path)
-	if err != nil {
-		return nil, err
-	}
-	defer r.Close()
-	return ParseIDMap(r)
-}
-
-func ParseIDMap(r io.Reader) ([]IDMap, error) {
-	return ParseIDMapFilter(r, nil)
-}
-
-func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) {
-	r, err := os.Open(path)
-	if err != nil {
-		return nil, err
-	}
-	defer r.Close()
-	return ParseIDMapFilter(r, filter)
-}
-
-func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) {
-	if r == nil {
-		return nil, errors.New("nil source for idmap-formatted data")
-	}
-
-	var (
-		s   = bufio.NewScanner(r)
-		out = []IDMap{}
-	)
-
-	for s.Scan() {
-		line := bytes.TrimSpace(s.Bytes())
-		if len(line) == 0 {
-			continue
-		}
-
-		// see: man 7 user_namespaces
-		p := IDMap{}
-		parseParts(bytes.Fields(line), &p.ID, &p.ParentID, &p.Count)
-
-		if filter == nil || filter(p) {
-			out = append(out, p)
-		}
-	}
-	if err := s.Err(); err != nil {
-		return nil, err
-	}
-
-	return out, nil
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go
@@ -1,43 +0,0 @@
-//go:build gofuzz
-// +build gofuzz
-
-package user
-
-import (
-	"io"
-	"strings"
-)
-
-func IsDivisbleBy(n int, divisibleby int) bool {
-	return (n % divisibleby) == 0
-}
-
-func FuzzUser(data []byte) int {
-	if len(data) == 0 {
-		return -1
-	}
-	if !IsDivisbleBy(len(data), 5) {
-		return -1
-	}
-
-	var divided [][]byte
-
-	chunkSize := len(data) / 5
-
-	for i := 0; i < len(data); i += chunkSize {
-		end := i + chunkSize
-
-		divided = append(divided, data[i:end])
-	}
-
-	_, _ = ParsePasswdFilter(strings.NewReader(string(divided[0])), nil)
-
-	var passwd, group io.Reader
-
-	group = strings.NewReader(string(divided[1]))
-	_, _ = GetAdditionalGroups([]string{string(divided[2])}, group)
-
-	passwd = strings.NewReader(string(divided[3]))
-	_, _ = GetExecUser(string(divided[4]), nil, passwd, group)
-	return 1
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go
@@ -1,5 +0,0 @@
-package userns
-
-// RunningInUserNS detects whether we are currently running in a user namespace.
-// Originally copied from github.com/lxc/lxd/shared/util.go
-var RunningInUserNS = runningInUserNS
--- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go
@@ -1,16 +0,0 @@
-//go:build gofuzz
-// +build gofuzz
-
-package userns
-
-import (
-	"strings"
-
-	"github.com/opencontainers/runc/libcontainer/user"
-)
-
-func FuzzUIDMap(data []byte) int {
-	uidmap, _ := user.ParseIDMap(strings.NewReader(string(data)))
-	_ = uidMapInUserNS(uidmap)
-	return 1
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go
@@ -1,37 +0,0 @@
-package userns
-
-import (
-	"sync"
-
-	"github.com/opencontainers/runc/libcontainer/user"
-)
-
-var (
-	inUserNS bool
-	nsOnce   sync.Once
-)
-
-// runningInUserNS detects whether we are currently running in a user namespace.
-// Originally copied from github.com/lxc/lxd/shared/util.go
-func runningInUserNS() bool {
-	nsOnce.Do(func() {
-		uidmap, err := user.CurrentProcessUIDMap()
-		if err != nil {
-			// This kernel-provided file only exists if user namespaces are supported
-			return
-		}
-		inUserNS = uidMapInUserNS(uidmap)
-	})
-	return inUserNS
-}
-
-func uidMapInUserNS(uidmap []user.IDMap) bool {
-	/*
-	 * We assume we are in the initial user namespace if we have a full
-	 * range - 4294967295 uids starting at uid 0.
-	 */
-	if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
-		return false
-	}
-	return true
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps.c
+++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps.c
@@ -1,79 +0,0 @@
-#define _GNU_SOURCE
-#include <fcntl.h>
-#include <sched.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <stdarg.h>
-#include <stdlib.h>
-
-/*
- * All of the code here is run inside an aync-signal-safe context, so we need
- * to be careful to not call any functions that could cause issues. In theory,
- * since we are a Go program, there are fewer restrictions in practice, it's
- * better to be safe than sorry.
- *
- * The only exception is exit, which we need to call to make sure we don't
- * return into runc.
- */
-
-void bail(int pipefd, const char *fmt, ...)
-{
-	va_list args;
-
-	va_start(args, fmt);
-	vdprintf(pipefd, fmt, args);
-	va_end(args);
-
-	exit(1);
-}
-
-int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd)
-{
-	char buffer[4096] = { 0 };
-
-	pid_t child = fork();
-	if (child != 0)
-		return child;
-	/* in child */
-
-	/* Join the target userns. */
-	int nsfd = open(userns_path, O_RDONLY);
-	if (nsfd < 0)
-		bail(errfd, "open userns path %s failed: %m", userns_path);
-
-	int err = setns(nsfd, CLONE_NEWUSER);
-	if (err < 0)
-		bail(errfd, "setns %s failed: %m", userns_path);
-
-	close(nsfd);
-
-	/* Pipe the requested file contents. */
-	int fd = open(path, O_RDONLY);
-	if (fd < 0)
-		bail(errfd, "open %s in userns %s failed: %m", path, userns_path);
-
-	int nread, ntotal = 0;
-	while ((nread = read(fd, buffer, sizeof(buffer))) != 0) {
-		if (nread < 0)
-			bail(errfd, "read bytes from %s failed (after %d total bytes read): %m", path, ntotal);
-		ntotal += nread;
-
-		int nwritten = 0;
-		while (nwritten < nread) {
-			int n = write(outfd, buffer, nread - nwritten);
-			if (n < 0)
-				bail(errfd, "write %d bytes from %s failed (after %d bytes written): %m",
-				     nread - nwritten, path, nwritten);
-			nwritten += n;
-		}
-		if (nread != nwritten)
-			bail(errfd, "mismatch for bytes read and written: %d read != %d written", nread, nwritten);
-	}
-
-	close(fd);
-	close(outfd);
-	close(errfd);
-
-	/* We must exit here, otherwise we would return into a forked runc. */
-	exit(0);
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps_linux.go
@@ -1,186 +0,0 @@
-//go:build linux
-
-package userns
-
-import (
-	"bufio"
-	"bytes"
-	"fmt"
-	"io"
-	"os"
-	"unsafe"
-
-	"github.com/opencontainers/runc/libcontainer/configs"
-	"github.com/sirupsen/logrus"
-)
-
-/*
-#include <stdlib.h>
-extern int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd);
-*/
-import "C"
-
-func parseIdmapData(data []byte) (ms []configs.IDMap, err error) {
-	scanner := bufio.NewScanner(bytes.NewReader(data))
-	for scanner.Scan() {
-		var m configs.IDMap
-		line := scanner.Text()
-		if _, err := fmt.Sscanf(line, "%d %d %d", &m.ContainerID, &m.HostID, &m.Size); err != nil {
-			return nil, fmt.Errorf("parsing id map failed: invalid format in line %q: %w", line, err)
-		}
-		ms = append(ms, m)
-	}
-	if err := scanner.Err(); err != nil {
-		return nil, fmt.Errorf("parsing id map failed: %w", err)
-	}
-	return ms, nil
-}
-
-// Do something equivalent to nsenter --user=<nsPath> cat <path>, but more
-// efficiently. Returns the contents of the requested file from within the user
-// namespace.
-func spawnUserNamespaceCat(nsPath string, path string) ([]byte, error) {
-	rdr, wtr, err := os.Pipe()
-	if err != nil {
-		return nil, fmt.Errorf("create pipe for userns spawn failed: %w", err)
-	}
-	defer rdr.Close()
-	defer wtr.Close()
-
-	errRdr, errWtr, err := os.Pipe()
-	if err != nil {
-		return nil, fmt.Errorf("create error pipe for userns spawn failed: %w", err)
-	}
-	defer errRdr.Close()
-	defer errWtr.Close()
-
-	cNsPath := C.CString(nsPath)
-	defer C.free(unsafe.Pointer(cNsPath))
-	cPath := C.CString(path)
-	defer C.free(unsafe.Pointer(cPath))
-
-	childPid := C.spawn_userns_cat(cNsPath, cPath, C.int(wtr.Fd()), C.int(errWtr.Fd()))
-
-	if childPid < 0 {
-		return nil, fmt.Errorf("failed to spawn fork for userns")
-	} else if childPid == 0 {
-		// this should never happen
-		panic("runc executing inside fork child -- unsafe state!")
-	}
-
-	// We are in the parent -- close the write end of the pipe before reading.
-	wtr.Close()
-	output, err := io.ReadAll(rdr)
-	rdr.Close()
-	if err != nil {
-		return nil, fmt.Errorf("reading from userns spawn failed: %w", err)
-	}
-
-	// Ditto for the error pipe.
-	errWtr.Close()
-	errOutput, err := io.ReadAll(errRdr)
-	errRdr.Close()
-	if err != nil {
-		return nil, fmt.Errorf("reading from userns spawn error pipe failed: %w", err)
-	}
-	errOutput = bytes.TrimSpace(errOutput)
-
-	// Clean up the child.
-	child, err := os.FindProcess(int(childPid))
-	if err != nil {
-		return nil, fmt.Errorf("could not find userns spawn process: %w", err)
-	}
-	state, err := child.Wait()
-	if err != nil {
-		return nil, fmt.Errorf("failed to wait for userns spawn process: %w", err)
-	}
-	if !state.Success() {
-		errStr := string(errOutput)
-		if errStr == "" {
-			errStr = fmt.Sprintf("unknown error (status code %d)", state.ExitCode())
-		}
-		return nil, fmt.Errorf("userns spawn: %s", errStr)
-	} else if len(errOutput) > 0 {
-		// We can just ignore weird output in the error pipe if the process
-		// didn't bail(), but for completeness output for debugging.
-		logrus.Debugf("userns spawn succeeded but unexpected error message found: %s", string(errOutput))
-	}
-	// The subprocess succeeded, return whatever it wrote to the pipe.
-	return output, nil
-}
-
-func GetUserNamespaceMappings(nsPath string) (uidMap, gidMap []configs.IDMap, err error) {
-	var (
-		pid         int
-		extra       rune
-		tryFastPath bool
-	)
-
-	// nsPath is usually of the form /proc/<pid>/ns/user, which means that we
-	// already have a pid that is part of the user namespace and thus we can
-	// just use the pid to read from /proc/<pid>/*id_map.
-	//
-	// Note that Sscanf doesn't consume the whole input, so we check for any
-	// trailing data with %c. That way, we can be sure the pattern matched
-	// /proc/$pid/ns/user _exactly_ iff n === 1.
-	if n, _ := fmt.Sscanf(nsPath, "/proc/%d/ns/user%c", &pid, &extra); n == 1 {
-		tryFastPath = pid > 0
-	}
-
-	for _, mapType := range []struct {
-		name  string
-		idMap *[]configs.IDMap
-	}{
-		{"uid_map", &uidMap},
-		{"gid_map", &gidMap},
-	} {
-		var mapData []byte
-
-		if tryFastPath {
-			path := fmt.Sprintf("/proc/%d/%s", pid, mapType.name)
-			data, err := os.ReadFile(path)
-			if err != nil {
-				// Do not error out here -- we need to try the slow path if the
-				// fast path failed.
-				logrus.Debugf("failed to use fast path to read %s from userns %s (error: %s), falling back to slow userns-join path", mapType.name, nsPath, err)
-			} else {
-				mapData = data
-			}
-		} else {
-			logrus.Debugf("cannot use fast path to read %s from userns %s, falling back to slow userns-join path", mapType.name, nsPath)
-		}
-
-		if mapData == nil {
-			// We have to actually join the namespace if we cannot take the
-			// fast path. The path is resolved with respect to the child
-			// process, so just use /proc/self.
-			data, err := spawnUserNamespaceCat(nsPath, "/proc/self/"+mapType.name)
-			if err != nil {
-				return nil, nil, err
-			}
-			mapData = data
-		}
-		idMap, err := parseIdmapData(mapData)
-		if err != nil {
-			return nil, nil, fmt.Errorf("failed to parse %s of userns %s: %w", mapType.name, nsPath, err)
-		}
-		*mapType.idMap = idMap
-	}
-
-	return uidMap, gidMap, nil
-}
-
-// IsSameMapping returns whether or not the two id mappings are the same. Note
-// that if the order of the mappings is different, or a mapping has been split,
-// the mappings will be considered different.
-func IsSameMapping(a, b []configs.IDMap) bool {
-	if len(a) != len(b) {
-		return false
-	}
-	for idx := range a {
-		if a[idx] != b[idx] {
-			return false
-		}
-	}
-	return true
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go
@@ -1,18 +0,0 @@
-//go:build !linux
-// +build !linux
-
-package userns
-
-import "github.com/opencontainers/runc/libcontainer/user"
-
-// runningInUserNS is a stub for non-Linux systems
-// Always returns false
-func runningInUserNS() bool {
-	return false
-}
-
-// uidMapInUserNS is a stub for non-Linux systems
-// Always returns false
-func uidMapInUserNS(uidmap []user.IDMap) bool {
-	return false
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go
@@ -19,13 +19,14 @@ package utils
 import (
 	"fmt"
 	"os"
+	"runtime"

 	"golang.org/x/sys/unix"
 )

-// MaxSendfdLen is the maximum length of the name of a file descriptor being
-// sent using SendFd. The name of the file handle returned by RecvFd will never
-// be larger than this value.
+// MaxNameLen is the maximum length of the name of a file descriptor being sent
+// using SendFile. The name of the file handle returned by RecvFile will never be
+// larger than this value.
 const MaxNameLen = 4096

 // oobSpace is the size of the oob slice required to store a single FD. Note
@@ -33,26 +34,21 @@ const MaxNameLen = 4096
 // so sizeof(fd) = 4.
 var oobSpace = unix.CmsgSpace(4)

-// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
+// RecvFile waits for a file descriptor to be sent over the given AF_UNIX
 // socket. The file name of the remote file descriptor will be recreated
 // locally (it is sent as non-auxiliary data in the same payload).
-func RecvFd(socket *os.File) (*os.File, error) {
-	// For some reason, unix.Recvmsg uses the length rather than the capacity
-	// when passing the msg_controllen and other attributes to recvmsg.  So we
-	// have to actually set the length.
+func RecvFile(socket *os.File) (_ *os.File, Err error) {
 	name := make([]byte, MaxNameLen)
 	oob := make([]byte, oobSpace)

 	sockfd := socket.Fd()
-	n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0)
+	n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, unix.MSG_CMSG_CLOEXEC)
 	if err != nil {
 		return nil, err
 	}
-
 	if n >= MaxNameLen || oobn != oobSpace {
-		return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
+		return nil, fmt.Errorf("recvfile: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
 	}
-
 	// Truncate.
 	name = name[:n]
 	oob = oob[:oobn]
@@ -61,36 +57,63 @@ func RecvFd(socket *os.File) (*os.File, error) {
 	if err != nil {
 		return nil, err
 	}
+
+	// We cannot control how many SCM_RIGHTS we receive, and upon receiving
+	// them all of the descriptors are installed in our fd table, so we need to
+	// parse all of the SCM_RIGHTS we received in order to close all of the
+	// descriptors on error.
+	var fds []int
+	defer func() {
+		for i, fd := range fds {
+			if i == 0 && Err == nil {
+				// Only close the first one on error.
+				continue
+			}
+			// Always close extra ones.
+			_ = unix.Close(fd)
+		}
+	}()
+	var lastErr error
+	for _, scm := range scms {
+		if scm.Header.Type == unix.SCM_RIGHTS {
+			scmFds, err := unix.ParseUnixRights(&scm)
+			if err != nil {
+				lastErr = err
+			} else {
+				fds = append(fds, scmFds...)
+			}
+		}
+	}
+	if lastErr != nil {
+		return nil, lastErr
+	}
+
+	// We do this after collecting the fds to make sure we close them all when
+	// returning an error here.
 	if len(scms) != 1 {
 		return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
 	}
-	scm := scms[0]
-
-	fds, err := unix.ParseUnixRights(&scm)
-	if err != nil {
-		return nil, err
-	}
 	if len(fds) != 1 {
 		return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
 	}
-	fd := uintptr(fds[0])
-
-	return os.NewFile(fd, string(name)), nil
+	return os.NewFile(uintptr(fds[0]), string(name)), nil
 }

-// SendFd sends a file descriptor over the given AF_UNIX socket. In
-// addition, the file.Name() of the given file will also be sent as
-// non-auxiliary data in the same payload (allowing to send contextual
-// information for a file descriptor).
-func SendFd(socket *os.File, name string, fd uintptr) error {
+// SendFile sends a file over the given AF_UNIX socket. file.Name() is also
+// included so that if the other end uses RecvFile, the file will have the same
+// name information.
+func SendFile(socket *os.File, file *os.File) error {
+	name := file.Name()
 	if len(name) >= MaxNameLen {
 		return fmt.Errorf("sendfd: filename too long: %s", name)
 	}
-	return SendFds(socket, []byte(name), int(fd))
+	err := SendRawFd(socket, name, file.Fd())
+	runtime.KeepAlive(file)
+	return err
 }

-// SendFds sends a list of files descriptor and msg over the given AF_UNIX socket.
-func SendFds(socket *os.File, msg []byte, fds ...int) error {
-	oob := unix.UnixRights(fds...)
-	return unix.Sendmsg(int(socket.Fd()), msg, oob, nil, 0)
+// SendRawFd sends a specific file descriptor over the given AF_UNIX socket.
+func SendRawFd(socket *os.File, msg string, fd uintptr) error {
+	oob := unix.UnixRights(int(fd))
+	return unix.Sendmsg(int(socket.Fd()), []byte(msg), oob, nil, 0)
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go
@@ -1,17 +1,12 @@
 package utils

 import (
-	"encoding/binary"
 	"encoding/json"
-	"fmt"
 	"io"
 	"os"
 	"path/filepath"
-	"strconv"
 	"strings"
-	"unsafe"

-	securejoin "github.com/cyphar/filepath-securejoin"
 	"golang.org/x/sys/unix"
 )

@@ -19,20 +14,6 @@ const (
 	exitSignalOffset = 128
 )

-// NativeEndian is the native byte order of the host system.
-var NativeEndian binary.ByteOrder
-
-func init() {
-	// Copied from <golang.org/x/net/internal/socket/sys.go>.
-	i := uint32(1)
-	b := (*[4]byte)(unsafe.Pointer(&i))
-	if b[0] == 1 {
-		NativeEndian = binary.LittleEndian
-	} else {
-		NativeEndian = binary.BigEndian
-	}
-}
-
 // ExitStatus returns the correct exit status for a process based on if it
 // was signaled or exited cleanly
 func ExitStatus(status unix.WaitStatus) int {
@@ -43,6 +24,9 @@ func ExitStatus(status unix.WaitStatus) int {
 }

 // WriteJSON writes the provided struct v to w using standard json marshaling
+// without a trailing newline. This is used instead of json.Encoder because
+// there might be a problem in json decoder in some cases, see:
+// https://github.com/docker/docker/issues/14203#issuecomment-174177790
 func WriteJSON(w io.Writer, v interface{}) error {
 	data, err := json.Marshal(v)
 	if err != nil {
@@ -99,52 +83,16 @@ func stripRoot(root, path string) string {
 	return CleanPath("/" + path)
 }

-// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
-// corresponding to the unsafePath resolved within the root. Before passing the
-// fd, this path is verified to have been inside the root -- so operating on it
-// through the passed fdpath should be safe. Do not access this path through
-// the original path strings, and do not attempt to use the pathname outside of
-// the passed closure (the file handle will be freed once the closure returns).
-func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
-	// Remove the root then forcefully resolve inside the root.
-	unsafePath = stripRoot(root, unsafePath)
-	path, err := securejoin.SecureJoin(root, unsafePath)
-	if err != nil {
-		return fmt.Errorf("resolving path inside rootfs failed: %w", err)
-	}
-
-	// Open the target path.
-	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
-	if err != nil {
-		return fmt.Errorf("open o_path procfd: %w", err)
-	}
-	defer fh.Close()
-
-	// Double-check the path is the one we expected.
-	procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd()))
-	if realpath, err := os.Readlink(procfd); err != nil {
-		return fmt.Errorf("procfd verification failed: %w", err)
-	} else if realpath != path {
-		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
-	}
-
-	// Run the closure.
-	return fn(procfd)
-}
-
-// SearchLabels searches a list of key-value pairs for the provided key and
-// returns the corresponding value. The pairs must be separated with '='.
-func SearchLabels(labels []string, query string) string {
-	for _, l := range labels {
-		parts := strings.SplitN(l, "=", 2)
-		if len(parts) < 2 {
-			continue
-		}
-		if parts[0] == query {
-			return parts[1]
+// SearchLabels searches through a list of key=value pairs for a given key,
+// returning its value and a boolean reporting whether the key exists.
+func SearchLabels(labels []string, key string) (string, bool) {
+	key += "="
+	for _, s := range labels {
+		if strings.HasPrefix(s, key) {
+			return s[len(key):], true
 		}
 	}
-	return ""
+	return "", false
 }

 // Annotations returns the bundle path and user defined annotations from the
@@ -153,14 +101,14 @@ func SearchLabels(labels []string, query string) string {
 func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
 	userAnnotations = make(map[string]string)
 	for _, l := range labels {
-		parts := strings.SplitN(l, "=", 2)
-		if len(parts) < 2 {
+		name, value, ok := strings.Cut(l, "=")
+		if !ok {
 			continue
 		}
-		if parts[0] == "bundle" {
-			bundle = parts[1]
+		if name == "bundle" {
+			bundle = value
 		} else {
-			userAnnotations[parts[0]] = parts[1]
+			userAnnotations[name] = value
 		}
 	}
 	return
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go
@@ -1,20 +1,20 @@
 //go:build !windows
-// +build !windows

 package utils

 import (
-	"errors"
 	"fmt"
+	"math"
 	"os"
 	"path/filepath"
+	"runtime"
 	"strconv"
 	"strings"
+	"sync"
 	_ "unsafe" // for go:linkname

-	"github.com/opencontainers/runc/libcontainer/system"
-
 	securejoin "github.com/cyphar/filepath-securejoin"
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )

@@ -30,12 +30,39 @@ func EnsureProcHandle(fh *os.File) error {
 	return nil
 }

+var (
+	haveCloseRangeCloexecBool bool
+	haveCloseRangeCloexecOnce sync.Once
+)
+
+func haveCloseRangeCloexec() bool {
+	haveCloseRangeCloexecOnce.Do(func() {
+		// Make sure we're not closing a random file descriptor.
+		tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0)
+		if err != nil {
+			return
+		}
+		defer unix.Close(tmpFd)
+
+		err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC)
+		// Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC).
+		// -ENOSYS and -EINVAL ultimately mean we don't have support, but any
+		// other potential error would imply that even the most basic close
+		// operation wouldn't work.
+		haveCloseRangeCloexecBool = err == nil
+	})
+	return haveCloseRangeCloexecBool
+}
+
 type fdFunc func(fd int)

 // fdRangeFrom calls the passed fdFunc for each file descriptor that is open in
 // the current process.
 func fdRangeFrom(minFd int, fn fdFunc) error {
-	fdDir, err := os.Open("/proc/self/fd")
+	procSelfFd, closer := ProcThreadSelf("fd")
+	defer closer()
+
+	fdDir, err := os.Open(procSelfFd)
 	if err != nil {
 		return err
 	}
@@ -73,6 +100,12 @@ func fdRangeFrom(minFd int, fn fdFunc) error {
 // CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or
 // equal to minFd in the current process.
 func CloseExecFrom(minFd int) error {
+	// Use close_range(CLOSE_RANGE_CLOEXEC) if possible.
+	if haveCloseRangeCloexec() {
+		err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC)
+		return os.NewSyscallError("close_range", err)
+	}
+	// Otherwise, fall back to the standard loop.
 	return fdRangeFrom(minFd, unix.CloseOnExec)
 }

@@ -95,7 +128,8 @@ func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive
 // *os.File operations would apply to the wrong file). This function is only
 // intended to be called from the last stage of runc init.
 func UnsafeCloseFrom(minFd int) error {
-	// We must not close some file descriptors.
+	// We cannot use close_range(2) even if it is available, because we must
+	// not close some file descriptors.
 	return fdRangeFrom(minFd, func(fd int) {
 		if runtime_IsPollDescriptor(uintptr(fd)) {
 			// These are the Go runtimes internal netpoll file descriptors.
@@ -113,8 +147,8 @@ func UnsafeCloseFrom(minFd int) error {
 	})
 }

-// NewSockPair returns a new unix socket pair
-func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
+// NewSockPair returns a new SOCK_STREAM unix socket pair.
+func NewSockPair(name string) (parent, child *os.File, err error) {
 	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
 	if err != nil {
 		return nil, nil, err
@@ -122,6 +156,112 @@ func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
 	return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
 }

+// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
+// corresponding to the unsafePath resolved within the root. Before passing the
+// fd, this path is verified to have been inside the root -- so operating on it
+// through the passed fdpath should be safe. Do not access this path through
+// the original path strings, and do not attempt to use the pathname outside of
+// the passed closure (the file handle will be freed once the closure returns).
+func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
+	// Remove the root then forcefully resolve inside the root.
+	unsafePath = stripRoot(root, unsafePath)
+	path, err := securejoin.SecureJoin(root, unsafePath)
+	if err != nil {
+		return fmt.Errorf("resolving path inside rootfs failed: %w", err)
+	}
+
+	procSelfFd, closer := ProcThreadSelf("fd/")
+	defer closer()
+
+	// Open the target path.
+	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return fmt.Errorf("open o_path procfd: %w", err)
+	}
+	defer fh.Close()
+
+	procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd())))
+	// Double-check the path is the one we expected.
+	if realpath, err := os.Readlink(procfd); err != nil {
+		return fmt.Errorf("procfd verification failed: %w", err)
+	} else if realpath != path {
+		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
+	}
+
+	return fn(procfd)
+}
+
+type ProcThreadSelfCloser func()
+
+var (
+	haveProcThreadSelf     bool
+	haveProcThreadSelfOnce sync.Once
+)
+
+// ProcThreadSelf returns a string that is equivalent to
+// /proc/thread-self/<subpath>, with a graceful fallback on older kernels where
+// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin,
+// meaning that the passed string needs to be trusted. The caller _must_ call
+// the returned ProcThreadSelfCloser function (which is runtime.UnlockOSThread)
+// *only once* after it has finished using the returned path string.
+func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) {
+	haveProcThreadSelfOnce.Do(func() {
+		if _, err := os.Stat("/proc/thread-self/"); err == nil {
+			haveProcThreadSelf = true
+		} else {
+			logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/<tid>", err)
+		}
+	})
+
+	// We need to lock our thread until the caller is done with the path string
+	// because any non-atomic operation on the path (such as opening a file,
+	// then reading it) could be interrupted by the Go runtime where the
+	// underlying thread is swapped out and the original thread is killed,
+	// resulting in pull-your-hair-out-hard-to-debug issues in the caller. In
+	// addition, the pre-3.17 fallback makes everything non-atomic because the
+	// same thing could happen between unix.Gettid() and the path operations.
+	//
+	// In theory, we don't need to lock in the atomic user case when using
+	// /proc/thread-self/, but it's better to be safe than sorry (and there are
+	// only one or two truly atomic users of /proc/thread-self/).
+	runtime.LockOSThread()
+
+	threadSelf := "/proc/thread-self/"
+	if !haveProcThreadSelf {
+		// Pre-3.17 kernels did not have /proc/thread-self, so do it manually.
+		threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/"
+		if _, err := os.Stat(threadSelf); err != nil {
+			// Unfortunately, this code is called from rootfs_linux.go where we
+			// are running inside the pid namespace of the container but /proc
+			// is the host's procfs. Unfortunately there is no real way to get
+			// the correct tid to use here (the kernel age means we cannot do
+			// things like set up a private fsopen("proc") -- even scanning
+			// NSpid in all of the tasks in /proc/self/task/*/status requires
+			// Linux 4.1).
+			//
+			// So, we just have to assume that /proc/self is acceptable in this
+			// one specific case.
+			if os.Getpid() == 1 {
+				logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err)
+			} else {
+				// This should never happen, but the fallback should work in most cases...
+				logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err)
+			}
+			threadSelf = "/proc/self/"
+		}
+	}
+	return threadSelf + subpath, runtime.UnlockOSThread
+}
+
+// ProcThreadSelfFd is a small wrapper around ProcThreadSelf to make it easier to
+// create a /proc/thread-self handle for given file descriptor.
+//
+// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but
+// without using fmt.Sprintf to avoid unneeded overhead.
+func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) {
+	return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10))
+}
+
 // IsLexicallyInRoot is shorthand for strings.HasPrefix(path+"/", root+"/"),
 // but properly handling the case where path or root are "/".
 //
@@ -156,83 +296,45 @@ func IsLexicallyInRoot(root, path string) bool {
 // This means that the path also must not contain ".." elements, otherwise an
 // error will occur.
 //
-// This is a somewhat less safe alternative to
-// <https://github.com/cyphar/filepath-securejoin/pull/13>, but it should
-// detect attempts to trick us into creating directories outside of the root.
-// We should migrate to securejoin.MkdirAll once it is merged.
+// This uses securejoin.MkdirAllHandle under the hood, but it has special
+// handling if unsafePath has already been scoped within the rootfs (this is
+// needed for a lot of runc callers and fixing this would require reworking a
+// lot of path logic).
 func MkdirAllInRootOpen(root, unsafePath string, mode uint32) (_ *os.File, Err error) {
-	// If the path is already "within" the root, use it verbatim.
-	fullPath := unsafePath
-	if !IsLexicallyInRoot(root, unsafePath) {
-		var err error
-		fullPath, err = securejoin.SecureJoin(root, unsafePath)
+	// If the path is already "within" the root, get the path relative to the
+	// root and use that as the unsafe path. This is necessary because a lot of
+	// MkdirAllInRootOpen callers have already done SecureJoin, and refactoring
+	// all of them to stop using these SecureJoin'd paths would require a fair
+	// amount of work.
+	// TODO(cyphar): Do the refactor to libpathrs once it's ready.
+	if IsLexicallyInRoot(root, unsafePath) {
+		subPath, err := filepath.Rel(root, unsafePath)
 		if err != nil {
 			return nil, err
 		}
-	}
-	subPath, err := filepath.Rel(root, fullPath)
-	if err != nil {
-		return nil, err
+		unsafePath = subPath
 	}

 	// Check for any silly mode bits.
 	if mode&^0o7777 != 0 {
 		return nil, fmt.Errorf("tried to include non-mode bits in MkdirAll mode: 0o%.3o", mode)
 	}
+	// Linux (and thus os.MkdirAll) silently ignores the suid and sgid bits if
+	// passed. While it would make sense to return an error in that case (since
+	// the user has asked for a mode that won't be applied), for compatibility
+	// reasons we have to ignore these bits.
+	if ignoredBits := mode &^ 0o1777; ignoredBits != 0 {
+		logrus.Warnf("MkdirAll called with no-op mode bits that are ignored by Linux: 0o%.3o", ignoredBits)
+		mode &= 0o1777
+	}

-	currentDir, err := os.OpenFile(root, unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+	rootDir, err := os.OpenFile(root, unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
 	if err != nil {
 		return nil, fmt.Errorf("open root handle: %w", err)
 	}
-	defer func() {
-		if Err != nil {
-			currentDir.Close()
-		}
-	}()
+	defer rootDir.Close()

-	for _, part := range strings.Split(subPath, string(filepath.Separator)) {
-		switch part {
-		case "", ".":
-			// Skip over no-op components.
-			continue
-		case "..":
-			return nil, fmt.Errorf("possible breakout detected: found %q component in SecureJoin subpath %s", part, subPath)
-		}
-
-		nextDir, err := system.Openat(currentDir, part, unix.O_DIRECTORY|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
-		switch {
-		case err == nil:
-			// Update the currentDir.
-			_ = currentDir.Close()
-			currentDir = nextDir
-
-		case errors.Is(err, unix.ENOTDIR):
-			// This might be a symlink or some other random file. Either way,
-			// error out.
-			return nil, fmt.Errorf("cannot mkdir in %s/%s: %w", currentDir.Name(), part, unix.ENOTDIR)
-
-		case errors.Is(err, os.ErrNotExist):
-			// Luckily, mkdirat will not follow trailing symlinks, so this is
-			// safe to do as-is.
-			if err := system.Mkdirat(currentDir, part, mode); err != nil {
-				return nil, err
-			}
-			// Open the new directory. There is a race here where an attacker
-			// could swap the directory with a different directory, but
-			// MkdirAll's fuzzy semantics mean we don't care about that.
-			nextDir, err := system.Openat(currentDir, part, unix.O_DIRECTORY|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
-			if err != nil {
-				return nil, fmt.Errorf("open newly created directory: %w", err)
-			}
-			// Update the currentDir.
-			_ = currentDir.Close()
-			currentDir = nextDir
-
-		default:
-			return nil, err
-		}
-	}
-	return currentDir, nil
+	return securejoin.MkdirAllHandle(rootDir, unsafePath, int(mode))
 }

 // MkdirAllInRoot is a wrapper around MkdirAllInRootOpen which closes the
@@ -244,3 +346,18 @@ func MkdirAllInRoot(root, unsafePath string, mode uint32) error {
 	}
 	return err
 }
+
+// Openat is a Go-friendly openat(2) wrapper that always sets O_CLOEXEC. A nil
+// dir resolves path relative to the current working directory (AT_FDCWD).
+func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) {
+	// dir.Name() panics on a nil *os.File, so only consult dir when it is set.
+	dirFd, dirPath := unix.AT_FDCWD, "."
+	if dir != nil {
+		dirFd, dirPath = int(dir.Fd()), dir.Name()
+	}
+	fd, err := unix.Openat(dirFd, path, flags|unix.O_CLOEXEC, mode)
+	if err != nil {
+		return nil, &os.PathError{Op: "openat", Path: path, Err: err}
+	}
+	return os.NewFile(uintptr(fd), dirPath+"/"+path), nil
+}