Update moby/runc dependencies

Signed-off-by: Davanum Srinivas <davanum@gmail.com>
2025-11-03 19:58:17 +00:00 · 2024-10-10 11:58:22 -04:00
parent 9ffc095f88
commit 521f2d106b
43 changed files with 157 additions and 166 deletions
--- a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go
@@ -801,54 +801,33 @@ func mknodDevice(dest string, node *devices.Device) error {
 	return os.Chown(dest, int(node.Uid), int(node.Gid))
 }

-// Get the parent mount point of directory passed in as argument. Also return
-// optional fields.
-func getParentMount(rootfs string) (string, string, error) {
-	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(rootfs))
-	if err != nil {
-		return "", "", err
-	}
-	if len(mi) < 1 {
-		return "", "", fmt.Errorf("could not find parent mount of %s", rootfs)
-	}
-
-	// find the longest mount point
-	var idx, maxlen int
-	for i := range mi {
-		if len(mi[i].Mountpoint) > maxlen {
-			maxlen = len(mi[i].Mountpoint)
-			idx = i
+// rootfsParentMountPrivate ensures rootfs parent mount is private.
+// This is needed for two reasons:
+//   - pivot_root() will fail if parent mount is shared;
+//   - when we bind mount rootfs, if its parent is not private, the new mount
+//     will propagate (leak!) to parent namespace and we don't want that.
+func rootfsParentMountPrivate(path string) error {
+	var err error
+	// Assuming path is absolute and clean (this is checked in
+	// libcontainer/validate). Any error other than EINVAL means we failed,
+	// and EINVAL means this is not a mount point, so traverse up until we
+	// find one.
+	for {
+		err = unix.Mount("", path, "", unix.MS_PRIVATE, "")
+		if err == nil {
+			return nil
 		}
-	}
-	return mi[idx].Mountpoint, mi[idx].Optional, nil
-}
-
-// Make parent mount private if it was shared
-func rootfsParentMountPrivate(rootfs string) error {
-	sharedMount := false
-
-	parentMount, optionalOpts, err := getParentMount(rootfs)
-	if err != nil {
-		return err
-	}
-
-	optsSplit := strings.Split(optionalOpts, " ")
-	for _, opt := range optsSplit {
-		if strings.HasPrefix(opt, "shared:") {
-			sharedMount = true
+		if err != unix.EINVAL || path == "/" { //nolint:errorlint // unix errors are bare
 			break
 		}
+		path = filepath.Dir(path)
 	}
-
-	// Make parent mount PRIVATE if it was shared. It is needed for two
-	// reasons. First of all pivot_root() will fail if parent mount is
-	// shared. Secondly when we bind mount rootfs it will propagate to
-	// parent namespace and we don't want that to happen.
-	if sharedMount {
-		return mount("", parentMount, "", "", unix.MS_PRIVATE, "")
+	return &mountError{
+		op:     "remount-private",
+		target: path,
+		flags:  unix.MS_PRIVATE,
+		err:    err,
 	}
-
-	return nil
 }

 func prepareRoot(config *configs.Config) error {
@@ -860,9 +839,6 @@ func prepareRoot(config *configs.Config) error {
 		return err
 	}

-	// Make parent mount private to make sure following bind mount does
-	// not propagate in other namespaces. Also it will help with kernel
-	// check pass in pivot_root. (IS_SHARED(new_mnt->mnt_parent))
 	if err := rootfsParentMountPrivate(config.Rootfs); err != nil {
 		return err
 	}
--- a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/patchbpf/enosys_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/patchbpf/enosys_linux.go
@@ -164,11 +164,11 @@ func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error)
 	return program, nil
 }

-type nativeArch uint32
+type linuxAuditArch uint32

-const invalidArch nativeArch = 0
+const invalidArch linuxAuditArch = 0

-func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) {
+func scmpArchToAuditArch(arch libseccomp.ScmpArch) (linuxAuditArch, error) {
 	switch arch {
 	case libseccomp.ArchNative:
 		// Convert to actual native architecture.
@@ -176,85 +176,89 @@ func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) {
 		if err != nil {
 			return invalidArch, fmt.Errorf("unable to get native arch: %w", err)
 		}
-		return archToNative(arch)
+		return scmpArchToAuditArch(arch)
 	case libseccomp.ArchX86:
-		return nativeArch(C.C_AUDIT_ARCH_I386), nil
+		return linuxAuditArch(C.C_AUDIT_ARCH_I386), nil
 	case libseccomp.ArchAMD64, libseccomp.ArchX32:
 		// NOTE: x32 is treated like x86_64 except all x32 syscalls have the
 		//       30th bit of the syscall number set to indicate that it's not a
 		//       normal x86_64 syscall.
-		return nativeArch(C.C_AUDIT_ARCH_X86_64), nil
+		return linuxAuditArch(C.C_AUDIT_ARCH_X86_64), nil
 	case libseccomp.ArchARM:
-		return nativeArch(C.C_AUDIT_ARCH_ARM), nil
+		return linuxAuditArch(C.C_AUDIT_ARCH_ARM), nil
 	case libseccomp.ArchARM64:
-		return nativeArch(C.C_AUDIT_ARCH_AARCH64), nil
+		return linuxAuditArch(C.C_AUDIT_ARCH_AARCH64), nil
 	case libseccomp.ArchMIPS:
-		return nativeArch(C.C_AUDIT_ARCH_MIPS), nil
+		return linuxAuditArch(C.C_AUDIT_ARCH_MIPS), nil
 	case libseccomp.ArchMIPS64:
-		return nativeArch(C.C_AUDIT_ARCH_MIPS64), nil
+		return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64), nil
 	case libseccomp.ArchMIPS64N32:
-		return nativeArch(C.C_AUDIT_ARCH_MIPS64N32), nil
+		return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64N32), nil
 	case libseccomp.ArchMIPSEL:
-		return nativeArch(C.C_AUDIT_ARCH_MIPSEL), nil
+		return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL), nil
 	case libseccomp.ArchMIPSEL64:
-		return nativeArch(C.C_AUDIT_ARCH_MIPSEL64), nil
+		return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64), nil
 	case libseccomp.ArchMIPSEL64N32:
-		return nativeArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
+		return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
 	case libseccomp.ArchPPC:
-		return nativeArch(C.C_AUDIT_ARCH_PPC), nil
+		return linuxAuditArch(C.C_AUDIT_ARCH_PPC), nil
 	case libseccomp.ArchPPC64:
-		return nativeArch(C.C_AUDIT_ARCH_PPC64), nil
+		return linuxAuditArch(C.C_AUDIT_ARCH_PPC64), nil
 	case libseccomp.ArchPPC64LE:
-		return nativeArch(C.C_AUDIT_ARCH_PPC64LE), nil
+		return linuxAuditArch(C.C_AUDIT_ARCH_PPC64LE), nil
 	case libseccomp.ArchS390:
-		return nativeArch(C.C_AUDIT_ARCH_S390), nil
+		return linuxAuditArch(C.C_AUDIT_ARCH_S390), nil
 	case libseccomp.ArchS390X:
-		return nativeArch(C.C_AUDIT_ARCH_S390X), nil
+		return linuxAuditArch(C.C_AUDIT_ARCH_S390X), nil
 	case libseccomp.ArchRISCV64:
-		return nativeArch(C.C_AUDIT_ARCH_RISCV64), nil
+		return linuxAuditArch(C.C_AUDIT_ARCH_RISCV64), nil
 	default:
 		return invalidArch, fmt.Errorf("unknown architecture: %v", arch)
 	}
 }

-type lastSyscallMap map[nativeArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall
+type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall

 // Figure out largest syscall number referenced in the filter for each
 // architecture. We will be generating code based on the native architecture
 // representation, but SCMP_ARCH_X32 means we have to track cases where the
 // same architecture has different largest syscalls based on the mode.
 func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
-	lastSyscalls := make(lastSyscallMap)
-	// Only loop over architectures which are present in the filter. Any other
-	// architectures will get the libseccomp bad architecture action anyway.
+	scmpArchs := make(map[libseccomp.ScmpArch]struct{})
 	for _, ociArch := range config.Architectures {
 		arch, err := libseccomp.GetArchFromString(ociArch)
 		if err != nil {
 			return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err)
 		}
+		scmpArchs[arch] = struct{}{}
+	}
+	// On architectures like ppc64le, Docker inexplicably doesn't include the
+	// native architecture in the architecture list which results in no
+	// architectures being present in the list at all (rendering the ENOSYS
+	// stub a no-op). So, always include the native architecture.
+	if nativeScmpArch, err := libseccomp.GetNativeArch(); err != nil {
+		return nil, fmt.Errorf("unable to get native arch: %w", err)
+	} else if _, ok := scmpArchs[nativeScmpArch]; !ok {
+		logrus.Debugf("seccomp: adding implied native architecture %v to config set", nativeScmpArch)
+		scmpArchs[nativeScmpArch] = struct{}{}
+	}
+	logrus.Debugf("seccomp: configured architecture set: %s", scmpArchs)

-		// Map native architecture to a real architecture value to avoid
-		// doubling-up the lastSyscall mapping.
-		if arch == libseccomp.ArchNative {
-			nativeArch, err := libseccomp.GetNativeArch()
-			if err != nil {
-				return nil, fmt.Errorf("unable to get native architecture: %w", err)
-			}
-			arch = nativeArch
-		}
-
-		// Figure out native architecture representation of the architecture.
-		nativeArch, err := archToNative(arch)
+	// Only loop over architectures which are present in the filter. Any other
+	// architectures will get the libseccomp bad architecture action anyway.
+	lastSyscalls := make(lastSyscallMap)
+	for arch := range scmpArchs {
+		auditArch, err := scmpArchToAuditArch(arch)
 		if err != nil {
 			return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err)
 		}

-		if _, ok := lastSyscalls[nativeArch]; !ok {
-			lastSyscalls[nativeArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
+		if _, ok := lastSyscalls[auditArch]; !ok {
+			lastSyscalls[auditArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
 		}
-		if _, ok := lastSyscalls[nativeArch][arch]; ok {
+		if _, ok := lastSyscalls[auditArch][arch]; ok {
 			// Because of ArchNative we may hit the same entry multiple times.
-			// Just skip it if we've seen this (nativeArch, ScmpArch)
+			// Just skip it if we've seen this (linuxAuditArch, ScmpArch)
 			// combination before.
 			continue
 		}
@@ -272,10 +276,11 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
 			}
 		}
 		if largestSyscall != 0 {
-			lastSyscalls[nativeArch][arch] = largestSyscall
+			logrus.Debugf("seccomp: largest syscall number for arch %v is %v", arch, largestSyscall)
+			lastSyscalls[auditArch][arch] = largestSyscall
 		} else {
-			logrus.Warnf("could not find any syscalls for arch %s", ociArch)
-			delete(lastSyscalls[nativeArch], arch)
+			logrus.Warnf("could not find any syscalls for arch %v", arch)
+			delete(lastSyscalls[auditArch], arch)
 		}
 	}
 	return lastSyscalls, nil
@@ -293,10 +298,10 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
 // close_range(2) which were added out-of-order in the syscall table between
 // kernel releases.
 func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) {
-	// A jump-table for each nativeArch used to generate the initial
+	// A jump-table for each linuxAuditArch used to generate the initial
 	// conditional jumps -- measured from the *END* of the program so they
 	// remain valid after prepending to the tail.
-	archJumpTable := map[nativeArch]uint32{}
+	archJumpTable := map[linuxAuditArch]uint32{}

 	// Generate our own -ENOSYS rules for each architecture. They have to be
 	// generated in reverse (prepended to the tail of the program) because the
@@ -309,7 +314,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
 	}

 	// Generate the syscall -ENOSYS rules.
-	for nativeArch, maxSyscalls := range lastSyscalls {
+	for auditArch, maxSyscalls := range lastSyscalls {
 		// The number of instructions from the tail of this section which need
 		// to be jumped in order to reach the -ENOSYS return. If the section
 		// does not jump, it will fall through to the actual filter.
@@ -390,7 +395,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)

 			// If we're on x86 we need to add a check for x32 and if we're in
 			// the wrong mode we jump over the section.
-			if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
+			if uint32(auditArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
 				// Generate a prefix to check the mode.
 				switch scmpArch {
 				case libseccomp.ArchAMD64:
@@ -419,8 +424,8 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
 			section = append(section, sectionTail...)
 		case 2:
 			// x32 and x86_64 are a unique case, we can't handle any others.
-			if uint32(nativeArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
-				return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", nativeArch)
+			if uint32(auditArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
+				return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", auditArch)
 			}

 			x32sysno, ok := maxSyscalls[libseccomp.ArchX32]
@@ -497,7 +502,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
 		programTail = append(section, programTail...)

 		// Update jump table.
-		archJumpTable[nativeArch] = uint32(len(programTail))
+		archJumpTable[auditArch] = uint32(len(programTail))
 	}

 	// Add a dummy "jump to filter" for any architecture we might miss below.
@@ -517,9 +522,9 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
 	//       architectures based on how large the jumps are going to be, or
 	//       re-sort the candidate architectures each time to make sure that we
 	//       pick the largest jump which is going to be smaller than 255.
-	for nativeArch := range lastSyscalls {
+	for auditArch := range lastSyscalls {
 		// We jump forwards but the jump table is calculated from the *END*.
-		jump := uint32(len(programTail)) - archJumpTable[nativeArch]
+		jump := uint32(len(programTail)) - archJumpTable[auditArch]

 		// Same routine as above -- this is a basic jeq check, complicated
 		// slightly if it turns out that we need to do a long jump.
@@ -528,7 +533,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
 				// jeq [arch],[jump]
 				bpf.JumpIf{
 					Cond:     bpf.JumpEqual,
-					Val:      uint32(nativeArch),
+					Val:      uint32(auditArch),
 					SkipTrue: uint8(jump),
 				},
 			}, programTail...)
@@ -537,7 +542,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
 				// jne [arch],1
 				bpf.JumpIf{
 					Cond:     bpf.JumpNotEqual,
-					Val:      uint32(nativeArch),
+					Val:      uint32(auditArch),
 					SkipTrue: 1,
 				},
 				// ja [jump]