Update to latest cadvisor

Signed-off-by: Davanum Srinivas <davanum@gmail.com>
This commit is contained in:
Davanum Srinivas
2024-10-25 20:05:11 -04:00
parent b3cf9c6e5c
commit 152d342a8d
106 changed files with 13 additions and 20577 deletions

View File

@@ -1,318 +0,0 @@
# libcontainer
[![Go Reference](https://pkg.go.dev/badge/github.com/opencontainers/runc/libcontainer.svg)](https://pkg.go.dev/github.com/opencontainers/runc/libcontainer)
Libcontainer provides a native Go implementation for creating containers
with namespaces, cgroups, capabilities, and filesystem access controls.
It allows you to manage the lifecycle of the container performing additional operations
after the container is created.
#### Container
A container is a self contained execution environment that shares the kernel of the
host system and which is (optionally) isolated from other containers in the system.
#### Using libcontainer
Because containers are spawned in a two step process you will need a binary that
will be executed as the init process for the container. In libcontainer, we use
the current binary (/proc/self/exe) to be executed as the init process, and use
arg "init", we call the first step process "bootstrap", so you always need a "init"
function as the entry of "bootstrap".
In addition to the go init function the early stage bootstrap is handled by importing
[nsenter](https://github.com/opencontainers/runc/blob/master/libcontainer/nsenter/README.md).
```go
import (
_ "github.com/opencontainers/runc/libcontainer/nsenter"
)
func init() {
if len(os.Args) > 1 && os.Args[1] == "init" {
runtime.GOMAXPROCS(1)
runtime.LockOSThread()
factory, _ := libcontainer.New("")
if err := factory.StartInitialization(); err != nil {
logrus.Fatal(err)
}
panic("--this line should have never been executed, congratulations--")
}
}
```
Then to create a container you first have to initialize an instance of a factory
that will handle the creation and initialization for a container.
```go
factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init"))
if err != nil {
logrus.Fatal(err)
return
}
```
Once you have an instance of the factory created we can create a configuration
struct describing how the container is to be created. A sample would look similar to this:
```go
defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
var devices []*configs.DeviceRule
for _, device := range specconv.AllowedDevices {
devices = append(devices, &device.Rule)
}
config := &configs.Config{
Rootfs: "/your/path/to/rootfs",
Capabilities: &configs.Capabilities{
Bounding: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Effective: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Permitted: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Ambient: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
},
Namespaces: configs.Namespaces([]configs.Namespace{
{Type: configs.NEWNS},
{Type: configs.NEWUTS},
{Type: configs.NEWIPC},
{Type: configs.NEWPID},
{Type: configs.NEWUSER},
{Type: configs.NEWNET},
{Type: configs.NEWCGROUP},
}),
Cgroups: &configs.Cgroup{
Name: "test-container",
Parent: "system",
Resources: &configs.Resources{
MemorySwappiness: nil,
Devices: devices,
},
},
MaskPaths: []string{
"/proc/kcore",
"/sys/firmware",
},
ReadonlyPaths: []string{
"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
},
Devices: specconv.AllowedDevices,
Hostname: "testing",
Mounts: []*configs.Mount{
{
Source: "proc",
Destination: "/proc",
Device: "proc",
Flags: defaultMountFlags,
},
{
Source: "tmpfs",
Destination: "/dev",
Device: "tmpfs",
Flags: unix.MS_NOSUID | unix.MS_STRICTATIME,
Data: "mode=755",
},
{
Source: "devpts",
Destination: "/dev/pts",
Device: "devpts",
Flags: unix.MS_NOSUID | unix.MS_NOEXEC,
Data: "newinstance,ptmxmode=0666,mode=0620,gid=5",
},
{
Device: "tmpfs",
Source: "shm",
Destination: "/dev/shm",
Data: "mode=1777,size=65536k",
Flags: defaultMountFlags,
},
{
Source: "mqueue",
Destination: "/dev/mqueue",
Device: "mqueue",
Flags: defaultMountFlags,
},
{
Source: "sysfs",
Destination: "/sys",
Device: "sysfs",
Flags: defaultMountFlags | unix.MS_RDONLY,
},
},
UidMappings: []configs.IDMap{
{
ContainerID: 0,
HostID: 1000,
Size: 65536,
},
},
GidMappings: []configs.IDMap{
{
ContainerID: 0,
HostID: 1000,
Size: 65536,
},
},
Networks: []*configs.Network{
{
Type: "loopback",
Address: "127.0.0.1/0",
Gateway: "localhost",
},
},
Rlimits: []configs.Rlimit{
{
Type: unix.RLIMIT_NOFILE,
Hard: uint64(1025),
Soft: uint64(1025),
},
},
}
```
Once you have the configuration populated you can create a container:
```go
container, err := factory.Create("container-id", config)
if err != nil {
logrus.Fatal(err)
return
}
```
To spawn bash as the initial process inside the container and have the
processes pid returned in order to wait, signal, or kill the process:
```go
process := &libcontainer.Process{
Args: []string{"/bin/bash"},
Env: []string{"PATH=/bin"},
User: "daemon",
Stdin: os.Stdin,
Stdout: os.Stdout,
Stderr: os.Stderr,
Init: true,
}
err := container.Run(process)
if err != nil {
container.Destroy()
logrus.Fatal(err)
return
}
// wait for the process to finish.
_, err := process.Wait()
if err != nil {
logrus.Fatal(err)
}
// destroy the container.
container.Destroy()
```
Additional ways to interact with a running container are:
```go
// return all the pids for all processes running inside the container.
processes, err := container.Processes()
// get detailed cpu, memory, io, and network statistics for the container and
// it's processes.
stats, err := container.Stats()
// pause all processes inside the container.
container.Pause()
// resume all paused processes.
container.Resume()
// send signal to container's init process.
container.Signal(signal)
// update container resource constraints.
container.Set(config)
// get current status of the container.
status, err := container.Status()
// get current container's state information.
state, err := container.State()
```
#### Checkpoint & Restore
libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers.
This lets you save the state of a process running inside a container to disk, and then restore
that state into a new process, on the same machine or on another machine.
`criu` version 1.5.2 or higher is required to use checkpoint and restore.
If you don't already have `criu` installed, you can build it from source, following the
[online instructions](http://criu.org/Installation). `criu` is also installed in the docker image
generated when building libcontainer with docker.
## Copyright and license
Code and documentation copyright 2014 Docker, inc.
The code and documentation are released under the [Apache 2.0 license](../LICENSE).
The documentation is also released under Creative Commons Attribution 4.0 International License.
You may obtain a copy of the license, titled CC-BY-4.0, at http://creativecommons.org/licenses/by/4.0/.

View File

@@ -1,465 +0,0 @@
## Container Specification - v1
This is the standard configuration for version 1 containers. It includes
namespaces, standard filesystem setup, a default Linux capability set, and
information about resource reservations. It also has information about any
populated environment settings for the processes running inside a container.
Along with the configuration of how a container is created the standard also
discusses actions that can be performed on a container to manage and inspect
information about the processes running inside.
The v1 profile is meant to be able to accommodate the majority of applications
with a strong security configuration.
### System Requirements and Compatibility
Minimum requirements:
* Kernel version - 3.10 recommended 2.6.2x minimum(with backported patches)
* Mounted cgroups with each subsystem in its own hierarchy
### Namespaces
| Flag | Enabled |
| --------------- | ------- |
| CLONE_NEWPID | 1 |
| CLONE_NEWUTS | 1 |
| CLONE_NEWIPC | 1 |
| CLONE_NEWNET | 1 |
| CLONE_NEWNS | 1 |
| CLONE_NEWUSER | 1 |
| CLONE_NEWCGROUP | 1 |
Namespaces are created for the container via the `unshare` syscall.
### Filesystem
A root filesystem must be provided to a container for execution. The container
will use this root filesystem (rootfs) to jail and spawn processes inside where
the binaries and system libraries are local to that directory. Any binaries
to be executed must be contained within this rootfs.
Mounts that happen inside the container are automatically cleaned up when the
container exits as the mount namespace is destroyed and the kernel will
unmount all the mounts that were setup within that namespace.
For a container to execute properly there are certain filesystems that
are required to be mounted within the rootfs that the runtime will setup.
| Path | Type | Flags | Data |
| ----------- | ------ | -------------------------------------- | ---------------------------------------- |
| /proc | proc | MS_NOEXEC,MS_NOSUID,MS_NODEV | |
| /dev | tmpfs | MS_NOEXEC,MS_STRICTATIME | mode=755 |
| /dev/shm | tmpfs | MS_NOEXEC,MS_NOSUID,MS_NODEV | mode=1777,size=65536k |
| /dev/mqueue | mqueue | MS_NOEXEC,MS_NOSUID,MS_NODEV | |
| /dev/pts | devpts | MS_NOEXEC,MS_NOSUID | newinstance,ptmxmode=0666,mode=620,gid=5 |
| /sys | sysfs | MS_NOEXEC,MS_NOSUID,MS_NODEV,MS_RDONLY | |
After a container's filesystems are mounted within the newly created
mount namespace `/dev` will need to be populated with a set of device nodes.
It is expected that a rootfs does not need to have any device nodes specified
for `/dev` within the rootfs as the container will setup the correct devices
that are required for executing a container's process.
| Path | Mode | Access |
| ------------ | ---- | ---------- |
| /dev/null | 0666 | rwm |
| /dev/zero | 0666 | rwm |
| /dev/full | 0666 | rwm |
| /dev/tty | 0666 | rwm |
| /dev/random | 0666 | rwm |
| /dev/urandom | 0666 | rwm |
**ptmx**
`/dev/ptmx` will need to be a symlink to the host's `/dev/ptmx` within
the container.
The use of a pseudo TTY is optional within a container and it should support both.
If a pseudo is provided to the container `/dev/console` will need to be
setup by binding the console in `/dev/` after it has been populated and mounted
in tmpfs.
| Source | Destination | UID GID | Mode | Type |
| --------------- | ------------ | ------- | ---- | ---- |
| *pty host path* | /dev/console | 0 0 | 0600 | bind |
After `/dev/null` has been setup we check for any external links between
the container's io, STDIN, STDOUT, STDERR. If the container's io is pointing
to `/dev/null` outside the container we close and `dup2` the `/dev/null`
that is local to the container's rootfs.
After the container has `/proc` mounted a few standard symlinks are setup
within `/dev/` for the io.
| Source | Destination |
| --------------- | ----------- |
| /proc/self/fd | /dev/fd |
| /proc/self/fd/0 | /dev/stdin |
| /proc/self/fd/1 | /dev/stdout |
| /proc/self/fd/2 | /dev/stderr |
A `pivot_root` is used to change the root for the process, effectively
jailing the process inside the rootfs.
```c
put_old = mkdir(...);
pivot_root(rootfs, put_old);
chdir("/");
unmount(put_old, MS_DETACH);
rmdir(put_old);
```
For container's running with a rootfs inside `ramfs` a `MS_MOVE` combined
with a `chroot` is required as `pivot_root` is not supported in `ramfs`.
```c
mount(rootfs, "/", NULL, MS_MOVE, NULL);
chroot(".");
chdir("/");
```
The `umask` is set back to `0022` after the filesystem setup has been completed.
### Resources
Cgroups are used to handle resource allocation for containers. This includes
system resources like cpu, memory, and device access.
| Subsystem | Enabled |
| ---------- | ------- |
| devices | 1 |
| memory | 1 |
| cpu | 1 |
| cpuacct | 1 |
| cpuset | 1 |
| blkio | 1 |
| perf_event | 1 |
| freezer | 1 |
| hugetlb | 1 |
| pids | 1 |
All cgroup subsystem are joined so that statistics can be collected from
each of the subsystems. Freezer does not expose any stats but is joined
so that containers can be paused and resumed.
The parent process of the container's init must place the init pid inside
the correct cgroups before the initialization begins. This is done so
that no processes or threads escape the cgroups. This sync is
done via a pipe ( specified in the runtime section below ) that the container's
init process will block waiting for the parent to finish setup.
### IntelRdt
Intel platforms with new Xeon CPU support Resource Director Technology (RDT).
Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are
two sub-features of RDT.
Cache Allocation Technology (CAT) provides a way for the software to restrict
cache allocation to a defined 'subset' of L3 cache which may be overlapping
with other 'subsets'. The different subsets are identified by class of
service (CLOS) and each CLOS has a capacity bitmask (CBM).
Memory Bandwidth Allocation (MBA) provides indirect and approximate throttle
over memory bandwidth for the software. A user controls the resource by
indicating the percentage of maximum memory bandwidth or memory bandwidth limit
in MBps unit if MBA Software Controller is enabled.
It can be used to handle L3 cache and memory bandwidth resources allocation
for containers if hardware and kernel support Intel RDT CAT and MBA features.
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
CAT and MBA features are introduced in Linux 4.10 and 4.12 kernel via
"resource control" filesystem.
Intel RDT "resource control" filesystem hierarchy:
```
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| | |-- cbm_mask
| | |-- min_cbm_bits
| | |-- num_closids
| |-- MB
| |-- bandwidth_gran
| |-- delay_linear
| |-- min_bandwidth
| |-- num_closids
|-- ...
|-- schemata
|-- tasks
|-- <container_id>
|-- ...
|-- schemata
|-- tasks
```
For runc, we can make use of `tasks` and `schemata` configuration for L3
cache and memory bandwidth resources constraints.
The file `tasks` has a list of tasks that belongs to this group (e.g.,
<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent.
The file `schemata` has a list of all the resources available to this group.
Each resource (L3 cache, memory bandwidth) has its own line and format.
L3 cache schema:
It has allocation bitmasks/values for L3 cache on each socket, which
contains L3 cache id and capacity bitmask (CBM).
```
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
```
For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0"
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
be set is less than the max bit. The max bits in the CBM is varied among
supported Intel CPU models. Kernel will check if it is valid when writing.
e.g., default value 0xfffff in root indicates the max bits of CBM is 20
bits, which mapping to entire L3 cache capacity. Some valid CBM values to
set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
Memory bandwidth schema:
It has allocation values for memory bandwidth on each socket, which contains
L3 cache id and memory bandwidth.
```
Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
```
For example, on a two-socket machine, the schema line could be "MB:0=20;1=70"
The minimum bandwidth percentage value for each CPU model is predefined and
can be looked up through "info/MB/min_bandwidth". The bandwidth granularity
that is allocated is also dependent on the CPU model and can be looked up at
"info/MB/bandwidth_gran". The available bandwidth control steps are:
min_bw + N * bw_gran. Intermediate values are rounded to the next control
step available on the hardware.
If MBA Software Controller is enabled through mount option "-o mba_MBps"
mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl
We could specify memory bandwidth in "MBps" (Mega Bytes per second) unit
instead of "percentages". The kernel underneath would use a software feedback
mechanism or a "Software Controller" which reads the actual bandwidth using
MBM counters and adjust the memory bandwidth percentages to ensure:
"actual memory bandwidth < user specified memory bandwidth".
For example, on a two-socket machine, the schema line could be
"MB:0=5000;1=7000" which means 5000 MBps memory bandwidth limit on socket 0
and 7000 MBps memory bandwidth limit on socket 1.
For more information about Intel RDT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
```
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10%
with a memory bandwidth granularity of 10%.
Tasks inside the container only have access to the "upper" 7/11 of L3 cache
on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a
maximum memory bandwidth of 20% on socket 0 and 70% on socket 1.
"linux": {
"intelRdt": {
"closID": "guaranteed_group",
"l3CacheSchema": "L3:0=7f0;1=1f",
"memBwSchema": "MB:0=20;1=70"
}
}
```
### Security
The standard set of Linux capabilities that are set in a container
provide a good default for security and flexibility for the applications.
| Capability | Enabled |
| -------------------- | ------- |
| CAP_NET_RAW | 1 |
| CAP_NET_BIND_SERVICE | 1 |
| CAP_AUDIT_READ | 1 |
| CAP_AUDIT_WRITE | 1 |
| CAP_DAC_OVERRIDE | 1 |
| CAP_SETFCAP | 1 |
| CAP_SETPCAP | 1 |
| CAP_SETGID | 1 |
| CAP_SETUID | 1 |
| CAP_MKNOD | 1 |
| CAP_CHOWN | 1 |
| CAP_FOWNER | 1 |
| CAP_FSETID | 1 |
| CAP_KILL | 1 |
| CAP_SYS_CHROOT | 1 |
| CAP_NET_BROADCAST | 0 |
| CAP_SYS_MODULE | 0 |
| CAP_SYS_RAWIO | 0 |
| CAP_SYS_PACCT | 0 |
| CAP_SYS_ADMIN | 0 |
| CAP_SYS_NICE | 0 |
| CAP_SYS_RESOURCE | 0 |
| CAP_SYS_TIME | 0 |
| CAP_SYS_TTY_CONFIG | 0 |
| CAP_AUDIT_CONTROL | 0 |
| CAP_MAC_OVERRIDE | 0 |
| CAP_MAC_ADMIN | 0 |
| CAP_NET_ADMIN | 0 |
| CAP_SYSLOG | 0 |
| CAP_DAC_READ_SEARCH | 0 |
| CAP_LINUX_IMMUTABLE | 0 |
| CAP_IPC_LOCK | 0 |
| CAP_IPC_OWNER | 0 |
| CAP_SYS_PTRACE | 0 |
| CAP_SYS_BOOT | 0 |
| CAP_LEASE | 0 |
| CAP_WAKE_ALARM | 0 |
| CAP_BLOCK_SUSPEND | 0 |
Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor)
and [selinux](http://selinuxproject.org/page/Main_Page) can be used with
the containers. A container should support setting an apparmor profile or
selinux process and mount labels if provided in the configuration.
Standard apparmor profile:
```c
#include <tunables/global>
profile <profile_name> flags=(attach_disconnected,mediate_deleted) {
#include <abstractions/base>
network,
capability,
file,
umount,
deny @{PROC}/sys/fs/** wklx,
deny @{PROC}/sysrq-trigger rwklx,
deny @{PROC}/mem rwklx,
deny @{PROC}/kmem rwklx,
deny @{PROC}/sys/kernel/[^s][^h][^m]* wklx,
deny @{PROC}/sys/kernel/*/** wklx,
deny mount,
deny /sys/[^f]*/** wklx,
deny /sys/f[^s]*/** wklx,
deny /sys/fs/[^c]*/** wklx,
deny /sys/fs/c[^g]*/** wklx,
deny /sys/fs/cg[^r]*/** wklx,
deny /sys/firmware/efi/efivars/** rwklx,
deny /sys/kernel/security/** rwklx,
}
```
*TODO: seccomp work is being done to find a good default config*
### Runtime and Init Process
During container creation the parent process needs to talk to the container's init
process and have a form of synchronization. This is accomplished by creating
a pipe that is passed to the container's init. When the init process first spawns
it will block on its side of the pipe until the parent closes its side. This
allows the parent to have time to set the new process inside a cgroup hierarchy
and/or write any uid/gid mappings required for user namespaces.
The pipe is passed to the init process via FD 3.
The application consuming libcontainer should be compiled statically. libcontainer
does not define any init process and the arguments provided are used to `exec` the
process inside the application. There should be no long running init within the
container spec.
If a pseudo tty is provided to a container it will open and `dup2` the console
as the container's STDIN, STDOUT, STDERR as well as mounting the console
as `/dev/console`.
An extra set of mounts are provided to a container and setup for use. A container's
rootfs can contain some non portable files inside that can cause side effects during
execution of a process. These files are usually created and populated with the container
specific information via the runtime.
**Extra runtime files:**
* /etc/hosts
* /etc/resolv.conf
* /etc/hostname
* /etc/localtime
#### Defaults
There are a few defaults that can be overridden by users, but in their omission
these apply to processes within a container.
| Type | Value |
| ------------------- | ------------------------------ |
| Parent Death Signal | SIGKILL |
| UID | 0 |
| GID | 0 |
| GROUPS | 0, NULL |
| CWD | "/" |
| $HOME | Current user's home dir or "/" |
| Readonly rootfs | false |
| Pseudo TTY | false |
## Actions
After a container is created there is a standard set of actions that can
be done to the container. These actions are part of the public API for
a container.
| Action | Description |
| -------------- | ------------------------------------------------------------------ |
| Get processes | Return all the pids for processes running inside a container |
| Get Stats | Return resource statistics for the container as a whole |
| Wait | Waits on the container's init process ( pid 1 ) |
| Wait Process | Wait on any of the container's processes returning the exit status |
| Destroy | Kill the container's init process and remove any filesystem state |
| Signal | Send a signal to the container's init process |
| Signal Process | Send a signal to any of the container's processes |
| Pause | Pause all processes inside the container |
| Resume | Resume all processes inside the container if paused |
| Exec | Execute a new process inside of the container ( requires setns ) |
| Set | Setup configs of the container after it's created |
### Execute a new process inside of a running container
User can execute a new process inside of a running container. Any binaries to be
executed must be accessible within the container's rootfs.
The started process will run inside the container's rootfs. Any changes
made by the process to the container's filesystem will persist after the
process finished executing.
The started process will join all the container's existing namespaces. When the
container is paused, the process will also be paused and will resume when
the container is unpaused. The started process will only run when the container's
primary process (PID 1) is running, and will not be restarted when the container
is restarted.
#### Planned additions
The started process will have its own cgroups nested inside the container's
cgroups. This is used for process tracking and optionally resource allocation
handling for the new process. Freezer cgroup is required, the rest of the cgroups
are optional. The process executor must place its pid inside the correct
cgroups before starting the process. This is done so that no child processes or
threads can escape the cgroups.
When the process is stopped, the process executor will try (in a best-effort way)
to stop all its children and remove the sub-cgroups.

View File

@@ -1,16 +0,0 @@
package apparmor
import "errors"
var (
// IsEnabled returns true if apparmor is enabled for the host.
IsEnabled = isEnabled
// ApplyProfile will apply the profile with the specified name to the process after
// the next exec. It is only supported on Linux and produces an ErrApparmorNotEnabled
// on other platforms.
ApplyProfile = applyProfile
// ErrApparmorNotEnabled indicates that AppArmor is not enabled or not supported.
ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported")
)

View File

@@ -1,68 +0,0 @@
package apparmor
import (
"errors"
"fmt"
"os"
"sync"
"github.com/opencontainers/runc/libcontainer/utils"
)
var (
appArmorEnabled bool
checkAppArmor sync.Once
)
// isEnabled returns true if apparmor is enabled for the host.
func isEnabled() bool {
checkAppArmor.Do(func() {
if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil {
buf, err := os.ReadFile("/sys/module/apparmor/parameters/enabled")
appArmorEnabled = err == nil && len(buf) > 1 && buf[0] == 'Y'
}
})
return appArmorEnabled
}
func setProcAttr(attr, value string) error {
// Under AppArmor you can only change your own attr, so use /proc/self/
// instead of /proc/<tid>/ like libapparmor does
attrPath := "/proc/self/attr/apparmor/" + attr
if _, err := os.Stat(attrPath); errors.Is(err, os.ErrNotExist) {
// fall back to the old convention
attrPath = "/proc/self/attr/" + attr
}
f, err := os.OpenFile(attrPath, os.O_WRONLY, 0)
if err != nil {
return err
}
defer f.Close()
if err := utils.EnsureProcHandle(f); err != nil {
return err
}
_, err = f.WriteString(value)
return err
}
// changeOnExec reimplements aa_change_onexec from libapparmor in Go
func changeOnExec(name string) error {
if err := setProcAttr("exec", "exec "+name); err != nil {
return fmt.Errorf("apparmor failed to apply profile: %w", err)
}
return nil
}
// applyProfile will apply the profile with the specified name to the process after
// the next exec. It is only supported on Linux and produces an error on other
// platforms.
func applyProfile(name string) error {
if name == "" {
return nil
}
return changeOnExec(name)
}

View File

@@ -1,15 +0,0 @@
//go:build !linux
// +build !linux
package apparmor
func isEnabled() bool {
return false
}
func applyProfile(name string) error {
if name != "" {
return ErrApparmorNotEnabled
}
return nil
}

View File

@@ -1,123 +0,0 @@
//go:build linux
// +build linux
package capabilities
import (
"sort"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
"github.com/syndtr/gocapability/capability"
)
const allCapabilityTypes = capability.CAPS | capability.BOUNDING | capability.AMBIENT
var (
capabilityMap map[string]capability.Cap
capTypes = []capability.CapType{
capability.BOUNDING,
capability.PERMITTED,
capability.INHERITABLE,
capability.EFFECTIVE,
capability.AMBIENT,
}
)
func init() {
capabilityMap = make(map[string]capability.Cap, capability.CAP_LAST_CAP+1)
for _, c := range capability.List() {
if c > capability.CAP_LAST_CAP {
continue
}
capabilityMap["CAP_"+strings.ToUpper(c.String())] = c
}
}
// KnownCapabilities returns the list of the known capabilities.
// Used by `runc features`.
func KnownCapabilities() []string {
list := capability.List()
res := make([]string, len(list))
for i, c := range list {
res[i] = "CAP_" + strings.ToUpper(c.String())
}
return res
}
// New creates a new Caps from the given Capabilities config. Unknown Capabilities
// or Capabilities that are unavailable in the current environment are ignored,
// printing a warning instead.
func New(capConfig *configs.Capabilities) (*Caps, error) {
var (
err error
c Caps
)
unknownCaps := make(map[string]struct{})
c.caps = map[capability.CapType][]capability.Cap{
capability.BOUNDING: capSlice(capConfig.Bounding, unknownCaps),
capability.EFFECTIVE: capSlice(capConfig.Effective, unknownCaps),
capability.INHERITABLE: capSlice(capConfig.Inheritable, unknownCaps),
capability.PERMITTED: capSlice(capConfig.Permitted, unknownCaps),
capability.AMBIENT: capSlice(capConfig.Ambient, unknownCaps),
}
if c.pid, err = capability.NewPid2(0); err != nil {
return nil, err
}
if err = c.pid.Load(); err != nil {
return nil, err
}
if len(unknownCaps) > 0 {
logrus.Warn("ignoring unknown or unavailable capabilities: ", mapKeys(unknownCaps))
}
return &c, nil
}
// capSlice converts the slice of capability names in caps, to their numeric
// equivalent, and returns them as a slice. Unknown or unavailable capabilities
// are not returned, but appended to unknownCaps.
func capSlice(caps []string, unknownCaps map[string]struct{}) []capability.Cap {
var out []capability.Cap
for _, c := range caps {
if v, ok := capabilityMap[c]; !ok {
unknownCaps[c] = struct{}{}
} else {
out = append(out, v)
}
}
return out
}
// mapKeys returns the keys of input in sorted order
func mapKeys(input map[string]struct{}) []string {
var keys []string
for c := range input {
keys = append(keys, c)
}
sort.Strings(keys)
return keys
}
// Caps holds the capabilities for a container.
type Caps struct {
pid capability.Capabilities
caps map[capability.CapType][]capability.Cap
}
// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist.
func (c *Caps) ApplyBoundingSet() error {
c.pid.Clear(capability.BOUNDING)
c.pid.Set(capability.BOUNDING, c.caps[capability.BOUNDING]...)
return c.pid.Apply(capability.BOUNDING)
}
// Apply sets all the capabilities for the current process in the config.
func (c *Caps) ApplyCaps() error {
c.pid.Clear(allCapabilityTypes)
for _, g := range capTypes {
c.pid.Set(g, c.caps[g]...)
}
return c.pid.Apply(allCapabilityTypes)
}

View File

@@ -1,4 +0,0 @@
//go:build !linux
// +build !linux
package capabilities

View File

@@ -1,86 +0,0 @@
package validate
import (
"errors"
"fmt"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
)
// rootlessEUID makes sure that the config can be applied when runc
// is being executed as a non-root user (euid != 0) in the current user namespace.
func (v *ConfigValidator) rootlessEUID(config *configs.Config) error {
if !config.RootlessEUID {
return nil
}
if err := rootlessEUIDMappings(config); err != nil {
return err
}
if err := rootlessEUIDMount(config); err != nil {
return err
}
// XXX: We currently can't verify the user config at all, because
// configs.Config doesn't store the user-related configs. So this
// has to be verified by setupUser() in init_linux.go.
return nil
}
func rootlessEUIDMappings(config *configs.Config) error {
if !config.Namespaces.Contains(configs.NEWUSER) {
return errors.New("rootless container requires user namespaces")
}
// We only require mappings if we are not joining another userns.
if path := config.Namespaces.PathOf(configs.NEWUSER); path == "" {
if len(config.UidMappings) == 0 {
return errors.New("rootless containers requires at least one UID mapping")
}
if len(config.GidMappings) == 0 {
return errors.New("rootless containers requires at least one GID mapping")
}
}
return nil
}
// mount verifies that the user isn't trying to set up any mounts they don't have
// the rights to do. In addition, it makes sure that no mount has a `uid=` or
// `gid=` option that doesn't resolve to root.
func rootlessEUIDMount(config *configs.Config) error {
// XXX: We could whitelist allowed devices at this point, but I'm not
// convinced that's a good idea. The kernel is the best arbiter of
// access control.
for _, mount := range config.Mounts {
// Check that the options list doesn't contain any uid= or gid= entries
// that don't resolve to root.
for _, opt := range strings.Split(mount.Data, ",") {
if strings.HasPrefix(opt, "uid=") {
var uid int
n, err := fmt.Sscanf(opt, "uid=%d", &uid)
if n != 1 || err != nil {
// Ignore unknown mount options.
continue
}
if _, err := config.HostUID(uid); err != nil {
return fmt.Errorf("cannot specify uid=%d mount option for rootless container: %w", uid, err)
}
}
if strings.HasPrefix(opt, "gid=") {
var gid int
n, err := fmt.Sscanf(opt, "gid=%d", &gid)
if n != 1 || err != nil {
// Ignore unknown mount options.
continue
}
if _, err := config.HostGID(gid); err != nil {
return fmt.Errorf("cannot specify gid=%d mount option for rootless container: %w", gid, err)
}
}
}
}
return nil
}

View File

@@ -1,306 +0,0 @@
package validate
import (
"errors"
"fmt"
"os"
"path/filepath"
"strings"
"sync"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/intelrdt"
selinux "github.com/opencontainers/selinux/go-selinux"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
type Validator interface {
Validate(*configs.Config) error
}
func New() Validator {
return &ConfigValidator{}
}
type ConfigValidator struct{}
type check func(config *configs.Config) error
func (v *ConfigValidator) Validate(config *configs.Config) error {
checks := []check{
v.cgroups,
v.rootfs,
v.network,
v.hostname,
v.security,
v.usernamespace,
v.cgroupnamespace,
v.sysctl,
v.intelrdt,
v.rootlessEUID,
}
for _, c := range checks {
if err := c(config); err != nil {
return err
}
}
// Relaxed validation rules for backward compatibility
warns := []check{
v.mounts, // TODO (runc v1.x.x): make this an error instead of a warning
}
for _, c := range warns {
if err := c(config); err != nil {
logrus.WithError(err).Warn("invalid configuration")
}
}
return nil
}
// rootfs validates if the rootfs is an absolute path and is not a symlink
// to the container's root filesystem.
func (v *ConfigValidator) rootfs(config *configs.Config) error {
if _, err := os.Stat(config.Rootfs); err != nil {
return fmt.Errorf("invalid rootfs: %w", err)
}
cleaned, err := filepath.Abs(config.Rootfs)
if err != nil {
return fmt.Errorf("invalid rootfs: %w", err)
}
if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil {
return fmt.Errorf("invalid rootfs: %w", err)
}
if filepath.Clean(config.Rootfs) != cleaned {
return errors.New("invalid rootfs: not an absolute path, or a symlink")
}
return nil
}
func (v *ConfigValidator) network(config *configs.Config) error {
if !config.Namespaces.Contains(configs.NEWNET) {
if len(config.Networks) > 0 || len(config.Routes) > 0 {
return errors.New("unable to apply network settings without a private NET namespace")
}
}
return nil
}
func (v *ConfigValidator) hostname(config *configs.Config) error {
if config.Hostname != "" && !config.Namespaces.Contains(configs.NEWUTS) {
return errors.New("unable to set hostname without a private UTS namespace")
}
return nil
}
func (v *ConfigValidator) security(config *configs.Config) error {
// restrict sys without mount namespace
if (len(config.MaskPaths) > 0 || len(config.ReadonlyPaths) > 0) &&
!config.Namespaces.Contains(configs.NEWNS) {
return errors.New("unable to restrict sys entries without a private MNT namespace")
}
if config.ProcessLabel != "" && !selinux.GetEnabled() {
return errors.New("selinux label is specified in config, but selinux is disabled or not supported")
}
return nil
}
func (v *ConfigValidator) usernamespace(config *configs.Config) error {
if config.Namespaces.Contains(configs.NEWUSER) {
if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
return errors.New("user namespaces aren't enabled in the kernel")
}
hasPath := config.Namespaces.PathOf(configs.NEWUSER) != ""
hasMappings := config.UidMappings != nil || config.GidMappings != nil
if !hasPath && !hasMappings {
return errors.New("user namespaces enabled, but no namespace path to join nor mappings to apply specified")
}
// The hasPath && hasMappings validation case is handled in specconv --
// we cache the mappings in Config during specconv in the hasPath case,
// so we cannot do that validation here.
} else {
if config.UidMappings != nil || config.GidMappings != nil {
return errors.New("user namespace mappings specified, but user namespace isn't enabled in the config")
}
}
return nil
}
func (v *ConfigValidator) cgroupnamespace(config *configs.Config) error {
if config.Namespaces.Contains(configs.NEWCGROUP) {
if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
return errors.New("cgroup namespaces aren't enabled in the kernel")
}
}
return nil
}
// convertSysctlVariableToDotsSeparator can return sysctl variables in dots separator format.
// The '/' separator is also accepted in place of a '.'.
// Convert the sysctl variables to dots separator format for validation.
// More info: sysctl(8), sysctl.d(5).
//
// For example:
// Input sysctl variable "net/ipv4/conf/eno2.100.rp_filter"
// will return the converted value "net.ipv4.conf.eno2/100.rp_filter"
func convertSysctlVariableToDotsSeparator(val string) string {
if val == "" {
return val
}
firstSepIndex := strings.IndexAny(val, "./")
if firstSepIndex == -1 || val[firstSepIndex] == '.' {
return val
}
f := func(r rune) rune {
switch r {
case '.':
return '/'
case '/':
return '.'
}
return r
}
return strings.Map(f, val)
}
// sysctl validates that the specified sysctl keys are valid or not.
// /proc/sys isn't completely namespaced and depending on which namespaces
// are specified, a subset of sysctls are permitted.
func (v *ConfigValidator) sysctl(config *configs.Config) error {
validSysctlMap := map[string]bool{
"kernel.msgmax": true,
"kernel.msgmnb": true,
"kernel.msgmni": true,
"kernel.sem": true,
"kernel.shmall": true,
"kernel.shmmax": true,
"kernel.shmmni": true,
"kernel.shm_rmid_forced": true,
}
var (
netOnce sync.Once
hostnet bool
hostnetErr error
)
for s := range config.Sysctl {
s := convertSysctlVariableToDotsSeparator(s)
if validSysctlMap[s] || strings.HasPrefix(s, "fs.mqueue.") {
if config.Namespaces.Contains(configs.NEWIPC) {
continue
} else {
return fmt.Errorf("sysctl %q is not allowed in the hosts ipc namespace", s)
}
}
if strings.HasPrefix(s, "net.") {
// Is container using host netns?
// Here "host" means "current", not "initial".
netOnce.Do(func() {
if !config.Namespaces.Contains(configs.NEWNET) {
hostnet = true
return
}
path := config.Namespaces.PathOf(configs.NEWNET)
if path == "" {
// own netns, so hostnet = false
return
}
hostnet, hostnetErr = isHostNetNS(path)
})
if hostnetErr != nil {
return fmt.Errorf("invalid netns path: %w", hostnetErr)
}
if hostnet {
return fmt.Errorf("sysctl %q not allowed in host network namespace", s)
}
continue
}
if config.Namespaces.Contains(configs.NEWUTS) {
switch s {
case "kernel.domainname":
// This is namespaced and there's no explicit OCI field for it.
continue
case "kernel.hostname":
// This is namespaced but there's a conflicting (dedicated) OCI field for it.
return fmt.Errorf("sysctl %q is not allowed as it conflicts with the OCI %q field", s, "hostname")
}
}
return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s)
}
return nil
}
func (v *ConfigValidator) intelrdt(config *configs.Config) error {
if config.IntelRdt != nil {
if config.IntelRdt.ClosID == "." || config.IntelRdt.ClosID == ".." || strings.Contains(config.IntelRdt.ClosID, "/") {
return fmt.Errorf("invalid intelRdt.ClosID %q", config.IntelRdt.ClosID)
}
if !intelrdt.IsCATEnabled() && config.IntelRdt.L3CacheSchema != "" {
return errors.New("intelRdt.l3CacheSchema is specified in config, but Intel RDT/CAT is not enabled")
}
if !intelrdt.IsMBAEnabled() && config.IntelRdt.MemBwSchema != "" {
return errors.New("intelRdt.memBwSchema is specified in config, but Intel RDT/MBA is not enabled")
}
}
return nil
}
func (v *ConfigValidator) cgroups(config *configs.Config) error {
c := config.Cgroups
if c == nil {
return nil
}
if (c.Name != "" || c.Parent != "") && c.Path != "" {
return fmt.Errorf("cgroup: either Path or Name and Parent should be used, got %+v", c)
}
r := c.Resources
if r == nil {
return nil
}
if !cgroups.IsCgroup2UnifiedMode() && r.Unified != nil {
return cgroups.ErrV1NoUnified
}
if cgroups.IsCgroup2UnifiedMode() {
_, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
if err != nil {
return err
}
}
return nil
}
func (v *ConfigValidator) mounts(config *configs.Config) error {
for _, m := range config.Mounts {
if !filepath.IsAbs(m.Destination) {
return fmt.Errorf("invalid mount %+v: mount destination not absolute", m)
}
}
return nil
}
func isHostNetNS(path string) (bool, error) {
const currentProcessNetns = "/proc/self/ns/net"
var st1, st2 unix.Stat_t
if err := unix.Stat(currentProcessNetns, &st1); err != nil {
return false, &os.PathError{Op: "stat", Path: currentProcessNetns, Err: err}
}
if err := unix.Stat(path, &st2); err != nil {
return false, &os.PathError{Op: "stat", Path: path, Err: err}
}
return (st1.Dev == st2.Dev) && (st1.Ino == st2.Ino), nil
}

View File

@@ -1,41 +0,0 @@
package libcontainer
import (
"os"
"golang.org/x/sys/unix"
)
// mount initializes the console inside the rootfs mounting with the specified mount label
// and applying the correct ownership of the console.
func mountConsole(slavePath string) error {
oldMask := unix.Umask(0o000)
defer unix.Umask(oldMask)
f, err := os.Create("/dev/console")
if err != nil && !os.IsExist(err) {
return err
}
if f != nil {
f.Close()
}
return mount(slavePath, "/dev/console", "", "bind", unix.MS_BIND, "")
}
// dupStdio opens the slavePath for the console and dups the fds to the current
// processes stdio, fd 0,1,2.
func dupStdio(slavePath string) error {
fd, err := unix.Open(slavePath, unix.O_RDWR, 0)
if err != nil {
return &os.PathError{
Op: "open",
Path: slavePath,
Err: err,
}
}
for _, i := range []int{0, 1, 2} {
if err := unix.Dup3(fd, i, 0); err != nil {
return err
}
}
return nil
}

View File

@@ -1,130 +0,0 @@
// Package libcontainer provides a native Go implementation for creating containers
// with namespaces, cgroups, capabilities, and filesystem access controls.
// It allows you to manage the lifecycle of the container performing additional operations
// after the container is created.
package libcontainer
import (
"os"
"time"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runtime-spec/specs-go"
)
// Status is the status of a container.
type Status int
const (
// Created is the status that denotes the container exists but has not been run yet.
Created Status = iota
// Running is the status that denotes the container exists and is running.
Running
// Pausing is the status that denotes the container exists, it is in the process of being paused.
Pausing
// Paused is the status that denotes the container exists, but all its processes are paused.
Paused
// Stopped is the status that denotes the container does not have a created or running process.
Stopped
)
func (s Status) String() string {
switch s {
case Created:
return "created"
case Running:
return "running"
case Pausing:
return "pausing"
case Paused:
return "paused"
case Stopped:
return "stopped"
default:
return "unknown"
}
}
// BaseState represents the platform agnostic pieces relating to a
// running container's state
type BaseState struct {
// ID is the container ID.
ID string `json:"id"`
// InitProcessPid is the init process id in the parent namespace.
InitProcessPid int `json:"init_process_pid"`
// InitProcessStartTime is the init process start time in clock cycles since boot time.
InitProcessStartTime uint64 `json:"init_process_start"`
// Created is the unix timestamp for the creation time of the container in UTC
Created time.Time `json:"created"`
// Config is the container's configuration.
Config configs.Config `json:"config"`
}
// BaseContainer is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
// was not found. BaseContainer includes methods that are platform agnostic.
type BaseContainer interface {
// Returns the ID of the container
ID() string
// Returns the current status of the container.
Status() (Status, error)
// State returns the current container's state information.
State() (*State, error)
// OCIState returns the current container's state information.
OCIState() (*specs.State, error)
// Returns the current config of the container.
Config() configs.Config
// Returns the PIDs inside this container. The PIDs are in the namespace of the calling process.
//
// Some of the returned PIDs may no longer refer to processes in the Container, unless
// the Container state is PAUSED in which case every PID in the slice is valid.
Processes() ([]int, error)
// Returns statistics for the container.
Stats() (*Stats, error)
// Set resources of container as configured
//
// We can use this to change resources when containers are running.
//
Set(config configs.Config) error
// Start a process inside the container. Returns error if process fails to
// start. You can track process lifecycle with passed Process structure.
Start(process *Process) (err error)
// Run immediately starts the process inside the container. Returns error if process
// fails to start. It does not block waiting for the exec fifo after start returns but
// opens the fifo after start returns.
Run(process *Process) (err error)
// Destroys the container, if its in a valid state, after killing any
// remaining running processes.
//
// Any event registrations are removed before the container is destroyed.
// No error is returned if the container is already destroyed.
//
// Running containers must first be stopped using Signal(..).
// Paused containers must first be resumed using Resume(..).
Destroy() error
// Signal sends the provided signal code to the container's initial process.
//
// If all is specified the signal is sent to all processes in the container
// including the initial process.
Signal(s os.Signal, all bool) error
// Exec signals the container to exec the users process at the end of the init.
Exec() error
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,34 +0,0 @@
package libcontainer
import criu "github.com/checkpoint-restore/go-criu/v5/rpc"
type CriuPageServerInfo struct {
Address string // IP address of CRIU page server
Port int32 // port number of CRIU page server
}
type VethPairName struct {
ContainerInterfaceName string
HostInterfaceName string
}
type CriuOpts struct {
ImagesDirectory string // directory for storing image files
WorkDirectory string // directory to cd and write logs/pidfiles/stats to
ParentImage string // directory for storing parent image files in pre-dump and dump
LeaveRunning bool // leave container in running state after checkpoint
TcpEstablished bool // checkpoint/restore established TCP connections
ExternalUnixConnections bool // allow external unix connections
ShellJob bool // allow to dump and restore shell jobs
FileLocks bool // handle file locks, for safety
PreDump bool // call criu predump to perform iterative checkpoint
PageServer CriuPageServerInfo // allow to dump to criu page server
VethPairs []VethPairName // pass the veth to criu when restore
ManageCgroupsMode criu.CriuCgMode // dump or restore cgroup mode
EmptyNs uint32 // don't c/r properties for namespace from this mask
AutoDedup bool // auto deduplication for incremental dumps
LazyPages bool // restore memory pages lazily using userfaultfd
StatusFd int // fd for feedback when lazy server is ready
LsmProfile string // LSM profile used to restore the container
LsmMountContext string // LSM mount context value to use during restore
}

View File

@@ -1,17 +0,0 @@
//go:build !go1.20
// +build !go1.20
package libcontainer
import "golang.org/x/sys/unix"
func eaccess(path string) error {
// This check is similar to access(2) with X_OK except for
// setuid/setgid binaries where it checks against the effective
// (rather than real) uid and gid. It is not needed in go 1.20
// and beyond and will be removed later.
// Relies on code added in https://go-review.googlesource.com/c/sys/+/468877
// and older CLs linked from there.
return unix.Faccessat(unix.AT_FDCWD, path, unix.X_OK, unix.AT_EACCESS)
}

View File

@@ -1,10 +0,0 @@
//go:build go1.20
package libcontainer
func eaccess(path string) error {
// Not needed in Go 1.20+ as the functionality is already in there
// (added by https://go.dev/cl/416115, https://go.dev/cl/414824,
// and fixed in Go 1.20.2 by https://go.dev/cl/469956).
return nil
}

View File

@@ -1,13 +0,0 @@
package libcontainer
import "errors"
var (
ErrExist = errors.New("container with given ID already exists")
ErrInvalidID = errors.New("invalid container ID format")
ErrNotExist = errors.New("container does not exist")
ErrPaused = errors.New("container paused")
ErrRunning = errors.New("container still running")
ErrNotRunning = errors.New("container not running")
ErrNotPaused = errors.New("container not paused")
)

View File

@@ -1,30 +0,0 @@
package libcontainer
import (
"github.com/opencontainers/runc/libcontainer/configs"
)
type Factory interface {
// Creates a new container with the given id and starts the initial process inside it.
// id must be a string containing only letters, digits and underscores and must contain
// between 1 and 1024 characters, inclusive.
//
// The id must not already be in use by an existing container. Containers created using
// a factory with the same path (and filesystem) must have distinct ids.
//
// Returns the new container with a running process.
//
// On error, any partially created container parts are cleaned up (the operation is atomic).
Create(id string, config *configs.Config) (Container, error)
// Load takes an ID for an existing container and returns the container information
// from the state. This presents a read only view of the container.
Load(id string) (Container, error)
// StartInitialization is an internal API to libcontainer used during the reexec of the
// container.
StartInitialization() error
// Type returns info string about factory type (e.g. lxc, libcontainer...)
Type() string
}

View File

@@ -1,402 +0,0 @@
package libcontainer
import (
"encoding/json"
"errors"
"fmt"
"os"
"path/filepath"
"regexp"
"runtime/debug"
"strconv"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/moby/sys/mountinfo"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups/manager"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/configs/validate"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/sirupsen/logrus"
)
const (
stateFilename = "state.json"
execFifoFilename = "exec.fifo"
)
var idRegex = regexp.MustCompile(`^[\w+-\.]+$`)
// InitArgs returns an options func to configure a LinuxFactory with the
// provided init binary path and arguments.
func InitArgs(args ...string) func(*LinuxFactory) error {
return func(l *LinuxFactory) (err error) {
if len(args) > 0 {
// Resolve relative paths to ensure that its available
// after directory changes.
if args[0], err = filepath.Abs(args[0]); err != nil {
// The only error returned from filepath.Abs is
// the one from os.Getwd, i.e. a system error.
return err
}
}
l.InitArgs = args
return nil
}
}
// TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs.
func TmpfsRoot(l *LinuxFactory) error {
mounted, err := mountinfo.Mounted(l.Root)
if err != nil {
return err
}
if !mounted {
if err := mount("tmpfs", l.Root, "", "tmpfs", 0, ""); err != nil {
return err
}
}
return nil
}
// CriuPath returns an option func to configure a LinuxFactory with the
// provided criupath
func CriuPath(criupath string) func(*LinuxFactory) error {
return func(l *LinuxFactory) error {
l.CriuPath = criupath
return nil
}
}
// New returns a linux based container factory based in the root directory and
// configures the factory with the provided option funcs.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
if root != "" {
if err := os.MkdirAll(root, 0o700); err != nil {
return nil, err
}
}
l := &LinuxFactory{
Root: root,
InitPath: "/proc/self/exe",
InitArgs: []string{os.Args[0], "init"},
Validator: validate.New(),
CriuPath: "criu",
}
for _, opt := range options {
if opt == nil {
continue
}
if err := opt(l); err != nil {
return nil, err
}
}
return l, nil
}
// LinuxFactory implements the default factory interface for linux based systems.
type LinuxFactory struct {
// Root directory for the factory to store state.
Root string
// InitPath is the path for calling the init responsibilities for spawning
// a container.
InitPath string
// InitArgs are arguments for calling the init responsibilities for spawning
// a container.
InitArgs []string
// CriuPath is the path to the criu binary used for checkpoint and restore of
// containers.
CriuPath string
// New{u,g}idmapPath is the path to the binaries used for mapping with
// rootless containers.
NewuidmapPath string
NewgidmapPath string
// Validator provides validation to container configurations.
Validator validate.Validator
}
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
if l.Root == "" {
return nil, errors.New("root not set")
}
if err := l.validateID(id); err != nil {
return nil, err
}
if err := l.Validator.Validate(config); err != nil {
return nil, err
}
containerRoot, err := securejoin.SecureJoin(l.Root, id)
if err != nil {
return nil, err
}
if _, err := os.Stat(containerRoot); err == nil {
return nil, ErrExist
} else if !os.IsNotExist(err) {
return nil, err
}
cm, err := manager.New(config.Cgroups)
if err != nil {
return nil, err
}
// Check that cgroup does not exist or empty (no processes).
// Note for cgroup v1 this check is not thorough, as there are multiple
// separate hierarchies, while both Exists() and GetAllPids() only use
// one for "devices" controller (assuming others are the same, which is
// probably true in almost all scenarios). Checking all the hierarchies
// would be too expensive.
if cm.Exists() {
pids, err := cm.GetAllPids()
// Reading PIDs can race with cgroups removal, so ignore ENOENT and ENODEV.
if err != nil && !errors.Is(err, os.ErrNotExist) && !errors.Is(err, unix.ENODEV) {
return nil, fmt.Errorf("unable to get cgroup PIDs: %w", err)
}
if len(pids) != 0 {
if config.Cgroups.Systemd {
// systemd cgroup driver can't add a pid to an
// existing systemd unit and will return an
// error anyway, so let's error out early.
return nil, fmt.Errorf("container's cgroup is not empty: %d process(es) found", len(pids))
}
// TODO: return an error.
logrus.Warnf("container's cgroup is not empty: %d process(es) found", len(pids))
logrus.Warn("DEPRECATED: running container in a non-empty cgroup won't be supported in runc 1.2; https://github.com/opencontainers/runc/issues/3132")
}
}
// Check that cgroup is not frozen. Do not use Exists() here
// since in cgroup v1 it only checks "devices" controller.
st, err := cm.GetFreezerState()
if err != nil {
return nil, fmt.Errorf("unable to get cgroup freezer state: %w", err)
}
if st == configs.Frozen {
return nil, errors.New("container's cgroup unexpectedly frozen")
}
if err := os.MkdirAll(containerRoot, 0o711); err != nil {
return nil, err
}
if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil {
return nil, err
}
c := &linuxContainer{
id: id,
root: containerRoot,
config: config,
initPath: l.InitPath,
initArgs: l.InitArgs,
criuPath: l.CriuPath,
newuidmapPath: l.NewuidmapPath,
newgidmapPath: l.NewgidmapPath,
cgroupManager: cm,
intelRdtManager: intelrdt.NewManager(config, id, ""),
}
c.state = &stoppedState{c: c}
return c, nil
}
func (l *LinuxFactory) Load(id string) (Container, error) {
if l.Root == "" {
return nil, errors.New("root not set")
}
// when load, we need to check id is valid or not.
if err := l.validateID(id); err != nil {
return nil, err
}
containerRoot, err := securejoin.SecureJoin(l.Root, id)
if err != nil {
return nil, err
}
state, err := l.loadState(containerRoot)
if err != nil {
return nil, err
}
r := &nonChildProcess{
processPid: state.InitProcessPid,
processStartTime: state.InitProcessStartTime,
fds: state.ExternalDescriptors,
}
cm, err := manager.NewWithPaths(state.Config.Cgroups, state.CgroupPaths)
if err != nil {
return nil, err
}
c := &linuxContainer{
initProcess: r,
initProcessStartTime: state.InitProcessStartTime,
id: id,
config: &state.Config,
initPath: l.InitPath,
initArgs: l.InitArgs,
criuPath: l.CriuPath,
newuidmapPath: l.NewuidmapPath,
newgidmapPath: l.NewgidmapPath,
cgroupManager: cm,
intelRdtManager: intelrdt.NewManager(&state.Config, id, state.IntelRdtPath),
root: containerRoot,
created: state.Created,
}
c.state = &loadedState{c: c}
if err := c.refreshState(); err != nil {
return nil, err
}
return c, nil
}
func (l *LinuxFactory) Type() string {
return "libcontainer"
}
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally
func (l *LinuxFactory) StartInitialization() (err error) {
// Get the INITPIPE.
envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE")
pipefd, err := strconv.Atoi(envInitPipe)
if err != nil {
err = fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE: %w", err)
logrus.Error(err)
return err
}
pipe := os.NewFile(uintptr(pipefd), "pipe")
defer pipe.Close()
defer func() {
// We have an error during the initialization of the container's init,
// send it back to the parent process in the form of an initError.
if werr := writeSync(pipe, procError); werr != nil {
fmt.Fprintln(os.Stderr, err)
return
}
if werr := utils.WriteJSON(pipe, &initError{Message: err.Error()}); werr != nil {
fmt.Fprintln(os.Stderr, err)
return
}
}()
// Only init processes have FIFOFD.
fifofd := -1
envInitType := os.Getenv("_LIBCONTAINER_INITTYPE")
it := initType(envInitType)
if it == initStandard {
envFifoFd := os.Getenv("_LIBCONTAINER_FIFOFD")
if fifofd, err = strconv.Atoi(envFifoFd); err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD: %w", err)
}
}
var consoleSocket *os.File
if envConsole := os.Getenv("_LIBCONTAINER_CONSOLE"); envConsole != "" {
console, err := strconv.Atoi(envConsole)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE: %w", err)
}
consoleSocket = os.NewFile(uintptr(console), "console-socket")
defer consoleSocket.Close()
}
logPipeFdStr := os.Getenv("_LIBCONTAINER_LOGPIPE")
logPipeFd, err := strconv.Atoi(logPipeFdStr)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE: %w", err)
}
// Get mount files (O_PATH).
mountFds, err := parseMountFds()
if err != nil {
return err
}
// clear the current process's environment to clean any libcontainer
// specific env vars.
os.Clearenv()
defer func() {
if e := recover(); e != nil {
if ee, ok := e.(error); ok {
err = fmt.Errorf("panic from initialization: %w, %s", ee, debug.Stack())
} else {
err = fmt.Errorf("panic from initialization: %v, %s", e, debug.Stack())
}
}
}()
i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd, mountFds)
if err != nil {
return err
}
// If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
return i.Init()
}
func (l *LinuxFactory) loadState(root string) (*State, error) {
stateFilePath, err := securejoin.SecureJoin(root, stateFilename)
if err != nil {
return nil, err
}
f, err := os.Open(stateFilePath)
if err != nil {
if os.IsNotExist(err) {
return nil, ErrNotExist
}
return nil, err
}
defer f.Close()
var state *State
if err := json.NewDecoder(f).Decode(&state); err != nil {
return nil, err
}
return state, nil
}
func (l *LinuxFactory) validateID(id string) error {
if !idRegex.MatchString(id) || string(os.PathSeparator)+id != utils.CleanPath(string(os.PathSeparator)+id) {
return ErrInvalidID
}
return nil
}
// NewuidmapPath returns an option func to configure a LinuxFactory with the
// provided ..
func NewuidmapPath(newuidmapPath string) func(*LinuxFactory) error {
return func(l *LinuxFactory) error {
l.NewuidmapPath = newuidmapPath
return nil
}
}
// NewgidmapPath returns an option func to configure a LinuxFactory with the
// provided ..
func NewgidmapPath(newgidmapPath string) func(*LinuxFactory) error {
return func(l *LinuxFactory) error {
l.NewgidmapPath = newgidmapPath
return nil
}
}
func parseMountFds() ([]int, error) {
fdsJson := os.Getenv("_LIBCONTAINER_MOUNT_FDS")
if fdsJson == "" {
// Always return the nil slice if no fd is present.
return nil, nil
}
var mountFds []int
if err := json.Unmarshal([]byte(fdsJson), &mountFds); err != nil {
return nil, fmt.Errorf("Error unmarshalling _LIBCONTAINER_MOUNT_FDS: %w", err)
}
return mountFds, nil
}

View File

@@ -1,641 +0,0 @@
package libcontainer
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
"net"
"os"
"path/filepath"
"strings"
"syscall"
"unsafe"
"github.com/containerd/console"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/capabilities"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/user"
"github.com/opencontainers/runc/libcontainer/utils"
)
type initType string
const (
initSetns initType = "setns"
initStandard initType = "standard"
)
type pid struct {
Pid int `json:"stage2_pid"`
PidFirstChild int `json:"stage1_pid"`
}
// network is an internal struct used to setup container networks.
type network struct {
configs.Network
// TempVethPeerName is a unique temporary veth peer name that was placed into
// the container's namespace.
TempVethPeerName string `json:"temp_veth_peer_name"`
}
// initConfig is used for transferring parameters from Exec() to Init()
type initConfig struct {
Args []string `json:"args"`
Env []string `json:"env"`
Cwd string `json:"cwd"`
Capabilities *configs.Capabilities `json:"capabilities"`
ProcessLabel string `json:"process_label"`
AppArmorProfile string `json:"apparmor_profile"`
NoNewPrivileges bool `json:"no_new_privileges"`
User string `json:"user"`
AdditionalGroups []string `json:"additional_groups"`
Config *configs.Config `json:"config"`
Networks []*network `json:"network"`
PassedFilesCount int `json:"passed_files_count"`
ContainerId string `json:"containerid"`
Rlimits []configs.Rlimit `json:"rlimits"`
CreateConsole bool `json:"create_console"`
ConsoleWidth uint16 `json:"console_width"`
ConsoleHeight uint16 `json:"console_height"`
RootlessEUID bool `json:"rootless_euid,omitempty"`
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
SpecState *specs.State `json:"spec_state,omitempty"`
Cgroup2Path string `json:"cgroup2_path,omitempty"`
}
type initer interface {
Init() error
}
func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds []int) (initer, error) {
var config *initConfig
if err := json.NewDecoder(pipe).Decode(&config); err != nil {
return nil, err
}
if err := populateProcessEnvironment(config.Env); err != nil {
return nil, err
}
// Clean the RLIMIT_NOFILE cache in go runtime.
// Issue: https://github.com/opencontainers/runc/issues/4195
maybeClearRlimitNofileCache(config.Rlimits)
switch t {
case initSetns:
// mountFds must be nil in this case. We don't mount while doing runc exec.
if mountFds != nil {
return nil, errors.New("mountFds must be nil. Can't mount while doing runc exec.")
}
return &linuxSetnsInit{
pipe: pipe,
consoleSocket: consoleSocket,
config: config,
logFd: logFd,
}, nil
case initStandard:
return &linuxStandardInit{
pipe: pipe,
consoleSocket: consoleSocket,
parentPid: unix.Getppid(),
config: config,
fifoFd: fifoFd,
logFd: logFd,
mountFds: mountFds,
}, nil
}
return nil, fmt.Errorf("unknown init type %q", t)
}
// populateProcessEnvironment loads the provided environment variables into the
// current processes's environment.
func populateProcessEnvironment(env []string) error {
for _, pair := range env {
p := strings.SplitN(pair, "=", 2)
if len(p) < 2 {
return errors.New("invalid environment variable: missing '='")
}
name, val := p[0], p[1]
if name == "" {
return errors.New("invalid environment variable: name cannot be empty")
}
if strings.IndexByte(name, 0) >= 0 {
return fmt.Errorf("invalid environment variable %q: name contains nul byte (\\x00)", name)
}
if strings.IndexByte(val, 0) >= 0 {
return fmt.Errorf("invalid environment variable %q: value contains nul byte (\\x00)", name)
}
if err := os.Setenv(name, val); err != nil {
return err
}
}
return nil
}
// verifyCwd ensures that the current directory is actually inside the mount
// namespace root of the current process.
func verifyCwd() error {
// getcwd(2) on Linux detects if cwd is outside of the rootfs of the
// current mount namespace root, and in that case prefixes "(unreachable)"
// to the returned string. glibc's getcwd(3) and Go's Getwd() both detect
// when this happens and return ENOENT rather than returning a non-absolute
// path. In both cases we can therefore easily detect if we have an invalid
// cwd by checking the return value of getcwd(3). See getcwd(3) for more
// details, and CVE-2024-21626 for the security issue that motivated this
// check.
//
// We have to use unix.Getwd() here because os.Getwd() has a workaround for
// $PWD which involves doing stat(.), which can fail if the current
// directory is inaccessible to the container process.
if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) {
return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected")
} else if err != nil {
return fmt.Errorf("failed to verify if current working directory is safe: %w", err)
} else if !filepath.IsAbs(wd) {
// We shouldn't ever hit this, but check just in case.
return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd)
}
return nil
}
// finalizeNamespace drops the caps, sets the correct user
// and working dir, and closes any leaked file descriptors
// before executing the command inside the namespace
func finalizeNamespace(config *initConfig) error {
// Ensure that all unwanted fds we may have accidentally
// inherited are marked close-on-exec so they stay out of the
// container
if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
return fmt.Errorf("error closing exec fds: %w", err)
}
// we only do chdir if it's specified
doChdir := config.Cwd != ""
if doChdir {
// First, attempt the chdir before setting up the user.
// This could allow us to access a directory that the user running runc can access
// but the container user cannot.
err := unix.Chdir(config.Cwd)
switch {
case err == nil:
doChdir = false
case os.IsPermission(err):
// If we hit an EPERM, we should attempt again after setting up user.
// This will allow us to successfully chdir if the container user has access
// to the directory, but the user running runc does not.
// This is useful in cases where the cwd is also a volume that's been chowned to the container user.
default:
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
}
}
caps := &configs.Capabilities{}
if config.Capabilities != nil {
caps = config.Capabilities
} else if config.Config.Capabilities != nil {
caps = config.Config.Capabilities
}
w, err := capabilities.New(caps)
if err != nil {
return err
}
// drop capabilities in bounding set before changing user
if err := w.ApplyBoundingSet(); err != nil {
return fmt.Errorf("unable to apply bounding set: %w", err)
}
// preserve existing capabilities while we change users
if err := system.SetKeepCaps(); err != nil {
return fmt.Errorf("unable to set keep caps: %w", err)
}
if err := setupUser(config); err != nil {
return fmt.Errorf("unable to setup user: %w", err)
}
// Change working directory AFTER the user has been set up, if we haven't done it yet.
if doChdir {
if err := unix.Chdir(config.Cwd); err != nil {
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
}
}
// Make sure our final working directory is inside the container.
if err := verifyCwd(); err != nil {
return err
}
if err := system.ClearKeepCaps(); err != nil {
return fmt.Errorf("unable to clear keep caps: %w", err)
}
if err := w.ApplyCaps(); err != nil {
return fmt.Errorf("unable to apply caps: %w", err)
}
return nil
}
// setupConsole sets up the console from inside the container, and sends the
// master pty fd to the config.Pipe (using cmsg). This is done to ensure that
// consoles are scoped to a container properly (see runc#814 and the many
// issues related to that). This has to be run *after* we've pivoted to the new
// rootfs (and the users' configuration is entirely set up).
func setupConsole(socket *os.File, config *initConfig, mount bool) error {
defer socket.Close()
// At this point, /dev/ptmx points to something that we would expect. We
// used to change the owner of the slave path, but since the /dev/pts mount
// can have gid=X set (at the users' option). So touching the owner of the
// slave PTY is not necessary, as the kernel will handle that for us. Note
// however, that setupUser (specifically fixStdioPermissions) *will* change
// the UID owner of the console to be the user the process will run as (so
// they can actually control their console).
pty, slavePath, err := console.NewPty()
if err != nil {
return err
}
// After we return from here, we don't need the console anymore.
defer pty.Close()
if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 {
err = pty.Resize(console.WinSize{
Height: config.ConsoleHeight,
Width: config.ConsoleWidth,
})
if err != nil {
return err
}
}
// Mount the console inside our rootfs.
if mount {
if err := mountConsole(slavePath); err != nil {
return err
}
}
// While we can access console.master, using the API is a good idea.
if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil {
return err
}
// Now, dup over all the things.
return dupStdio(slavePath)
}
// syncParentReady sends to the given pipe a JSON payload which indicates that
// the init is ready to Exec the child process. It then waits for the parent to
// indicate that it is cleared to Exec.
func syncParentReady(pipe io.ReadWriter) error {
// Tell parent.
if err := writeSync(pipe, procReady); err != nil {
return err
}
// Wait for parent to give the all-clear.
return readSync(pipe, procRun)
}
// syncParentHooks sends to the given pipe a JSON payload which indicates that
// the parent should execute pre-start hooks. It then waits for the parent to
// indicate that it is cleared to resume.
func syncParentHooks(pipe io.ReadWriter) error {
// Tell parent.
if err := writeSync(pipe, procHooks); err != nil {
return err
}
// Wait for parent to give the all-clear.
return readSync(pipe, procResume)
}
// syncParentSeccomp sends to the given pipe a JSON payload which
// indicates that the parent should pick up the seccomp fd with pidfd_getfd()
// and send it to the seccomp agent over a unix socket. It then waits for
// the parent to indicate that it is cleared to resume and closes the seccompFd.
// If the seccompFd is -1, there isn't anything to sync with the parent, so it
// returns no error.
func syncParentSeccomp(pipe io.ReadWriter, seccompFd int) error {
if seccompFd == -1 {
return nil
}
// Tell parent.
if err := writeSyncWithFd(pipe, procSeccomp, seccompFd); err != nil {
unix.Close(seccompFd)
return err
}
// Wait for parent to give the all-clear.
if err := readSync(pipe, procSeccompDone); err != nil {
unix.Close(seccompFd)
return fmt.Errorf("sync parent seccomp: %w", err)
}
if err := unix.Close(seccompFd); err != nil {
return fmt.Errorf("close seccomp fd: %w", err)
}
return nil
}
// setupUser changes the groups, gid, and uid for the user inside the container
func setupUser(config *initConfig) error {
// Set up defaults.
defaultExecUser := user.ExecUser{
Uid: 0,
Gid: 0,
Home: "/",
}
passwdPath, err := user.GetPasswdPath()
if err != nil {
return err
}
groupPath, err := user.GetGroupPath()
if err != nil {
return err
}
execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
if err != nil {
return err
}
var addGroups []int
if len(config.AdditionalGroups) > 0 {
addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath)
if err != nil {
return err
}
}
// Rather than just erroring out later in setuid(2) and setgid(2), check
// that the user is mapped here.
if _, err := config.Config.HostUID(execUser.Uid); err != nil {
return errors.New("cannot set uid to unmapped user in user namespace")
}
if _, err := config.Config.HostGID(execUser.Gid); err != nil {
return errors.New("cannot set gid to unmapped user in user namespace")
}
if config.RootlessEUID {
// We cannot set any additional groups in a rootless container and thus
// we bail if the user asked us to do so. TODO: We currently can't do
// this check earlier, but if libcontainer.Process.User was typesafe
// this might work.
if len(addGroups) > 0 {
return errors.New("cannot set any additional groups in a rootless container")
}
}
// Before we change to the container's user make sure that the processes
// STDIO is correctly owned by the user that we are switching to.
if err := fixStdioPermissions(execUser); err != nil {
return err
}
setgroups, err := os.ReadFile("/proc/self/setgroups")
if err != nil && !os.IsNotExist(err) {
return err
}
// This isn't allowed in an unprivileged user namespace since Linux 3.19.
// There's nothing we can do about /etc/group entries, so we silently
// ignore setting groups here (since the user didn't explicitly ask us to
// set the group).
allowSupGroups := !config.RootlessEUID && string(bytes.TrimSpace(setgroups)) != "deny"
if allowSupGroups {
suppGroups := append(execUser.Sgids, addGroups...)
if err := unix.Setgroups(suppGroups); err != nil {
return &os.SyscallError{Syscall: "setgroups", Err: err}
}
}
if err := system.Setgid(execUser.Gid); err != nil {
return err
}
if err := system.Setuid(execUser.Uid); err != nil {
return err
}
// if we didn't get HOME already, set it based on the user's HOME
if envHome := os.Getenv("HOME"); envHome == "" {
if err := os.Setenv("HOME", execUser.Home); err != nil {
return err
}
}
return nil
}
// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user.
// The ownership needs to match because it is created outside of the container and needs to be
// localized.
func fixStdioPermissions(u *user.ExecUser) error {
var null unix.Stat_t
if err := unix.Stat("/dev/null", &null); err != nil {
return &os.PathError{Op: "stat", Path: "/dev/null", Err: err}
}
for _, file := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
var s unix.Stat_t
if err := unix.Fstat(int(file.Fd()), &s); err != nil {
return &os.PathError{Op: "fstat", Path: file.Name(), Err: err}
}
// Skip chown if uid is already the one we want or any of the STDIO descriptors
// were redirected to /dev/null.
if int(s.Uid) == u.Uid || s.Rdev == null.Rdev {
continue
}
// We only change the uid (as it is possible for the mount to
// prefer a different gid, and there's no reason for us to change it).
// The reason why we don't just leave the default uid=X mount setup is
// that users expect to be able to actually use their console. Without
// this code, you couldn't effectively run as a non-root user inside a
// container and also have a console set up.
if err := file.Chown(u.Uid, int(s.Gid)); err != nil {
// If we've hit an EINVAL then s.Gid isn't mapped in the user
// namespace. If we've hit an EPERM then the inode's current owner
// is not mapped in our user namespace (in particular,
// privileged_wrt_inode_uidgid() has failed). Read-only
// /dev can result in EROFS error. In any case, it's
// better for us to just not touch the stdio rather
// than bail at this point.
if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) {
continue
}
return err
}
}
return nil
}
// setupNetwork sets up and initializes any network interface inside the container.
func setupNetwork(config *initConfig) error {
for _, config := range config.Networks {
strategy, err := getStrategy(config.Type)
if err != nil {
return err
}
if err := strategy.initialize(config); err != nil {
return err
}
}
return nil
}
func setupRoute(config *configs.Config) error {
for _, config := range config.Routes {
_, dst, err := net.ParseCIDR(config.Destination)
if err != nil {
return err
}
src := net.ParseIP(config.Source)
if src == nil {
return fmt.Errorf("Invalid source for route: %s", config.Source)
}
gw := net.ParseIP(config.Gateway)
if gw == nil {
return fmt.Errorf("Invalid gateway for route: %s", config.Gateway)
}
l, err := netlink.LinkByName(config.InterfaceName)
if err != nil {
return err
}
route := &netlink.Route{
Scope: netlink.SCOPE_UNIVERSE,
Dst: dst,
Src: src,
Gw: gw,
LinkIndex: l.Attrs().Index,
}
if err := netlink.RouteAdd(route); err != nil {
return err
}
}
return nil
}
func maybeClearRlimitNofileCache(limits []configs.Rlimit) {
for _, rlimit := range limits {
if rlimit.Type == syscall.RLIMIT_NOFILE {
system.ClearRlimitNofileCache(&syscall.Rlimit{
Cur: rlimit.Soft,
Max: rlimit.Hard,
})
return
}
}
}
func setupRlimits(limits []configs.Rlimit, pid int) error {
for _, rlimit := range limits {
if err := unix.Prlimit(pid, rlimit.Type, &unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}, nil); err != nil {
return fmt.Errorf("error setting rlimit type %v: %w", rlimit.Type, err)
}
}
return nil
}
const _P_PID = 1
//nolint:structcheck,unused
type siginfo struct {
si_signo int32
si_errno int32
si_code int32
// below here is a union; si_pid is the only field we use
si_pid int32
// Pad to 128 bytes as detailed in blockUntilWaitable
pad [96]byte
}
// isWaitable returns true if the process has exited false otherwise.
// Its based off blockUntilWaitable in src/os/wait_waitid.go
func isWaitable(pid int) (bool, error) {
si := &siginfo{}
_, _, e := unix.Syscall6(unix.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), unix.WEXITED|unix.WNOWAIT|unix.WNOHANG, 0, 0)
if e != 0 {
return false, &os.SyscallError{Syscall: "waitid", Err: e}
}
return si.si_pid != 0, nil
}
// signalAllProcesses freezes then iterates over all the processes inside the
// manager's cgroups sending the signal s to them.
// If s is SIGKILL then it will wait for each process to exit.
// For all other signals it will check if the process is ready to report its
// exit status and only if it is will a wait be performed.
func signalAllProcesses(m cgroups.Manager, s os.Signal) error {
var procs []*os.Process
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Warn(err)
}
pids, err := m.GetAllPids()
if err != nil {
if err := m.Freeze(configs.Thawed); err != nil {
logrus.Warn(err)
}
return err
}
for _, pid := range pids {
p, err := os.FindProcess(pid)
if err != nil {
logrus.Warn(err)
continue
}
procs = append(procs, p)
if err := p.Signal(s); err != nil {
logrus.Warn(err)
}
}
if err := m.Freeze(configs.Thawed); err != nil {
logrus.Warn(err)
}
subreaper, err := system.GetSubreaper()
if err != nil {
// The error here means that PR_GET_CHILD_SUBREAPER is not
// supported because this code might run on a kernel older
// than 3.4. We don't want to throw an error in that case,
// and we simplify things, considering there is no subreaper
// set.
subreaper = 0
}
for _, p := range procs {
if s != unix.SIGKILL {
if ok, err := isWaitable(p.Pid); err != nil {
if !errors.Is(err, unix.ECHILD) {
logrus.Warn("signalAllProcesses: ", p.Pid, err)
}
continue
} else if !ok {
// Not ready to report so don't wait
continue
}
}
// In case a subreaper has been setup, this code must not
// wait for the process. Otherwise, we cannot be sure the
// current process will be reaped by the subreaper, while
// the subreaper might be waiting for this process in order
// to retrieve its exit code.
if subreaper == 0 {
if _, err := p.Wait(); err != nil {
if !errors.Is(err, unix.ECHILD) {
logrus.Warn("wait: ", err)
}
}
}
}
return nil
}

View File

@@ -1,45 +0,0 @@
package keys
import (
"errors"
"fmt"
"strconv"
"strings"
"golang.org/x/sys/unix"
)
type KeySerial uint32
func JoinSessionKeyring(name string) (KeySerial, error) {
sessKeyID, err := unix.KeyctlJoinSessionKeyring(name)
if err != nil {
return 0, fmt.Errorf("unable to create session key: %w", err)
}
return KeySerial(sessKeyID), nil
}
// ModKeyringPerm modifies permissions on a keyring by reading the current permissions,
// anding the bits with the given mask (clearing permissions) and setting
// additional permission bits
func ModKeyringPerm(ringID KeySerial, mask, setbits uint32) error {
dest, err := unix.KeyctlString(unix.KEYCTL_DESCRIBE, int(ringID))
if err != nil {
return err
}
res := strings.Split(dest, ";")
if len(res) < 5 {
return errors.New("Destination buffer for key description is too small")
}
// parse permissions
perm64, err := strconv.ParseUint(res[3], 16, 32)
if err != nil {
return err
}
perm := (uint32(perm64) & mask) | setbits
return unix.KeyctlSetperm(int(ringID), perm)
}

View File

@@ -1,56 +0,0 @@
package logs
import (
"bufio"
"encoding/json"
"io"
"github.com/sirupsen/logrus"
)
func ForwardLogs(logPipe io.ReadCloser) chan error {
done := make(chan error, 1)
s := bufio.NewScanner(logPipe)
logger := logrus.StandardLogger()
if logger.ReportCaller {
// Need a copy of the standard logger, but with ReportCaller
// turned off, as the logs are merely forwarded and their
// true source is not this file/line/function.
logNoCaller := *logrus.StandardLogger()
logNoCaller.ReportCaller = false
logger = &logNoCaller
}
go func() {
for s.Scan() {
processEntry(s.Bytes(), logger)
}
if err := logPipe.Close(); err != nil {
logrus.Errorf("error closing log source: %v", err)
}
// The only error we want to return is when reading from
// logPipe has failed.
done <- s.Err()
close(done)
}()
return done
}
func processEntry(text []byte, logger *logrus.Logger) {
if len(text) == 0 {
return
}
var jl struct {
Level logrus.Level `json:"level"`
Msg string `json:"msg"`
}
if err := json.Unmarshal(text, &jl); err != nil {
logrus.Errorf("failed to decode %q to json: %v", text, err)
return
}
logger.Log(jl.Level, jl.Msg)
}

View File

@@ -1,97 +0,0 @@
package libcontainer
import (
"fmt"
"math"
"github.com/vishvananda/netlink/nl"
"golang.org/x/sys/unix"
)
// list of known message types we want to send to bootstrap program
// The number is randomly chosen to not conflict with known netlink types
const (
InitMsg uint16 = 62000
CloneFlagsAttr uint16 = 27281
NsPathsAttr uint16 = 27282
UidmapAttr uint16 = 27283
GidmapAttr uint16 = 27284
SetgroupAttr uint16 = 27285
OomScoreAdjAttr uint16 = 27286
RootlessEUIDAttr uint16 = 27287
UidmapPathAttr uint16 = 27288
GidmapPathAttr uint16 = 27289
MountSourcesAttr uint16 = 27290
)
type Int32msg struct {
Type uint16
Value uint32
}
// Serialize serializes the message.
// Int32msg has the following representation
// | nlattr len | nlattr type |
// | uint32 value |
func (msg *Int32msg) Serialize() []byte {
buf := make([]byte, msg.Len())
native := nl.NativeEndian()
native.PutUint16(buf[0:2], uint16(msg.Len()))
native.PutUint16(buf[2:4], msg.Type)
native.PutUint32(buf[4:8], msg.Value)
return buf
}
func (msg *Int32msg) Len() int {
return unix.NLA_HDRLEN + 4
}
// Bytemsg has the following representation
// | nlattr len | nlattr type |
// | value | pad |
type Bytemsg struct {
Type uint16
Value []byte
}
func (msg *Bytemsg) Serialize() []byte {
l := msg.Len()
if l > math.MaxUint16 {
// We cannot return nil nor an error here, so we panic with
// a specific type instead, which is handled via recover in
// bootstrapData.
panic(netlinkError{fmt.Errorf("netlink: cannot serialize bytemsg of length %d (larger than UINT16_MAX)", l)})
}
buf := make([]byte, (l+unix.NLA_ALIGNTO-1) & ^(unix.NLA_ALIGNTO-1))
native := nl.NativeEndian()
native.PutUint16(buf[0:2], uint16(l))
native.PutUint16(buf[2:4], msg.Type)
copy(buf[4:], msg.Value)
return buf
}
func (msg *Bytemsg) Len() int {
return unix.NLA_HDRLEN + len(msg.Value) + 1 // null-terminated
}
type Boolmsg struct {
Type uint16
Value bool
}
func (msg *Boolmsg) Serialize() []byte {
buf := make([]byte, msg.Len())
native := nl.NativeEndian()
native.PutUint16(buf[0:2], uint16(msg.Len()))
native.PutUint16(buf[2:4], msg.Type)
if msg.Value {
native.PutUint32(buf[4:8], uint32(1))
} else {
native.PutUint32(buf[4:8], uint32(0))
}
return buf
}
func (msg *Boolmsg) Len() int {
return unix.NLA_HDRLEN + 4 // alignment
}

View File

@@ -1,101 +0,0 @@
package libcontainer
import (
"io/fs"
"strconv"
"golang.org/x/sys/unix"
)
// mountError holds an error from a failed mount or unmount operation.
type mountError struct {
op string
source string
target string
procfd string
flags uintptr
data string
err error
}
// Error provides a string error representation.
func (e *mountError) Error() string {
out := e.op + " "
if e.source != "" {
out += e.source + ":" + e.target
} else {
out += e.target
}
if e.procfd != "" {
out += " (via " + e.procfd + ")"
}
if e.flags != uintptr(0) {
out += ", flags: 0x" + strconv.FormatUint(uint64(e.flags), 16)
}
if e.data != "" {
out += ", data: " + e.data
}
out += ": " + e.err.Error()
return out
}
// Unwrap returns the underlying error.
// This is a convention used by Go 1.13+ standard library.
func (e *mountError) Unwrap() error {
return e.err
}
// mount is a simple unix.Mount wrapper. If procfd is not empty, it is used
// instead of target (and the target is only used to add context to an error).
func mount(source, target, procfd, fstype string, flags uintptr, data string) error {
dst := target
if procfd != "" {
dst = procfd
}
if err := unix.Mount(source, dst, fstype, flags, data); err != nil {
return &mountError{
op: "mount",
source: source,
target: target,
procfd: procfd,
flags: flags,
data: data,
err: err,
}
}
return nil
}
// unmount is a simple unix.Unmount wrapper.
func unmount(target string, flags int) error {
err := unix.Unmount(target, flags)
if err != nil {
return &mountError{
op: "unmount",
target: target,
flags: uintptr(flags),
err: err,
}
}
return nil
}
// syscallMode returns the syscall-specific mode bits from Go's portable mode bits.
// Copy from https://cs.opensource.google/go/go/+/refs/tags/go1.20.7:src/os/file_posix.go;l=61-75
func syscallMode(i fs.FileMode) (o uint32) {
o |= uint32(i.Perm())
if i&fs.ModeSetuid != 0 {
o |= unix.S_ISUID
}
if i&fs.ModeSetgid != 0 {
o |= unix.S_ISGID
}
if i&fs.ModeSticky != 0 {
o |= unix.S_ISVTX
}
// No mapping for Go's ModeTemporary (plan9 only).
return
}

View File

@@ -1,100 +0,0 @@
package libcontainer
import (
"bytes"
"fmt"
"os"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/types"
"github.com/vishvananda/netlink"
)
var strategies = map[string]networkStrategy{
"loopback": &loopback{},
}
// networkStrategy represents a specific network configuration for
// a container's networking stack
type networkStrategy interface {
create(*network, int) error
initialize(*network) error
detach(*configs.Network) error
attach(*configs.Network) error
}
// getStrategy returns the specific network strategy for the
// provided type.
func getStrategy(tpe string) (networkStrategy, error) {
s, exists := strategies[tpe]
if !exists {
return nil, fmt.Errorf("unknown strategy type %q", tpe)
}
return s, nil
}
// Returns the network statistics for the network interfaces represented by the NetworkRuntimeInfo.
func getNetworkInterfaceStats(interfaceName string) (*types.NetworkInterface, error) {
out := &types.NetworkInterface{Name: interfaceName}
// This can happen if the network runtime information is missing - possible if the
// container was created by an old version of libcontainer.
if interfaceName == "" {
return out, nil
}
type netStatsPair struct {
// Where to write the output.
Out *uint64
// The network stats file to read.
File string
}
// Ingress for host veth is from the container. Hence tx_bytes stat on the host veth is actually number of bytes received by the container.
netStats := []netStatsPair{
{Out: &out.RxBytes, File: "tx_bytes"},
{Out: &out.RxPackets, File: "tx_packets"},
{Out: &out.RxErrors, File: "tx_errors"},
{Out: &out.RxDropped, File: "tx_dropped"},
{Out: &out.TxBytes, File: "rx_bytes"},
{Out: &out.TxPackets, File: "rx_packets"},
{Out: &out.TxErrors, File: "rx_errors"},
{Out: &out.TxDropped, File: "rx_dropped"},
}
for _, netStat := range netStats {
data, err := readSysfsNetworkStats(interfaceName, netStat.File)
if err != nil {
return nil, err
}
*(netStat.Out) = data
}
return out, nil
}
// Reads the specified statistics available under /sys/class/net/<EthInterface>/statistics
func readSysfsNetworkStats(ethInterface, statsFile string) (uint64, error) {
data, err := os.ReadFile(filepath.Join("/sys/class/net", ethInterface, "statistics", statsFile))
if err != nil {
return 0, err
}
return strconv.ParseUint(string(bytes.TrimSpace(data)), 10, 64)
}
// loopback is a network strategy that provides a basic loopback device
type loopback struct{}
func (l *loopback) create(n *network, nspid int) error {
return nil
}
func (l *loopback) initialize(config *network) error {
return netlink.LinkSetUp(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: "lo"}})
}
func (l *loopback) attach(n *configs.Network) (err error) {
return nil
}
func (l *loopback) detach(n *configs.Network) (err error) {
return nil
}

View File

@@ -1,84 +0,0 @@
package libcontainer
import (
"errors"
"fmt"
"os"
"path/filepath"
"golang.org/x/sys/unix"
)
type PressureLevel uint
const (
LowPressure PressureLevel = iota
MediumPressure
CriticalPressure
)
func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct{}, error) {
evFile, err := os.Open(filepath.Join(cgDir, evName))
if err != nil {
return nil, err
}
fd, err := unix.Eventfd(0, unix.EFD_CLOEXEC)
if err != nil {
evFile.Close()
return nil, err
}
eventfd := os.NewFile(uintptr(fd), "eventfd")
eventControlPath := filepath.Join(cgDir, "cgroup.event_control")
data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg)
if err := os.WriteFile(eventControlPath, []byte(data), 0o700); err != nil {
eventfd.Close()
evFile.Close()
return nil, err
}
ch := make(chan struct{})
go func() {
defer func() {
eventfd.Close()
evFile.Close()
close(ch)
}()
buf := make([]byte, 8)
for {
if _, err := eventfd.Read(buf); err != nil {
return
}
// When a cgroup is destroyed, an event is sent to eventfd.
// So if the control path is gone, return instead of notifying.
if _, err := os.Lstat(eventControlPath); os.IsNotExist(err) {
return
}
ch <- struct{}{}
}
}()
return ch, nil
}
// notifyOnOOM returns channel on which you can expect event about OOM,
// if process died without OOM this channel will be closed.
func notifyOnOOM(dir string) (<-chan struct{}, error) {
if dir == "" {
return nil, errors.New("memory controller missing")
}
return registerMemoryEvent(dir, "memory.oom_control", "")
}
func notifyMemoryPressure(dir string, level PressureLevel) (<-chan struct{}, error) {
if dir == "" {
return nil, errors.New("memory controller missing")
}
if level > CriticalPressure {
return nil, fmt.Errorf("invalid pressure level %d", level)
}
levelStr := []string{"low", "medium", "critical"}[level]
return registerMemoryEvent(dir, "memory.pressure_level", levelStr)
}

View File

@@ -1,80 +0,0 @@
package libcontainer
import (
"fmt"
"path/filepath"
"unsafe"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
func registerMemoryEventV2(cgDir, evName, cgEvName string) (<-chan struct{}, error) {
fd, err := unix.InotifyInit()
if err != nil {
return nil, fmt.Errorf("unable to init inotify: %w", err)
}
// watching oom kill
evFd, err := unix.InotifyAddWatch(fd, filepath.Join(cgDir, evName), unix.IN_MODIFY)
if err != nil {
unix.Close(fd)
return nil, fmt.Errorf("unable to add inotify watch: %w", err)
}
// Because no `unix.IN_DELETE|unix.IN_DELETE_SELF` event for cgroup file system, so watching all process exited
cgFd, err := unix.InotifyAddWatch(fd, filepath.Join(cgDir, cgEvName), unix.IN_MODIFY)
if err != nil {
unix.Close(fd)
return nil, fmt.Errorf("unable to add inotify watch: %w", err)
}
ch := make(chan struct{})
go func() {
var (
buffer [unix.SizeofInotifyEvent + unix.PathMax + 1]byte
offset uint32
)
defer func() {
unix.Close(fd)
close(ch)
}()
for {
n, err := unix.Read(fd, buffer[:])
if err != nil {
logrus.Warnf("unable to read event data from inotify, got error: %v", err)
return
}
if n < unix.SizeofInotifyEvent {
logrus.Warnf("we should read at least %d bytes from inotify, but got %d bytes.", unix.SizeofInotifyEvent, n)
return
}
offset = 0
for offset <= uint32(n-unix.SizeofInotifyEvent) {
rawEvent := (*unix.InotifyEvent)(unsafe.Pointer(&buffer[offset]))
offset += unix.SizeofInotifyEvent + rawEvent.Len
if rawEvent.Mask&unix.IN_MODIFY != unix.IN_MODIFY {
continue
}
switch int(rawEvent.Wd) {
case evFd:
oom, err := fscommon.GetValueByKey(cgDir, evName, "oom_kill")
if err != nil || oom > 0 {
ch <- struct{}{}
}
case cgFd:
pids, err := fscommon.GetValueByKey(cgDir, cgEvName, "populated")
if err != nil || pids == 0 {
return
}
}
}
}
}()
return ch, nil
}
// notifyOnOOMV2 returns channel on which you can expect event about OOM,
// if process died without OOM this channel will be closed.
func notifyOnOOMV2(path string) (<-chan struct{}, error) {
return registerMemoryEventV2(path, "memory.events", "cgroup.events")
}

View File

@@ -1,126 +0,0 @@
package libcontainer
import (
"errors"
"io"
"math"
"os"
"github.com/opencontainers/runc/libcontainer/configs"
)
var errInvalidProcess = errors.New("invalid process")
type processOperations interface {
wait() (*os.ProcessState, error)
signal(sig os.Signal) error
pid() int
}
// Process specifies the configuration and IO for a process inside
// a container.
type Process struct {
// The command to be run followed by any arguments.
Args []string
// Env specifies the environment variables for the process.
Env []string
// User will set the uid and gid of the executing process running inside the container
// local to the container's user and group configuration.
User string
// AdditionalGroups specifies the gids that should be added to supplementary groups
// in addition to those that the user belongs to.
AdditionalGroups []string
// Cwd will change the processes current working directory inside the container's rootfs.
Cwd string
// Stdin is a pointer to a reader which provides the standard input stream.
Stdin io.Reader
// Stdout is a pointer to a writer which receives the standard output stream.
Stdout io.Writer
// Stderr is a pointer to a writer which receives the standard error stream.
Stderr io.Writer
// ExtraFiles specifies additional open files to be inherited by the container
ExtraFiles []*os.File
// Initial sizings for the console
ConsoleWidth uint16
ConsoleHeight uint16
// Capabilities specify the capabilities to keep when executing the process inside the container
// All capabilities not specified will be dropped from the processes capability mask
Capabilities *configs.Capabilities
// AppArmorProfile specifies the profile to apply to the process and is
// changed at the time the process is execed
AppArmorProfile string
// Label specifies the label to apply to the process. It is commonly used by selinux
Label string
// NoNewPrivileges controls whether processes can gain additional privileges.
NoNewPrivileges *bool
// Rlimits specifies the resource limits, such as max open files, to set in the container
// If Rlimits are not set, the container will inherit rlimits from the parent process
Rlimits []configs.Rlimit
// ConsoleSocket provides the masterfd console.
ConsoleSocket *os.File
// Init specifies whether the process is the first process in the container.
Init bool
ops processOperations
LogLevel string
// SubCgroupPaths specifies sub-cgroups to run the process in.
// Map keys are controller names, map values are paths (relative to
// container's top-level cgroup).
//
// If empty, the default top-level container's cgroup is used.
//
// For cgroup v2, the only key allowed is "".
SubCgroupPaths map[string]string
}
// Wait waits for the process to exit.
// Wait releases any resources associated with the Process
func (p Process) Wait() (*os.ProcessState, error) {
if p.ops == nil {
return nil, errInvalidProcess
}
return p.ops.wait()
}
// Pid returns the process ID
func (p Process) Pid() (int, error) {
// math.MinInt32 is returned here, because it's invalid value
// for the kill() system call.
if p.ops == nil {
return math.MinInt32, errInvalidProcess
}
return p.ops.pid(), nil
}
// Signal sends a signal to the Process.
func (p Process) Signal(sig os.Signal) error {
if p.ops == nil {
return errInvalidProcess
}
return p.ops.signal(sig)
}
// IO holds the process's STDIO
type IO struct {
Stdin io.WriteCloser
Stdout io.ReadCloser
Stderr io.ReadCloser
}

View File

@@ -1,823 +0,0 @@
package libcontainer
import (
"encoding/json"
"errors"
"fmt"
"io"
"net"
"os"
"os/exec"
"path/filepath"
"strconv"
"time"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/logs"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
type parentProcess interface {
// pid returns the pid for the running process.
pid() int
// start starts the process execution.
start() error
// send a SIGKILL to the process and wait for the exit.
terminate() error
// wait waits on the process returning the process state.
wait() (*os.ProcessState, error)
// startTime returns the process start time.
startTime() (uint64, error)
signal(os.Signal) error
externalDescriptors() []string
setExternalDescriptors(fds []string)
forwardChildLogs() chan error
}
type filePair struct {
parent *os.File
child *os.File
}
type setnsProcess struct {
cmd *exec.Cmd
messageSockPair filePair
logFilePair filePair
cgroupPaths map[string]string
rootlessCgroups bool
manager cgroups.Manager
intelRdtPath string
config *initConfig
fds []string
process *Process
bootstrapData io.Reader
initProcessPid int
}
func (p *setnsProcess) startTime() (uint64, error) {
stat, err := system.Stat(p.pid())
return stat.StartTime, err
}
func (p *setnsProcess) signal(sig os.Signal) error {
s, ok := sig.(unix.Signal)
if !ok {
return errors.New("os: unsupported signal type")
}
return unix.Kill(p.pid(), s)
}
func (p *setnsProcess) start() (retErr error) {
defer p.messageSockPair.parent.Close()
// get the "before" value of oom kill count
oom, _ := p.manager.OOMKillCount()
err := p.cmd.Start()
// close the write-side of the pipes (controlled by child)
p.messageSockPair.child.Close()
p.logFilePair.child.Close()
if err != nil {
return fmt.Errorf("error starting setns process: %w", err)
}
waitInit := initWaiter(p.messageSockPair.parent)
defer func() {
if retErr != nil {
if newOom, err := p.manager.OOMKillCount(); err == nil && newOom != oom {
// Someone in this cgroup was killed, this _might_ be us.
retErr = fmt.Errorf("%w (possibly OOM-killed)", retErr)
}
werr := <-waitInit
if werr != nil {
logrus.WithError(werr).Warn()
}
err := ignoreTerminateErrors(p.terminate())
if err != nil {
logrus.WithError(err).Warn("unable to terminate setnsProcess")
}
}
}()
if p.bootstrapData != nil {
if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
return fmt.Errorf("error copying bootstrap data to pipe: %w", err)
}
}
err = <-waitInit
if err != nil {
return err
}
if err := p.execSetns(); err != nil {
return fmt.Errorf("error executing setns process: %w", err)
}
for _, path := range p.cgroupPaths {
if err := cgroups.WriteCgroupProc(path, p.pid()); err != nil && !p.rootlessCgroups {
// On cgroup v2 + nesting + domain controllers, WriteCgroupProc may fail with EBUSY.
// https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643
// Try to join the cgroup of InitProcessPid.
if cgroups.IsCgroup2UnifiedMode() && p.initProcessPid != 0 {
initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid)
initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile)
if initCgErr == nil {
if initCgPath, ok := initCg[""]; ok {
initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath)
logrus.Debugf("adding pid %d to cgroups %v failed (%v), attempting to join %q (obtained from %s)",
p.pid(), p.cgroupPaths, err, initCg, initCgDirpath)
// NOTE: initCgDirPath is not guaranteed to exist because we didn't pause the container.
err = cgroups.WriteCgroupProc(initCgDirpath, p.pid())
}
}
}
if err != nil {
return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
}
}
}
if p.intelRdtPath != "" {
// if Intel RDT "resource control" filesystem path exists
_, err := os.Stat(p.intelRdtPath)
if err == nil {
if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil {
return fmt.Errorf("error adding pid %d to Intel RDT: %w", p.pid(), err)
}
}
}
if err := utils.WriteJSON(p.messageSockPair.parent, p.config); err != nil {
return fmt.Errorf("error writing config to pipe: %w", err)
}
ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
switch sync.Type {
case procReady:
// Set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return fmt.Errorf("error setting rlimits for ready process: %w", err)
}
// Sync with child.
return writeSync(p.messageSockPair.parent, procRun)
case procHooks:
// This shouldn't happen.
panic("unexpected procHooks in setns")
case procSeccomp:
if p.config.Config.Seccomp.ListenerPath == "" {
return errors.New("listenerPath is not set")
}
seccompFd, err := recvSeccompFd(uintptr(p.pid()), uintptr(sync.Fd))
if err != nil {
return err
}
defer unix.Close(seccompFd)
bundle, annotations := utils.Annotations(p.config.Config.Labels)
containerProcessState := &specs.ContainerProcessState{
Version: specs.Version,
Fds: []string{specs.SeccompFdName},
Pid: p.cmd.Process.Pid,
Metadata: p.config.Config.Seccomp.ListenerMetadata,
State: specs.State{
Version: specs.Version,
ID: p.config.ContainerId,
Status: specs.StateRunning,
Pid: p.initProcessPid,
Bundle: bundle,
Annotations: annotations,
},
}
if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath,
containerProcessState, seccompFd); err != nil {
return err
}
// Sync with child.
if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
return err
}
return nil
default:
return errors.New("invalid JSON payload from child")
}
})
if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
return &os.PathError{Op: "shutdown", Path: "(init pipe)", Err: err}
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
_, _ = p.wait()
return ierr
}
return nil
}
// execSetns runs the process that executes C code to perform the setns calls
// because setns support requires the C process to fork off a child and perform the setns
// before the go runtime boots, we wait on the process to die and receive the child's pid
// over the provided pipe.
func (p *setnsProcess) execSetns() error {
status, err := p.cmd.Process.Wait()
if err != nil {
_ = p.cmd.Wait()
return fmt.Errorf("error waiting on setns process to finish: %w", err)
}
if !status.Success() {
_ = p.cmd.Wait()
return &exec.ExitError{ProcessState: status}
}
var pid *pid
if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil {
_ = p.cmd.Wait()
return fmt.Errorf("error reading pid from init pipe: %w", err)
}
// Clean up the zombie parent process
// On Unix systems FindProcess always succeeds.
firstChildProcess, _ := os.FindProcess(pid.PidFirstChild)
// Ignore the error in case the child has already been reaped for any reason
_, _ = firstChildProcess.Wait()
process, err := os.FindProcess(pid.Pid)
if err != nil {
return err
}
p.cmd.Process = process
p.process.ops = p
return nil
}
// terminate sends a SIGKILL to the forked process for the setns routine then waits to
// avoid the process becoming a zombie.
func (p *setnsProcess) terminate() error {
if p.cmd.Process == nil {
return nil
}
err := p.cmd.Process.Kill()
if _, werr := p.wait(); err == nil {
err = werr
}
return err
}
func (p *setnsProcess) wait() (*os.ProcessState, error) {
err := p.cmd.Wait()
// Return actual ProcessState even on Wait error
return p.cmd.ProcessState, err
}
func (p *setnsProcess) pid() int {
return p.cmd.Process.Pid
}
func (p *setnsProcess) externalDescriptors() []string {
return p.fds
}
func (p *setnsProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}
func (p *setnsProcess) forwardChildLogs() chan error {
return logs.ForwardLogs(p.logFilePair.parent)
}
type initProcess struct {
cmd *exec.Cmd
messageSockPair filePair
logFilePair filePair
config *initConfig
manager cgroups.Manager
intelRdtManager *intelrdt.Manager
container *linuxContainer
fds []string
process *Process
bootstrapData io.Reader
sharePidns bool
}
func (p *initProcess) pid() int {
return p.cmd.Process.Pid
}
func (p *initProcess) externalDescriptors() []string {
return p.fds
}
// getChildPid receives the final child's pid over the provided pipe.
func (p *initProcess) getChildPid() (int, error) {
var pid pid
if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil {
_ = p.cmd.Wait()
return -1, err
}
// Clean up the zombie parent process
// On Unix systems FindProcess always succeeds.
firstChildProcess, _ := os.FindProcess(pid.PidFirstChild)
// Ignore the error in case the child has already been reaped for any reason
_, _ = firstChildProcess.Wait()
return pid.Pid, nil
}
func (p *initProcess) waitForChildExit(childPid int) error {
status, err := p.cmd.Process.Wait()
if err != nil {
_ = p.cmd.Wait()
return err
}
if !status.Success() {
_ = p.cmd.Wait()
return &exec.ExitError{ProcessState: status}
}
process, err := os.FindProcess(childPid)
if err != nil {
return err
}
p.cmd.Process = process
p.process.ops = p
return nil
}
func (p *initProcess) start() (retErr error) {
defer p.messageSockPair.parent.Close() //nolint: errcheck
err := p.cmd.Start()
p.process.ops = p
// close the write-side of the pipes (controlled by child)
_ = p.messageSockPair.child.Close()
_ = p.logFilePair.child.Close()
if err != nil {
p.process.ops = nil
return fmt.Errorf("unable to start init: %w", err)
}
waitInit := initWaiter(p.messageSockPair.parent)
defer func() {
if retErr != nil {
// Find out if init is killed by the kernel's OOM killer.
// Get the count before killing init as otherwise cgroup
// might be removed by systemd.
oom, err := p.manager.OOMKillCount()
if err != nil {
logrus.WithError(err).Warn("unable to get oom kill count")
} else if oom > 0 {
// Does not matter what the particular error was,
// its cause is most probably OOM, so report that.
const oomError = "container init was OOM-killed (memory limit too low?)"
if logrus.GetLevel() >= logrus.DebugLevel {
// Only show the original error if debug is set,
// as it is not generally very useful.
retErr = fmt.Errorf(oomError+": %w", retErr)
} else {
retErr = errors.New(oomError)
}
}
werr := <-waitInit
if werr != nil {
logrus.WithError(werr).Warn()
}
// Terminate the process to ensure we can remove cgroups.
if err := ignoreTerminateErrors(p.terminate()); err != nil {
logrus.WithError(err).Warn("unable to terminate initProcess")
}
_ = p.manager.Destroy()
if p.intelRdtManager != nil {
_ = p.intelRdtManager.Destroy()
}
}
}()
// Do this before syncing with child so that no children can escape the
// cgroup. We don't need to worry about not doing this and not being root
// because we'd be using the rootless cgroup manager in that case.
if err := p.manager.Apply(p.pid()); err != nil {
return fmt.Errorf("unable to apply cgroup configuration: %w", err)
}
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Apply(p.pid()); err != nil {
return fmt.Errorf("unable to apply Intel RDT configuration: %w", err)
}
}
if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
return fmt.Errorf("can't copy bootstrap data to pipe: %w", err)
}
err = <-waitInit
if err != nil {
return err
}
childPid, err := p.getChildPid()
if err != nil {
return fmt.Errorf("can't get final child's PID from pipe: %w", err)
}
// Save the standard descriptor names before the container process
// can potentially move them (e.g., via dup2()). If we don't do this now,
// we won't know at checkpoint time which file descriptor to look up.
fds, err := getPipeFds(childPid)
if err != nil {
return fmt.Errorf("error getting pipe fds for pid %d: %w", childPid, err)
}
p.setExternalDescriptors(fds)
// Wait for our first child to exit
if err := p.waitForChildExit(childPid); err != nil {
return fmt.Errorf("error waiting for our first child to exit: %w", err)
}
if err := p.createNetworkInterfaces(); err != nil {
return fmt.Errorf("error creating network interfaces: %w", err)
}
if err := p.updateSpecState(); err != nil {
return fmt.Errorf("error updating spec state: %w", err)
}
if err := p.sendConfig(); err != nil {
return fmt.Errorf("error sending config to init process: %w", err)
}
var (
sentRun bool
sentResume bool
)
ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
switch sync.Type {
case procSeccomp:
if p.config.Config.Seccomp.ListenerPath == "" {
return errors.New("listenerPath is not set")
}
seccompFd, err := recvSeccompFd(uintptr(childPid), uintptr(sync.Fd))
if err != nil {
return err
}
defer unix.Close(seccompFd)
s, err := p.container.currentOCIState()
if err != nil {
return err
}
// initProcessStartTime hasn't been set yet.
s.Pid = p.cmd.Process.Pid
s.Status = specs.StateCreating
containerProcessState := &specs.ContainerProcessState{
Version: specs.Version,
Fds: []string{specs.SeccompFdName},
Pid: s.Pid,
Metadata: p.config.Config.Seccomp.ListenerMetadata,
State: *s,
}
if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath,
containerProcessState, seccompFd); err != nil {
return err
}
// Sync with child.
if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
return err
}
case procReady:
// Set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return fmt.Errorf("error setting rlimits for ready process: %w", err)
}
// call prestart and CreateRuntime hooks
if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
// Setup cgroup before the hook, so that the prestart and CreateRuntime hook could apply cgroup permissions.
if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil {
return fmt.Errorf("error setting cgroup config for ready process: %w", err)
}
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Set(p.config.Config); err != nil {
return fmt.Errorf("error setting Intel RDT config for ready process: %w", err)
}
}
if len(p.config.Config.Hooks) != 0 {
s, err := p.container.currentOCIState()
if err != nil {
return err
}
// initProcessStartTime hasn't been set yet.
s.Pid = p.cmd.Process.Pid
s.Status = specs.StateCreating
hooks := p.config.Config.Hooks
if err := hooks[configs.Prestart].RunHooks(s); err != nil {
return err
}
if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {
return err
}
}
}
// generate a timestamp indicating when the container was started
p.container.created = time.Now().UTC()
p.container.state = &createdState{
c: p.container,
}
// NOTE: If the procRun state has been synced and the
// runc-create process has been killed for some reason,
// the runc-init[2:stage] process will be leaky. And
// the runc command also fails to parse root directory
// because the container doesn't have state.json.
//
// In order to cleanup the runc-init[2:stage] by
// runc-delete/stop, we should store the status before
// procRun sync.
state, uerr := p.container.updateState(p)
if uerr != nil {
return fmt.Errorf("unable to store init state: %w", err)
}
p.container.initProcessStartTime = state.InitProcessStartTime
// Sync with child.
if err := writeSync(p.messageSockPair.parent, procRun); err != nil {
return err
}
sentRun = true
case procHooks:
// Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil {
return fmt.Errorf("error setting cgroup config for procHooks process: %w", err)
}
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Set(p.config.Config); err != nil {
return fmt.Errorf("error setting Intel RDT config for procHooks process: %w", err)
}
}
if len(p.config.Config.Hooks) != 0 {
s, err := p.container.currentOCIState()
if err != nil {
return err
}
// initProcessStartTime hasn't been set yet.
s.Pid = p.cmd.Process.Pid
s.Status = specs.StateCreating
hooks := p.config.Config.Hooks
if err := hooks[configs.Prestart].RunHooks(s); err != nil {
return err
}
if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {
return err
}
}
// Sync with child.
if err := writeSync(p.messageSockPair.parent, procResume); err != nil {
return err
}
sentResume = true
default:
return errors.New("invalid JSON payload from child")
}
return nil
})
if !sentRun {
return fmt.Errorf("error during container init: %w", ierr)
}
if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
return errors.New("could not synchronise after executing prestart and CreateRuntime hooks with container process")
}
if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
return &os.PathError{Op: "shutdown", Path: "(init pipe)", Err: err}
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
_, _ = p.wait()
return ierr
}
return nil
}
func (p *initProcess) wait() (*os.ProcessState, error) {
err := p.cmd.Wait()
// we should kill all processes in cgroup when init is died if we use host PID namespace
if p.sharePidns {
_ = signalAllProcesses(p.manager, unix.SIGKILL)
}
return p.cmd.ProcessState, err
}
func (p *initProcess) terminate() error {
if p.cmd.Process == nil {
return nil
}
err := p.cmd.Process.Kill()
if _, werr := p.wait(); err == nil {
err = werr
}
return err
}
func (p *initProcess) startTime() (uint64, error) {
stat, err := system.Stat(p.pid())
return stat.StartTime, err
}
func (p *initProcess) updateSpecState() error {
s, err := p.container.currentOCIState()
if err != nil {
return err
}
p.config.SpecState = s
return nil
}
func (p *initProcess) sendConfig() error {
// send the config to the container's init process, we don't use JSON Encode
// here because there might be a problem in JSON decoder in some cases, see:
// https://github.com/docker/docker/issues/14203#issuecomment-174177790
return utils.WriteJSON(p.messageSockPair.parent, p.config)
}
func (p *initProcess) createNetworkInterfaces() error {
for _, config := range p.config.Config.Networks {
strategy, err := getStrategy(config.Type)
if err != nil {
return err
}
n := &network{
Network: *config,
}
if err := strategy.create(n, p.pid()); err != nil {
return err
}
p.config.Networks = append(p.config.Networks, n)
}
return nil
}
func (p *initProcess) signal(sig os.Signal) error {
s, ok := sig.(unix.Signal)
if !ok {
return errors.New("os: unsupported signal type")
}
return unix.Kill(p.pid(), s)
}
func (p *initProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}
func (p *initProcess) forwardChildLogs() chan error {
return logs.ForwardLogs(p.logFilePair.parent)
}
func recvSeccompFd(childPid, childFd uintptr) (int, error) {
pidfd, _, errno := unix.Syscall(unix.SYS_PIDFD_OPEN, childPid, 0, 0)
if errno != 0 {
return -1, fmt.Errorf("performing SYS_PIDFD_OPEN syscall: %w", errno)
}
defer unix.Close(int(pidfd))
seccompFd, _, errno := unix.Syscall(unix.SYS_PIDFD_GETFD, pidfd, childFd, 0)
if errno != 0 {
return -1, fmt.Errorf("performing SYS_PIDFD_GETFD syscall: %w", errno)
}
return int(seccompFd), nil
}
func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, fd int) error {
conn, err := net.Dial("unix", listenerPath)
if err != nil {
return fmt.Errorf("failed to connect with seccomp agent specified in the seccomp profile: %w", err)
}
socket, err := conn.(*net.UnixConn).File()
if err != nil {
return fmt.Errorf("cannot get seccomp socket: %w", err)
}
defer socket.Close()
b, err := json.Marshal(state)
if err != nil {
return fmt.Errorf("cannot marshall seccomp state: %w", err)
}
err = utils.SendFds(socket, b, fd)
if err != nil {
return fmt.Errorf("cannot send seccomp fd to %s: %w", listenerPath, err)
}
return nil
}
func getPipeFds(pid int) ([]string, error) {
fds := make([]string, 3)
dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
for i := 0; i < 3; i++ {
// XXX: This breaks if the path is not a valid symlink (which can
// happen in certain particularly unlucky mount namespace setups).
f := filepath.Join(dirPath, strconv.Itoa(i))
target, err := os.Readlink(f)
if err != nil {
// Ignore permission errors, for rootless containers and other
// non-dumpable processes. if we can't get the fd for a particular
// file, there's not much we can do.
if os.IsPermission(err) {
continue
}
return fds, err
}
fds[i] = target
}
return fds, nil
}
// InitializeIO creates pipes for use with the process's stdio and returns the
// opposite side for each. Do not use this if you want to have a pseudoterminal
// set up for you by libcontainer (TODO: fix that too).
// TODO: This is mostly unnecessary, and should be handled by clients.
func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
var fds []uintptr
i = &IO{}
// cleanup in case of an error
defer func() {
if err != nil {
for _, fd := range fds {
_ = unix.Close(int(fd))
}
}
}()
// STDIN
r, w, err := os.Pipe()
if err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stdin, i.Stdin = r, w
// STDOUT
if r, w, err = os.Pipe(); err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stdout, i.Stdout = w, r
// STDERR
if r, w, err = os.Pipe(); err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stderr, i.Stderr = w, r
// change ownership of the pipes in case we are in a user namespace
for _, fd := range fds {
if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil {
return nil, &os.PathError{Op: "fchown", Path: "fd " + strconv.Itoa(int(fd)), Err: err}
}
}
return i, nil
}
// initWaiter returns a channel to wait on for making sure
// runc init has finished the initial setup.
func initWaiter(r io.Reader) chan error {
ch := make(chan error, 1)
go func() {
defer close(ch)
inited := make([]byte, 1)
n, err := r.Read(inited)
if err == nil {
if n < 1 {
err = errors.New("short read")
} else if inited[0] != 0 {
err = fmt.Errorf("unexpected %d != 0", inited[0])
} else {
ch <- nil
return
}
}
ch <- fmt.Errorf("waiting for init preliminary setup: %w", err)
}()
return ch
}

View File

@@ -1,128 +0,0 @@
package libcontainer
import (
"errors"
"os"
"os/exec"
"github.com/opencontainers/runc/libcontainer/system"
)
func newRestoredProcess(cmd *exec.Cmd, fds []string) (*restoredProcess, error) {
var err error
pid := cmd.Process.Pid
stat, err := system.Stat(pid)
if err != nil {
return nil, err
}
return &restoredProcess{
cmd: cmd,
processStartTime: stat.StartTime,
fds: fds,
}, nil
}
type restoredProcess struct {
cmd *exec.Cmd
processStartTime uint64
fds []string
}
func (p *restoredProcess) start() error {
return errors.New("restored process cannot be started")
}
func (p *restoredProcess) pid() int {
return p.cmd.Process.Pid
}
func (p *restoredProcess) terminate() error {
err := p.cmd.Process.Kill()
if _, werr := p.wait(); err == nil {
err = werr
}
return err
}
func (p *restoredProcess) wait() (*os.ProcessState, error) {
// TODO: how do we wait on the actual process?
// maybe use --exec-cmd in criu
err := p.cmd.Wait()
if err != nil {
var exitErr *exec.ExitError
if !errors.As(err, &exitErr) {
return nil, err
}
}
st := p.cmd.ProcessState
return st, nil
}
func (p *restoredProcess) startTime() (uint64, error) {
return p.processStartTime, nil
}
func (p *restoredProcess) signal(s os.Signal) error {
return p.cmd.Process.Signal(s)
}
func (p *restoredProcess) externalDescriptors() []string {
return p.fds
}
func (p *restoredProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}
func (p *restoredProcess) forwardChildLogs() chan error {
return nil
}
// nonChildProcess represents a process where the calling process is not
// the parent process. This process is created when a factory loads a container from
// a persisted state.
type nonChildProcess struct {
processPid int
processStartTime uint64
fds []string
}
func (p *nonChildProcess) start() error {
return errors.New("restored process cannot be started")
}
func (p *nonChildProcess) pid() int {
return p.processPid
}
func (p *nonChildProcess) terminate() error {
return errors.New("restored process cannot be terminated")
}
func (p *nonChildProcess) wait() (*os.ProcessState, error) {
return nil, errors.New("restored process cannot be waited on")
}
func (p *nonChildProcess) startTime() (uint64, error) {
return p.processStartTime, nil
}
func (p *nonChildProcess) signal(s os.Signal) error {
proc, err := os.FindProcess(p.processPid)
if err != nil {
return err
}
return proc.Signal(s)
}
func (p *nonChildProcess) externalDescriptors() []string {
return p.fds
}
func (p *nonChildProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}
func (p *nonChildProcess) forwardChildLogs() chan error {
return nil
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,113 +0,0 @@
package seccomp
import (
"fmt"
"sort"
"github.com/opencontainers/runc/libcontainer/configs"
)
var operators = map[string]configs.Operator{
"SCMP_CMP_NE": configs.NotEqualTo,
"SCMP_CMP_LT": configs.LessThan,
"SCMP_CMP_LE": configs.LessThanOrEqualTo,
"SCMP_CMP_EQ": configs.EqualTo,
"SCMP_CMP_GE": configs.GreaterThanOrEqualTo,
"SCMP_CMP_GT": configs.GreaterThan,
"SCMP_CMP_MASKED_EQ": configs.MaskEqualTo,
}
// KnownOperators returns the list of the known operations.
// Used by `runc features`.
func KnownOperators() []string {
var res []string
for k := range operators {
res = append(res, k)
}
sort.Strings(res)
return res
}
var actions = map[string]configs.Action{
"SCMP_ACT_KILL": configs.Kill,
"SCMP_ACT_ERRNO": configs.Errno,
"SCMP_ACT_TRAP": configs.Trap,
"SCMP_ACT_ALLOW": configs.Allow,
"SCMP_ACT_TRACE": configs.Trace,
"SCMP_ACT_LOG": configs.Log,
"SCMP_ACT_NOTIFY": configs.Notify,
"SCMP_ACT_KILL_THREAD": configs.KillThread,
"SCMP_ACT_KILL_PROCESS": configs.KillProcess,
}
// KnownActions returns the list of the known actions.
// Used by `runc features`.
func KnownActions() []string {
var res []string
for k := range actions {
res = append(res, k)
}
sort.Strings(res)
return res
}
var archs = map[string]string{
"SCMP_ARCH_X86": "x86",
"SCMP_ARCH_X86_64": "amd64",
"SCMP_ARCH_X32": "x32",
"SCMP_ARCH_ARM": "arm",
"SCMP_ARCH_AARCH64": "arm64",
"SCMP_ARCH_MIPS": "mips",
"SCMP_ARCH_MIPS64": "mips64",
"SCMP_ARCH_MIPS64N32": "mips64n32",
"SCMP_ARCH_MIPSEL": "mipsel",
"SCMP_ARCH_MIPSEL64": "mipsel64",
"SCMP_ARCH_MIPSEL64N32": "mipsel64n32",
"SCMP_ARCH_PPC": "ppc",
"SCMP_ARCH_PPC64": "ppc64",
"SCMP_ARCH_PPC64LE": "ppc64le",
"SCMP_ARCH_RISCV64": "riscv64",
"SCMP_ARCH_S390": "s390",
"SCMP_ARCH_S390X": "s390x",
}
// KnownArchs returns the list of the known archs.
// Used by `runc features`.
func KnownArchs() []string {
var res []string
for k := range archs {
res = append(res, k)
}
sort.Strings(res)
return res
}
// ConvertStringToOperator converts a string into a Seccomp comparison operator.
// Comparison operators use the names they are assigned by Libseccomp's header.
// Attempting to convert a string that is not a valid operator results in an
// error.
func ConvertStringToOperator(in string) (configs.Operator, error) {
if op, ok := operators[in]; ok {
return op, nil
}
return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in)
}
// ConvertStringToAction converts a string into a Seccomp rule match action.
// Actions use the names they are assigned in Libseccomp's header.
// Attempting to convert a string that is not a valid action results in an
// error.
func ConvertStringToAction(in string) (configs.Action, error) {
if act, ok := actions[in]; ok {
return act, nil
}
return 0, fmt.Errorf("string %s is not a valid action for seccomp", in)
}
// ConvertStringToArch converts a string into a Seccomp comparison arch.
func ConvertStringToArch(in string) (string, error) {
if arch, ok := archs[in]; ok {
return arch, nil
}
return "", fmt.Errorf("string %s is not a valid arch for seccomp", in)
}

View File

@@ -1,721 +0,0 @@
//go:build cgo && seccomp
// +build cgo,seccomp
package patchbpf
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
"os"
"runtime"
"unsafe"
libseccomp "github.com/seccomp/libseccomp-golang"
"github.com/sirupsen/logrus"
"golang.org/x/net/bpf"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
)
// #cgo pkg-config: libseccomp
/*
#include <errno.h>
#include <stdint.h>
#include <seccomp.h>
#include <linux/seccomp.h>
const uint32_t C_ACT_ERRNO_ENOSYS = SCMP_ACT_ERRNO(ENOSYS);
// Copied from <linux/seccomp.h>.
#ifndef SECCOMP_SET_MODE_FILTER
# define SECCOMP_SET_MODE_FILTER 1
#endif
const uintptr_t C_SET_MODE_FILTER = SECCOMP_SET_MODE_FILTER;
#ifndef SECCOMP_FILTER_FLAG_LOG
# define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
#endif
const uintptr_t C_FILTER_FLAG_LOG = SECCOMP_FILTER_FLAG_LOG;
#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
# define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
#endif
const uintptr_t C_FILTER_FLAG_NEW_LISTENER = SECCOMP_FILTER_FLAG_NEW_LISTENER;
#ifndef AUDIT_ARCH_RISCV64
#ifndef EM_RISCV
#define EM_RISCV 243
#endif
#define AUDIT_ARCH_RISCV64 (EM_RISCV|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#endif
// We use the AUDIT_ARCH_* values because those are the ones used by the kernel
// and SCMP_ARCH_* sometimes has fake values (such as SCMP_ARCH_X32). But we
// use <seccomp.h> so we get libseccomp's fallback definitions of AUDIT_ARCH_*.
const uint32_t C_AUDIT_ARCH_I386 = AUDIT_ARCH_I386;
const uint32_t C_AUDIT_ARCH_X86_64 = AUDIT_ARCH_X86_64;
const uint32_t C_AUDIT_ARCH_ARM = AUDIT_ARCH_ARM;
const uint32_t C_AUDIT_ARCH_AARCH64 = AUDIT_ARCH_AARCH64;
const uint32_t C_AUDIT_ARCH_MIPS = AUDIT_ARCH_MIPS;
const uint32_t C_AUDIT_ARCH_MIPS64 = AUDIT_ARCH_MIPS64;
const uint32_t C_AUDIT_ARCH_MIPS64N32 = AUDIT_ARCH_MIPS64N32;
const uint32_t C_AUDIT_ARCH_MIPSEL = AUDIT_ARCH_MIPSEL;
const uint32_t C_AUDIT_ARCH_MIPSEL64 = AUDIT_ARCH_MIPSEL64;
const uint32_t C_AUDIT_ARCH_MIPSEL64N32 = AUDIT_ARCH_MIPSEL64N32;
const uint32_t C_AUDIT_ARCH_PPC = AUDIT_ARCH_PPC;
const uint32_t C_AUDIT_ARCH_PPC64 = AUDIT_ARCH_PPC64;
const uint32_t C_AUDIT_ARCH_PPC64LE = AUDIT_ARCH_PPC64LE;
const uint32_t C_AUDIT_ARCH_S390 = AUDIT_ARCH_S390;
const uint32_t C_AUDIT_ARCH_S390X = AUDIT_ARCH_S390X;
const uint32_t C_AUDIT_ARCH_RISCV64 = AUDIT_ARCH_RISCV64;
*/
import "C"
var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS)
// This syscall is used for multiplexing "large" syscalls on s390(x). Unknown
// syscalls will end up with this syscall number, so we need to explicitly
// return -ENOSYS for this syscall on those architectures.
const s390xMultiplexSyscall libseccomp.ScmpSyscall = 0
func isAllowAction(action configs.Action) bool {
switch action {
// Trace is considered an "allow" action because a good tracer should
// support future syscalls (by handling -ENOSYS on its own), and giving
// -ENOSYS will be disruptive for emulation.
case configs.Allow, configs.Log, configs.Trace:
return true
default:
return false
}
}
func parseProgram(rdr io.Reader) ([]bpf.RawInstruction, error) {
var program []bpf.RawInstruction
loop:
for {
// Read the next instruction. We have to use NativeEndian because
// seccomp_export_bpf outputs the program in *host* endian-ness.
var insn unix.SockFilter
if err := binary.Read(rdr, utils.NativeEndian, &insn); err != nil {
if errors.Is(err, io.EOF) {
// Parsing complete.
break loop
}
if errors.Is(err, io.ErrUnexpectedEOF) {
// Parsing stopped mid-instruction.
return nil, fmt.Errorf("program parsing halted mid-instruction: %w", err)
}
// All other errors.
return nil, fmt.Errorf("error parsing instructions: %w", err)
}
program = append(program, bpf.RawInstruction{
Op: insn.Code,
Jt: insn.Jt,
Jf: insn.Jf,
K: insn.K,
})
}
return program, nil
}
func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error) {
rdr, wtr, err := os.Pipe()
if err != nil {
return nil, fmt.Errorf("error creating scratch pipe: %w", err)
}
defer wtr.Close()
defer rdr.Close()
readerBuffer := new(bytes.Buffer)
errChan := make(chan error, 1)
go func() {
_, err := io.Copy(readerBuffer, rdr)
errChan <- err
close(errChan)
}()
if err := filter.ExportBPF(wtr); err != nil {
return nil, fmt.Errorf("error exporting BPF: %w", err)
}
// Close so that the reader actually gets EOF.
_ = wtr.Close()
if copyErr := <-errChan; copyErr != nil {
return nil, fmt.Errorf("error reading from ExportBPF pipe: %w", copyErr)
}
// Parse the instructions.
rawProgram, err := parseProgram(readerBuffer)
if err != nil {
return nil, fmt.Errorf("parsing generated BPF filter: %w", err)
}
program, ok := bpf.Disassemble(rawProgram)
if !ok {
return nil, errors.New("could not disassemble entire BPF filter")
}
return program, nil
}
type linuxAuditArch uint32
const invalidArch linuxAuditArch = 0
func scmpArchToAuditArch(arch libseccomp.ScmpArch) (linuxAuditArch, error) {
switch arch {
case libseccomp.ArchNative:
// Convert to actual native architecture.
arch, err := libseccomp.GetNativeArch()
if err != nil {
return invalidArch, fmt.Errorf("unable to get native arch: %w", err)
}
return scmpArchToAuditArch(arch)
case libseccomp.ArchX86:
return linuxAuditArch(C.C_AUDIT_ARCH_I386), nil
case libseccomp.ArchAMD64, libseccomp.ArchX32:
// NOTE: x32 is treated like x86_64 except all x32 syscalls have the
// 30th bit of the syscall number set to indicate that it's not a
// normal x86_64 syscall.
return linuxAuditArch(C.C_AUDIT_ARCH_X86_64), nil
case libseccomp.ArchARM:
return linuxAuditArch(C.C_AUDIT_ARCH_ARM), nil
case libseccomp.ArchARM64:
return linuxAuditArch(C.C_AUDIT_ARCH_AARCH64), nil
case libseccomp.ArchMIPS:
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS), nil
case libseccomp.ArchMIPS64:
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64), nil
case libseccomp.ArchMIPS64N32:
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64N32), nil
case libseccomp.ArchMIPSEL:
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL), nil
case libseccomp.ArchMIPSEL64:
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64), nil
case libseccomp.ArchMIPSEL64N32:
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
case libseccomp.ArchPPC:
return linuxAuditArch(C.C_AUDIT_ARCH_PPC), nil
case libseccomp.ArchPPC64:
return linuxAuditArch(C.C_AUDIT_ARCH_PPC64), nil
case libseccomp.ArchPPC64LE:
return linuxAuditArch(C.C_AUDIT_ARCH_PPC64LE), nil
case libseccomp.ArchS390:
return linuxAuditArch(C.C_AUDIT_ARCH_S390), nil
case libseccomp.ArchS390X:
return linuxAuditArch(C.C_AUDIT_ARCH_S390X), nil
case libseccomp.ArchRISCV64:
return linuxAuditArch(C.C_AUDIT_ARCH_RISCV64), nil
default:
return invalidArch, fmt.Errorf("unknown architecture: %v", arch)
}
}
type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall
// Figure out largest syscall number referenced in the filter for each
// architecture. We will be generating code based on the native architecture
// representation, but SCMP_ARCH_X32 means we have to track cases where the
// same architecture has different largest syscalls based on the mode.
func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
scmpArchs := make(map[libseccomp.ScmpArch]struct{})
for _, ociArch := range config.Architectures {
arch, err := libseccomp.GetArchFromString(ociArch)
if err != nil {
return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err)
}
scmpArchs[arch] = struct{}{}
}
// On architectures like ppc64le, Docker inexplicably doesn't include the
// native architecture in the architecture list which results in no
// architectures being present in the list at all (rendering the ENOSYS
// stub a no-op). So, always include the native architecture.
if nativeScmpArch, err := libseccomp.GetNativeArch(); err != nil {
return nil, fmt.Errorf("unable to get native arch: %w", err)
} else if _, ok := scmpArchs[nativeScmpArch]; !ok {
logrus.Debugf("seccomp: adding implied native architecture %v to config set", nativeScmpArch)
scmpArchs[nativeScmpArch] = struct{}{}
}
logrus.Debugf("seccomp: configured architecture set: %s", scmpArchs)
// Only loop over architectures which are present in the filter. Any other
// architectures will get the libseccomp bad architecture action anyway.
lastSyscalls := make(lastSyscallMap)
for arch := range scmpArchs {
auditArch, err := scmpArchToAuditArch(arch)
if err != nil {
return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err)
}
if _, ok := lastSyscalls[auditArch]; !ok {
lastSyscalls[auditArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
}
if _, ok := lastSyscalls[auditArch][arch]; ok {
// Because of ArchNative we may hit the same entry multiple times.
// Just skip it if we've seen this (linuxAuditArch, ScmpArch)
// combination before.
continue
}
// Find the largest syscall in the filter for this architecture.
var largestSyscall libseccomp.ScmpSyscall
for _, rule := range config.Syscalls {
sysno, err := libseccomp.GetSyscallFromNameByArch(rule.Name, arch)
if err != nil {
// Ignore unknown syscalls.
continue
}
if sysno > largestSyscall {
largestSyscall = sysno
}
}
if largestSyscall != 0 {
logrus.Debugf("seccomp: largest syscall number for arch %v is %v", arch, largestSyscall)
lastSyscalls[auditArch][arch] = largestSyscall
} else {
logrus.Warnf("could not find any syscalls for arch %v", arch)
delete(lastSyscalls[auditArch], arch)
}
}
return lastSyscalls, nil
}
// FIXME FIXME FIXME
//
// This solution is less than ideal. In the future it would be great to have
// per-arch information about which syscalls were added in which kernel
// versions so we can create far more accurate filter rules (handling holes in
// the syscall table and determining -ENOSYS requirements based on kernel
// minimum version alone.
//
// This implementation can in principle cause issues with syscalls like
// close_range(2) which were added out-of-order in the syscall table between
// kernel releases.
func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) {
// A jump-table for each linuxAuditArch used to generate the initial
// conditional jumps -- measured from the *END* of the program so they
// remain valid after prepending to the tail.
archJumpTable := map[linuxAuditArch]uint32{}
// Generate our own -ENOSYS rules for each architecture. They have to be
// generated in reverse (prepended to the tail of the program) because the
// JumpIf jumps need to be computed from the end of the program.
programTail := []bpf.Instruction{
// Fall-through rules jump into the filter.
bpf.Jump{Skip: 1},
// Rules which jump to here get -ENOSYS.
bpf.RetConstant{Val: retErrnoEnosys},
}
// Generate the syscall -ENOSYS rules.
for auditArch, maxSyscalls := range lastSyscalls {
// The number of instructions from the tail of this section which need
// to be jumped in order to reach the -ENOSYS return. If the section
// does not jump, it will fall through to the actual filter.
baseJumpEnosys := uint32(len(programTail) - 1)
baseJumpFilter := baseJumpEnosys + 1
// Add the load instruction for the syscall number -- we jump here
// directly from the arch code so we need to do it here. Sadly we can't
// share this code between architecture branches.
section := []bpf.Instruction{
// load [0] (syscall number)
bpf.LoadAbsolute{Off: 0, Size: 4}, // NOTE: We assume sizeof(int) == 4.
}
switch len(maxSyscalls) {
case 0:
// No syscalls found for this arch -- skip it and move on.
continue
case 1:
// Get the only syscall and scmpArch in the map.
var (
scmpArch libseccomp.ScmpArch
sysno libseccomp.ScmpSyscall
)
for arch, no := range maxSyscalls {
sysno = no
scmpArch = arch
}
switch scmpArch {
// Return -ENOSYS for setup(2) on s390(x). This syscall is used for
// multiplexing "large syscall number" syscalls, but if the syscall
// number is not known to the kernel then the syscall number is
// left unchanged (and because it is sysno=0, you'll end up with
// EPERM for syscalls the kernel doesn't know about).
//
// The actual setup(2) syscall is never used by userspace anymore
// (and hasn't existed for decades) outside of this multiplexing
// scheme so returning -ENOSYS is fine.
case libseccomp.ArchS390, libseccomp.ArchS390X:
section = append(section, []bpf.Instruction{
// jne [setup=0],1
bpf.JumpIf{
Cond: bpf.JumpNotEqual,
Val: uint32(s390xMultiplexSyscall),
SkipTrue: 1,
},
// ret [ENOSYS]
bpf.RetConstant{Val: retErrnoEnosys},
}...)
}
// The simplest case just boils down to a single jgt instruction,
// with special handling if baseJumpEnosys is larger than 255 (and
// thus a long jump is required).
var sectionTail []bpf.Instruction
if baseJumpEnosys+1 <= 255 {
sectionTail = []bpf.Instruction{
// jgt [syscall],[baseJumpEnosys+1]
bpf.JumpIf{
Cond: bpf.JumpGreaterThan,
Val: uint32(sysno),
SkipTrue: uint8(baseJumpEnosys + 1),
},
// ja [baseJumpFilter]
bpf.Jump{Skip: baseJumpFilter},
}
} else {
sectionTail = []bpf.Instruction{
// jle [syscall],1
bpf.JumpIf{Cond: bpf.JumpLessOrEqual, Val: uint32(sysno), SkipTrue: 1},
// ja [baseJumpEnosys+1]
bpf.Jump{Skip: baseJumpEnosys + 1},
// ja [baseJumpFilter]
bpf.Jump{Skip: baseJumpFilter},
}
}
// If we're on x86 we need to add a check for x32 and if we're in
// the wrong mode we jump over the section.
if uint32(auditArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
// Generate a prefix to check the mode.
switch scmpArch {
case libseccomp.ArchAMD64:
sectionTail = append([]bpf.Instruction{
// jset (1<<30),[len(tail)-1]
bpf.JumpIf{
Cond: bpf.JumpBitsSet,
Val: 1 << 30,
SkipTrue: uint8(len(sectionTail) - 1),
},
}, sectionTail...)
case libseccomp.ArchX32:
sectionTail = append([]bpf.Instruction{
// jset (1<<30),0,[len(tail)-1]
bpf.JumpIf{
Cond: bpf.JumpBitsNotSet,
Val: 1 << 30,
SkipTrue: uint8(len(sectionTail) - 1),
},
}, sectionTail...)
default:
return nil, fmt.Errorf("unknown amd64 native architecture %#x", scmpArch)
}
}
section = append(section, sectionTail...)
case 2:
// x32 and x86_64 are a unique case, we can't handle any others.
if uint32(auditArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", auditArch)
}
x32sysno, ok := maxSyscalls[libseccomp.ArchX32]
if !ok {
return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchX32, maxSyscalls)
}
x86sysno, ok := maxSyscalls[libseccomp.ArchAMD64]
if !ok {
return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchAMD64, maxSyscalls)
}
// The x32 ABI indicates that a syscall is being made by an x32
// process by setting the 30th bit of the syscall number, but we
// need to do some special-casing depending on whether we need to
// do long jumps.
if baseJumpEnosys+2 <= 255 {
// For the simple case we want to have something like:
// jset (1<<30),1
// jgt [x86 syscall],[baseJumpEnosys+2],1
// jgt [x32 syscall],[baseJumpEnosys+1]
// ja [baseJumpFilter]
section = append(section, []bpf.Instruction{
// jset (1<<30),1
bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
// jgt [x86 syscall],[baseJumpEnosys+1],1
bpf.JumpIf{
Cond: bpf.JumpGreaterThan,
Val: uint32(x86sysno),
SkipTrue: uint8(baseJumpEnosys + 2), SkipFalse: 1,
},
// jgt [x32 syscall],[baseJumpEnosys]
bpf.JumpIf{
Cond: bpf.JumpGreaterThan,
Val: uint32(x32sysno),
SkipTrue: uint8(baseJumpEnosys + 1),
},
// ja [baseJumpFilter]
bpf.Jump{Skip: baseJumpFilter},
}...)
} else {
// But if the [baseJumpEnosys+2] jump is larger than 255 we
// need to do a long jump like so:
// jset (1<<30),1
// jgt [x86 syscall],1,2
// jle [x32 syscall],1
// ja [baseJumpEnosys+1]
// ja [baseJumpFilter]
section = append(section, []bpf.Instruction{
// jset (1<<30),1
bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
// jgt [x86 syscall],1,2
bpf.JumpIf{
Cond: bpf.JumpGreaterThan,
Val: uint32(x86sysno),
SkipTrue: 1, SkipFalse: 2,
},
// jle [x32 syscall],[baseJumpEnosys]
bpf.JumpIf{
Cond: bpf.JumpLessOrEqual,
Val: uint32(x32sysno),
SkipTrue: 1,
},
// ja [baseJumpEnosys+1]
bpf.Jump{Skip: baseJumpEnosys + 1},
// ja [baseJumpFilter]
bpf.Jump{Skip: baseJumpFilter},
}...)
}
default:
return nil, fmt.Errorf("invalid number of architecture overlaps: %v", len(maxSyscalls))
}
// Prepend this section to the tail.
programTail = append(section, programTail...)
// Update jump table.
archJumpTable[auditArch] = uint32(len(programTail))
}
// Add a dummy "jump to filter" for any architecture we might miss below.
// Such architectures will probably get the BadArch action of the filter
// regardless.
programTail = append([]bpf.Instruction{
// ja [end of stub and start of filter]
bpf.Jump{Skip: uint32(len(programTail))},
}, programTail...)
// Generate the jump rules for each architecture. This has to be done in
// reverse as well for the same reason as above. We add to programTail
// directly because the jumps are impacted by each architecture rule we add
// as well.
//
// TODO: Maybe we want to optimise to avoid long jumps here? So sort the
// architectures based on how large the jumps are going to be, or
// re-sort the candidate architectures each time to make sure that we
// pick the largest jump which is going to be smaller than 255.
for auditArch := range lastSyscalls {
// We jump forwards but the jump table is calculated from the *END*.
jump := uint32(len(programTail)) - archJumpTable[auditArch]
// Same routine as above -- this is a basic jeq check, complicated
// slightly if it turns out that we need to do a long jump.
if jump <= 255 {
programTail = append([]bpf.Instruction{
// jeq [arch],[jump]
bpf.JumpIf{
Cond: bpf.JumpEqual,
Val: uint32(auditArch),
SkipTrue: uint8(jump),
},
}, programTail...)
} else {
programTail = append([]bpf.Instruction{
// jne [arch],1
bpf.JumpIf{
Cond: bpf.JumpNotEqual,
Val: uint32(auditArch),
SkipTrue: 1,
},
// ja [jump]
bpf.Jump{Skip: jump},
}, programTail...)
}
}
// Prepend the load instruction for the architecture.
programTail = append([]bpf.Instruction{
// load [4] (architecture)
bpf.LoadAbsolute{Off: 4, Size: 4}, // NOTE: We assume sizeof(int) == 4.
}, programTail...)
// And that's all folks!
return programTail, nil
}
func assemble(program []bpf.Instruction) ([]unix.SockFilter, error) {
rawProgram, err := bpf.Assemble(program)
if err != nil {
return nil, fmt.Errorf("error assembling program: %w", err)
}
// Convert to []unix.SockFilter for unix.SockFilter.
var filter []unix.SockFilter
for _, insn := range rawProgram {
filter = append(filter, unix.SockFilter{
Code: insn.Op,
Jt: insn.Jt,
Jf: insn.Jf,
K: insn.K,
})
}
return filter, nil
}
func generatePatch(config *configs.Seccomp) ([]bpf.Instruction, error) {
// Patch the generated cBPF only when there is not a defaultErrnoRet set
// and it is different from ENOSYS
if config.DefaultErrnoRet != nil && *config.DefaultErrnoRet == uint(retErrnoEnosys) {
return nil, nil
}
// We only add the stub if the default action is not permissive.
if isAllowAction(config.DefaultAction) {
logrus.Debugf("seccomp: skipping -ENOSYS stub filter generation")
return nil, nil
}
lastSyscalls, err := findLastSyscalls(config)
if err != nil {
return nil, fmt.Errorf("error finding last syscalls for -ENOSYS stub: %w", err)
}
stubProgram, err := generateEnosysStub(lastSyscalls)
if err != nil {
return nil, fmt.Errorf("error generating -ENOSYS stub: %w", err)
}
return stubProgram, nil
}
func enosysPatchFilter(config *configs.Seccomp, filter *libseccomp.ScmpFilter) ([]unix.SockFilter, error) {
program, err := disassembleFilter(filter)
if err != nil {
return nil, fmt.Errorf("error disassembling original filter: %w", err)
}
patch, err := generatePatch(config)
if err != nil {
return nil, fmt.Errorf("error generating patch for filter: %w", err)
}
fullProgram := append(patch, program...)
logrus.Debugf("seccomp: prepending -ENOSYS stub filter to user filter...")
for idx, insn := range patch {
logrus.Debugf(" [%4.1d] %s", idx, insn)
}
logrus.Debugf(" [....] --- original filter ---")
fprog, err := assemble(fullProgram)
if err != nil {
return nil, fmt.Errorf("error assembling modified filter: %w", err)
}
return fprog, nil
}
func filterFlags(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (flags uint, noNewPrivs bool, err error) {
// Ignore the error since pre-2.4 libseccomp is treated as API level 0.
apiLevel, _ := libseccomp.GetAPI()
noNewPrivs, err = filter.GetNoNewPrivsBit()
if err != nil {
return 0, false, fmt.Errorf("unable to fetch no_new_privs filter bit: %w", err)
}
if apiLevel >= 3 {
if logBit, err := filter.GetLogBit(); err != nil {
return 0, false, fmt.Errorf("unable to fetch SECCOMP_FILTER_FLAG_LOG bit: %w", err)
} else if logBit {
flags |= uint(C.C_FILTER_FLAG_LOG)
}
}
// TODO: Support seccomp flags not yet added to libseccomp-golang...
for _, call := range config.Syscalls {
if call.Action == configs.Notify {
flags |= uint(C.C_FILTER_FLAG_NEW_LISTENER)
break
}
}
return
}
func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (fd int, err error) {
fprog := unix.SockFprog{
Len: uint16(len(filter)),
Filter: &filter[0],
}
fd = -1 // only return a valid fd when C_FILTER_FLAG_NEW_LISTENER is set
// If no seccomp flags were requested we can use the old-school prctl(2).
if flags == 0 {
err = unix.Prctl(unix.PR_SET_SECCOMP,
unix.SECCOMP_MODE_FILTER,
uintptr(unsafe.Pointer(&fprog)), 0, 0)
} else {
fdptr, _, errno := unix.RawSyscall(unix.SYS_SECCOMP,
uintptr(C.C_SET_MODE_FILTER),
uintptr(flags), uintptr(unsafe.Pointer(&fprog)))
if errno != 0 {
err = errno
}
if flags&uint(C.C_FILTER_FLAG_NEW_LISTENER) != 0 {
fd = int(fdptr)
}
}
runtime.KeepAlive(filter)
runtime.KeepAlive(fprog)
return
}
// PatchAndLoad takes a seccomp configuration and a libseccomp filter which has
// been pre-configured with the set of rules in the seccomp config. It then
// patches said filter to handle -ENOSYS in a much nicer manner than the
// default libseccomp default action behaviour, and loads the patched filter
// into the kernel for the current process.
func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (int, error) {
// Generate a patched filter.
fprog, err := enosysPatchFilter(config, filter)
if err != nil {
return -1, fmt.Errorf("error patching filter: %w", err)
}
// Get the set of libseccomp flags set.
seccompFlags, noNewPrivs, err := filterFlags(config, filter)
if err != nil {
return -1, fmt.Errorf("unable to fetch seccomp filter flags: %w", err)
}
// Set no_new_privs if it was requested, though in runc we handle
// no_new_privs separately so warn if we hit this path.
if noNewPrivs {
logrus.Warnf("potentially misconfigured filter -- setting no_new_privs in seccomp path")
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return -1, fmt.Errorf("error enabling no_new_privs bit: %w", err)
}
}
// Finally, load the filter.
fd, err := sysSeccompSetFilter(seccompFlags, fprog)
if err != nil {
return -1, fmt.Errorf("error loading seccomp filter: %w", err)
}
return fd, nil
}

View File

@@ -1,4 +0,0 @@
//go:build !linux || !cgo || !seccomp
// +build !linux !cgo !seccomp
package patchbpf

View File

@@ -1,268 +0,0 @@
//go:build cgo && seccomp
// +build cgo,seccomp
package seccomp
import (
"errors"
"fmt"
libseccomp "github.com/seccomp/libseccomp-golang"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/seccomp/patchbpf"
)
var (
actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM))
actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM))
)
const (
// Linux system calls can have at most 6 arguments
syscallMaxArguments int = 6
)
// InitSeccomp installs the seccomp filters to be used in the container as
// specified in config.
// Returns the seccomp file descriptor if any of the filters include a
// SCMP_ACT_NOTIFY action, otherwise returns -1.
func InitSeccomp(config *configs.Seccomp) (int, error) {
if config == nil {
return -1, errors.New("cannot initialize Seccomp - nil config passed")
}
defaultAction, err := getAction(config.DefaultAction, config.DefaultErrnoRet)
if err != nil {
return -1, errors.New("error initializing seccomp - invalid default action")
}
// Ignore the error since pre-2.4 libseccomp is treated as API level 0.
apiLevel, _ := libseccomp.GetAPI()
for _, call := range config.Syscalls {
if call.Action == configs.Notify {
if apiLevel < 6 {
return -1, fmt.Errorf("seccomp notify unsupported: API level: got %d, want at least 6. Please try with libseccomp >= 2.5.0 and Linux >= 5.7", apiLevel)
}
// We can't allow the write syscall to notify to the seccomp agent.
// After InitSeccomp() is called, we need to syncParentSeccomp() to write the seccomp fd plain
// number, so the parent sends it to the seccomp agent. If we use SCMP_ACT_NOTIFY on write, we
// never can write the seccomp fd to the parent and therefore the seccomp agent never receives
// the seccomp fd and runc is hang during initialization.
//
// Note that read()/close(), that are also used in syncParentSeccomp(), _can_ use SCMP_ACT_NOTIFY.
// Because we write the seccomp fd on the pipe to the parent, the parent is able to proceed and
// send the seccomp fd to the agent (it is another process and not subject to the seccomp
// filter). We will be blocked on read()/close() inside syncParentSeccomp() but if the seccomp
// agent allows those syscalls to proceed, initialization works just fine and the agent can
// handle future read()/close() syscalls as it wanted.
if call.Name == "write" {
return -1, errors.New("SCMP_ACT_NOTIFY cannot be used for the write syscall")
}
}
}
// See comment on why write is not allowed. The same reason applies, as this can mean handling write too.
if defaultAction == libseccomp.ActNotify {
return -1, errors.New("SCMP_ACT_NOTIFY cannot be used as default action")
}
filter, err := libseccomp.NewFilter(defaultAction)
if err != nil {
return -1, fmt.Errorf("error creating filter: %w", err)
}
// Add extra architectures
for _, arch := range config.Architectures {
scmpArch, err := libseccomp.GetArchFromString(arch)
if err != nil {
return -1, fmt.Errorf("error validating Seccomp architecture: %w", err)
}
if err := filter.AddArch(scmpArch); err != nil {
return -1, fmt.Errorf("error adding architecture to seccomp filter: %w", err)
}
}
// Unset no new privs bit
if err := filter.SetNoNewPrivsBit(false); err != nil {
return -1, fmt.Errorf("error setting no new privileges: %w", err)
}
// Add a rule for each syscall
for _, call := range config.Syscalls {
if call == nil {
return -1, errors.New("encountered nil syscall while initializing Seccomp")
}
if err := matchCall(filter, call, defaultAction); err != nil {
return -1, err
}
}
seccompFd, err := patchbpf.PatchAndLoad(config, filter)
if err != nil {
return -1, fmt.Errorf("error loading seccomp filter into kernel: %w", err)
}
return seccompFd, nil
}
// Convert Libcontainer Action to Libseccomp ScmpAction
func getAction(act configs.Action, errnoRet *uint) (libseccomp.ScmpAction, error) {
switch act {
case configs.Kill, configs.KillThread:
return libseccomp.ActKillThread, nil
case configs.Errno:
if errnoRet != nil {
return libseccomp.ActErrno.SetReturnCode(int16(*errnoRet)), nil
}
return actErrno, nil
case configs.Trap:
return libseccomp.ActTrap, nil
case configs.Allow:
return libseccomp.ActAllow, nil
case configs.Trace:
if errnoRet != nil {
return libseccomp.ActTrace.SetReturnCode(int16(*errnoRet)), nil
}
return actTrace, nil
case configs.Log:
return libseccomp.ActLog, nil
case configs.Notify:
return libseccomp.ActNotify, nil
case configs.KillProcess:
return libseccomp.ActKillProcess, nil
default:
return libseccomp.ActInvalid, errors.New("invalid action, cannot use in rule")
}
}
// Convert Libcontainer Operator to Libseccomp ScmpCompareOp
func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) {
switch op {
case configs.EqualTo:
return libseccomp.CompareEqual, nil
case configs.NotEqualTo:
return libseccomp.CompareNotEqual, nil
case configs.GreaterThan:
return libseccomp.CompareGreater, nil
case configs.GreaterThanOrEqualTo:
return libseccomp.CompareGreaterEqual, nil
case configs.LessThan:
return libseccomp.CompareLess, nil
case configs.LessThanOrEqualTo:
return libseccomp.CompareLessOrEqual, nil
case configs.MaskEqualTo:
return libseccomp.CompareMaskedEqual, nil
default:
return libseccomp.CompareInvalid, errors.New("invalid operator, cannot use in rule")
}
}
// Convert Libcontainer Arg to Libseccomp ScmpCondition
func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) {
cond := libseccomp.ScmpCondition{}
if arg == nil {
return cond, errors.New("cannot convert nil to syscall condition")
}
op, err := getOperator(arg.Op)
if err != nil {
return cond, err
}
return libseccomp.MakeCondition(arg.Index, op, arg.Value, arg.ValueTwo)
}
// Add a rule to match a single syscall
func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall, defAct libseccomp.ScmpAction) error {
if call == nil || filter == nil {
return errors.New("cannot use nil as syscall to block")
}
if len(call.Name) == 0 {
return errors.New("empty string is not a valid syscall")
}
// Convert the call's action to the libseccomp equivalent
callAct, err := getAction(call.Action, call.ErrnoRet)
if err != nil {
return fmt.Errorf("action in seccomp profile is invalid: %w", err)
}
if callAct == defAct {
// This rule is redundant, silently skip it
// to avoid error from AddRule.
return nil
}
// If we can't resolve the syscall, assume it is not supported
// by this kernel. Warn about it, don't error out.
callNum, err := libseccomp.GetSyscallFromName(call.Name)
if err != nil {
logrus.Debugf("unknown seccomp syscall %q ignored", call.Name)
return nil
}
// Unconditional match - just add the rule
if len(call.Args) == 0 {
if err := filter.AddRule(callNum, callAct); err != nil {
return fmt.Errorf("error adding seccomp filter rule for syscall %s: %w", call.Name, err)
}
} else {
// If two or more arguments have the same condition,
// Revert to old behavior, adding each condition as a separate rule
argCounts := make([]uint, syscallMaxArguments)
conditions := []libseccomp.ScmpCondition{}
for _, cond := range call.Args {
newCond, err := getCondition(cond)
if err != nil {
return fmt.Errorf("error creating seccomp syscall condition for syscall %s: %w", call.Name, err)
}
argCounts[cond.Index] += 1
conditions = append(conditions, newCond)
}
hasMultipleArgs := false
for _, count := range argCounts {
if count > 1 {
hasMultipleArgs = true
break
}
}
if hasMultipleArgs {
// Revert to old behavior
// Add each condition attached to a separate rule
for _, cond := range conditions {
condArr := []libseccomp.ScmpCondition{cond}
if err := filter.AddRuleConditional(callNum, callAct, condArr); err != nil {
return fmt.Errorf("error adding seccomp rule for syscall %s: %w", call.Name, err)
}
}
} else {
// No conditions share same argument
// Use new, proper behavior
if err := filter.AddRuleConditional(callNum, callAct, conditions); err != nil {
return fmt.Errorf("error adding seccomp rule for syscall %s: %w", call.Name, err)
}
}
}
return nil
}
// Version returns major, minor, and micro.
func Version() (uint, uint, uint) {
return libseccomp.GetLibraryVersion()
}
// Enabled is true if seccomp support is compiled in.
const Enabled = true

View File

@@ -1,28 +0,0 @@
//go:build !linux || !cgo || !seccomp
// +build !linux !cgo !seccomp
package seccomp
import (
"errors"
"github.com/opencontainers/runc/libcontainer/configs"
)
var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
// InitSeccomp does nothing because seccomp is not supported.
func InitSeccomp(config *configs.Seccomp) (int, error) {
if config != nil {
return -1, ErrSeccompNotEnabled
}
return -1, nil
}
// Version returns major, minor, and micro.
func Version() (uint, uint, uint) {
return 0, 0, 0
}
// Enabled is true if seccomp support is compiled in.
const Enabled = false

View File

@@ -1,149 +0,0 @@
package libcontainer
import (
"errors"
"fmt"
"os"
"os/exec"
"strconv"
"github.com/opencontainers/selinux/go-selinux"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
)
// linuxSetnsInit performs the container's initialization for running a new process
// inside an existing container.
type linuxSetnsInit struct {
pipe *os.File
consoleSocket *os.File
config *initConfig
logFd int
}
func (l *linuxSetnsInit) getSessionRingName() string {
return "_ses." + l.config.ContainerId
}
func (l *linuxSetnsInit) Init() error {
if !l.config.Config.NoNewKeyring {
if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
return err
}
defer selinux.SetKeyLabel("") //nolint: errcheck
// Do not inherit the parent's session keyring.
if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil {
// Same justification as in standart_init_linux.go as to why we
// don't bail on ENOSYS.
//
// TODO(cyphar): And we should have logging here too.
if !errors.Is(err, unix.ENOSYS) {
return fmt.Errorf("unable to join session keyring: %w", err)
}
}
}
if l.config.CreateConsole {
if err := setupConsole(l.consoleSocket, l.config, false); err != nil {
return err
}
if err := system.Setctty(); err != nil {
return err
}
}
if l.config.NoNewPrivileges {
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
}
}
// Tell our parent that we're ready to exec. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return fmt.Errorf("sync ready: %w", err)
}
if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
return err
}
defer selinux.SetExecLabel("") //nolint: errcheck
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
if err != nil {
return err
}
if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
return err
}
}
if err := finalizeNamespace(l.config); err != nil {
return err
}
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err
}
// Check for the arg before waiting to make sure it exists and it is
// returned as a create time error.
name, err := exec.LookPath(l.config.Args[0])
if err != nil {
return err
}
// exec.LookPath in Go < 1.20 might return no error for an executable
// residing on a file system mounted with noexec flag, so perform this
// extra check now while we can still return a proper error.
// TODO: remove this once go < 1.20 is not supported.
if err := eaccess(name); err != nil {
return &os.PathError{Op: "eaccess", Path: name, Err: err}
}
// Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles).
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
if err != nil {
return fmt.Errorf("unable to init seccomp: %w", err)
}
if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
return err
}
}
logrus.Debugf("setns_init: about to exec")
// Close the log pipe fd so the parent's ForwardLogs can exit.
if err := unix.Close(l.logFd); err != nil {
return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
}
// Close all file descriptors we are not passing to the container. This is
// necessary because the execve target could use internal runc fds as the
// execve path, potentially giving access to binary files from the host
// (which can then be opened by container processes, leading to container
// escapes). Note that because this operation will close any open file
// descriptors that are referenced by (*os.File) handles from underneath
// the Go runtime, we must not do any file operations after this point
// (otherwise the (*os.File) finaliser could close the wrong file). See
// CVE-2024-21626 for more information as to why this protection is
// necessary.
//
// This is not needed for runc-dmz, because the extra execve(2) step means
// that all O_CLOEXEC file descriptors have already been closed and thus
// the second execve(2) from runc-dmz cannot access internal file
// descriptors from runc.
if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
return err
}
return system.Exec(name, l.config.Args[0:], os.Environ())
}

View File

@@ -1,282 +0,0 @@
package libcontainer
import (
"errors"
"fmt"
"os"
"os/exec"
"strconv"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
)
type linuxStandardInit struct {
pipe *os.File
consoleSocket *os.File
parentPid int
fifoFd int
logFd int
mountFds []int
config *initConfig
}
func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
var newperms uint32
if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
// With user ns we need 'other' search permissions.
newperms = 0x8
} else {
// Without user ns we need 'UID' search permissions.
newperms = 0x80000
}
// Create a unique per session container name that we can join in setns;
// However, other containers can also join it.
return "_ses." + l.config.ContainerId, 0xffffffff, newperms
}
func (l *linuxStandardInit) Init() error {
if !l.config.Config.NoNewKeyring {
if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
return err
}
defer selinux.SetKeyLabel("") //nolint: errcheck
ringname, keepperms, newperms := l.getSessionRingParams()
// Do not inherit the parent's session keyring.
if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {
// If keyrings aren't supported then it is likely we are on an
// older kernel (or inside an LXC container). While we could bail,
// the security feature we are using here is best-effort (it only
// really provides marginal protection since VFS credentials are
// the only significant protection of keyrings).
//
// TODO(cyphar): Log this so people know what's going on, once we
// have proper logging in 'runc init'.
if !errors.Is(err, unix.ENOSYS) {
return fmt.Errorf("unable to join session keyring: %w", err)
}
} else {
// Make session keyring searchable. If we've gotten this far we
// bail on any error -- we don't want to have a keyring with bad
// permissions.
if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
return fmt.Errorf("unable to mod keyring permissions: %w", err)
}
}
}
if err := setupNetwork(l.config); err != nil {
return err
}
if err := setupRoute(l.config.Config); err != nil {
return err
}
// initialises the labeling system
selinux.GetEnabled()
// We don't need the mountFds after prepareRootfs() nor if it fails.
err := prepareRootfs(l.pipe, l.config, l.mountFds)
for _, m := range l.mountFds {
if m == -1 {
continue
}
if err := unix.Close(m); err != nil {
return fmt.Errorf("Unable to close mountFds fds: %w", err)
}
}
if err != nil {
return err
}
// Set up the console. This has to be done *before* we finalize the rootfs,
// but *after* we've given the user the chance to set up all of the mounts
// they wanted.
if l.config.CreateConsole {
if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
return err
}
if err := system.Setctty(); err != nil {
return &os.SyscallError{Syscall: "ioctl(setctty)", Err: err}
}
}
// Finish the rootfs setup.
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := finalizeRootfs(l.config.Config); err != nil {
return err
}
}
if hostname := l.config.Config.Hostname; hostname != "" {
if err := unix.Sethostname([]byte(hostname)); err != nil {
return &os.SyscallError{Syscall: "sethostname", Err: err}
}
}
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return fmt.Errorf("unable to apply apparmor profile: %w", err)
}
for key, value := range l.config.Config.Sysctl {
if err := writeSystemProperty(key, value); err != nil {
return err
}
}
for _, path := range l.config.Config.ReadonlyPaths {
if err := readonlyPath(path); err != nil {
return fmt.Errorf("can't make %q read-only: %w", path, err)
}
}
for _, path := range l.config.Config.MaskPaths {
if err := maskPath(path, l.config.Config.MountLabel); err != nil {
return fmt.Errorf("can't mask path %s: %w", path, err)
}
}
pdeath, err := system.GetParentDeathSignal()
if err != nil {
return fmt.Errorf("can't get pdeath signal: %w", err)
}
if l.config.NoNewPrivileges {
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return &os.SyscallError{Syscall: "prctl(SET_NO_NEW_PRIVS)", Err: err}
}
}
// Tell our parent that we're ready to exec. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return fmt.Errorf("sync ready: %w", err)
}
if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
return fmt.Errorf("can't set process label: %w", err)
}
defer selinux.SetExecLabel("") //nolint: errcheck
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
if err != nil {
return err
}
if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
return err
}
}
if err := finalizeNamespace(l.config); err != nil {
return err
}
// finalizeNamespace can change user/group which clears the parent death
// signal, so we restore it here.
if err := pdeath.Restore(); err != nil {
return fmt.Errorf("can't restore pdeath signal: %w", err)
}
// Compare the parent from the initial start of the init process and make
// sure that it did not change. if the parent changes that means it died
// and we were reparented to something else so we should just kill ourself
// and not cause problems for someone else.
if unix.Getppid() != l.parentPid {
return unix.Kill(unix.Getpid(), unix.SIGKILL)
}
// Check for the arg before waiting to make sure it exists and it is
// returned as a create time error.
name, err := exec.LookPath(l.config.Args[0])
if err != nil {
return err
}
// exec.LookPath in Go < 1.20 might return no error for an executable
// residing on a file system mounted with noexec flag, so perform this
// extra check now while we can still return a proper error.
// TODO: remove this once go < 1.20 is not supported.
if err := eaccess(name); err != nil {
return &os.PathError{Op: "eaccess", Path: name, Err: err}
}
// Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles). However, this needs to be done
// before closing the pipe since we need it to pass the seccompFd to
// the parent.
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
if err != nil {
return fmt.Errorf("unable to init seccomp: %w", err)
}
if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
return err
}
}
// Close the pipe to signal that we have completed our init.
logrus.Debugf("init: closing the pipe to signal completion")
_ = l.pipe.Close()
// Close the log pipe fd so the parent's ForwardLogs can exit.
if err := unix.Close(l.logFd); err != nil {
return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
}
// Wait for the FIFO to be opened on the other side before exec-ing the
// user process. We open it through /proc/self/fd/$fd, because the fd that
// was given to us was an O_PATH fd to the fifo itself. Linux allows us to
// re-open an O_PATH fd through /proc.
fifoPath := "/proc/self/fd/" + strconv.Itoa(l.fifoFd)
fd, err := unix.Open(fifoPath, unix.O_WRONLY|unix.O_CLOEXEC, 0)
if err != nil {
return &os.PathError{Op: "open exec fifo", Path: fifoPath, Err: err}
}
if _, err := unix.Write(fd, []byte("0")); err != nil {
return &os.PathError{Op: "write exec fifo", Path: fifoPath, Err: err}
}
// Close the O_PATH fifofd fd before exec because the kernel resets
// dumpable in the wrong order. This has been fixed in newer kernels, but
// we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
// N.B. the core issue itself (passing dirfds to the host filesystem) has
// since been resolved.
// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
_ = unix.Close(l.fifoFd)
s := l.config.SpecState
s.Pid = unix.Getpid()
s.Status = specs.StateCreated
if err := l.config.Config.Hooks[configs.StartContainer].RunHooks(s); err != nil {
return err
}
// Close all file descriptors we are not passing to the container. This is
// necessary because the execve target could use internal runc fds as the
// execve path, potentially giving access to binary files from the host
// (which can then be opened by container processes, leading to container
// escapes). Note that because this operation will close any open file
// descriptors that are referenced by (*os.File) handles from underneath
// the Go runtime, we must not do any file operations after this point
// (otherwise the (*os.File) finaliser could close the wrong file). See
// CVE-2024-21626 for more information as to why this protection is
// necessary.
//
// This is not needed for runc-dmz, because the extra execve(2) step means
// that all O_CLOEXEC file descriptors have already been closed and thus
// the second execve(2) from runc-dmz cannot access internal file
// descriptors from runc.
if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
return err
}
return system.Exec(name, l.config.Args[0:], os.Environ())
}

View File

@@ -1,243 +0,0 @@
package libcontainer
import (
"fmt"
"os"
"path/filepath"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
func newStateTransitionError(from, to containerState) error {
return &stateTransitionError{
From: from.status().String(),
To: to.status().String(),
}
}
// stateTransitionError is returned when an invalid state transition happens from one
// state to another.
type stateTransitionError struct {
From string
To string
}
func (s *stateTransitionError) Error() string {
return fmt.Sprintf("invalid state transition from %s to %s", s.From, s.To)
}
type containerState interface {
transition(containerState) error
destroy() error
status() Status
}
func destroy(c *linuxContainer) error {
if !c.config.Namespaces.Contains(configs.NEWPID) ||
c.config.Namespaces.PathOf(configs.NEWPID) != "" {
if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil {
logrus.Warn(err)
}
}
err := c.cgroupManager.Destroy()
if c.intelRdtManager != nil {
if ierr := c.intelRdtManager.Destroy(); err == nil {
err = ierr
}
}
if rerr := os.RemoveAll(c.root); err == nil {
err = rerr
}
c.initProcess = nil
if herr := runPoststopHooks(c); err == nil {
err = herr
}
c.state = &stoppedState{c: c}
return err
}
func runPoststopHooks(c *linuxContainer) error {
hooks := c.config.Hooks
if hooks == nil {
return nil
}
s, err := c.currentOCIState()
if err != nil {
return err
}
s.Status = specs.StateStopped
if err := hooks[configs.Poststop].RunHooks(s); err != nil {
return err
}
return nil
}
// stoppedState represents a container is a stopped/destroyed state.
type stoppedState struct {
c *linuxContainer
}
func (b *stoppedState) status() Status {
return Stopped
}
func (b *stoppedState) transition(s containerState) error {
switch s.(type) {
case *runningState, *restoredState:
b.c.state = s
return nil
case *stoppedState:
return nil
}
return newStateTransitionError(b, s)
}
func (b *stoppedState) destroy() error {
return destroy(b.c)
}
// runningState represents a container that is currently running.
type runningState struct {
c *linuxContainer
}
func (r *runningState) status() Status {
return Running
}
func (r *runningState) transition(s containerState) error {
switch s.(type) {
case *stoppedState:
if r.c.runType() == Running {
return ErrRunning
}
r.c.state = s
return nil
case *pausedState:
r.c.state = s
return nil
case *runningState:
return nil
}
return newStateTransitionError(r, s)
}
func (r *runningState) destroy() error {
if r.c.runType() == Running {
return ErrRunning
}
return destroy(r.c)
}
type createdState struct {
c *linuxContainer
}
func (i *createdState) status() Status {
return Created
}
func (i *createdState) transition(s containerState) error {
switch s.(type) {
case *runningState, *pausedState, *stoppedState:
i.c.state = s
return nil
case *createdState:
return nil
}
return newStateTransitionError(i, s)
}
func (i *createdState) destroy() error {
_ = i.c.initProcess.signal(unix.SIGKILL)
return destroy(i.c)
}
// pausedState represents a container that is currently pause. It cannot be destroyed in a
// paused state and must transition back to running first.
type pausedState struct {
c *linuxContainer
}
func (p *pausedState) status() Status {
return Paused
}
func (p *pausedState) transition(s containerState) error {
switch s.(type) {
case *runningState, *stoppedState:
p.c.state = s
return nil
case *pausedState:
return nil
}
return newStateTransitionError(p, s)
}
func (p *pausedState) destroy() error {
t := p.c.runType()
if t != Running && t != Created {
if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
return err
}
return destroy(p.c)
}
return ErrPaused
}
// restoredState is the same as the running state but also has associated checkpoint
// information that maybe need destroyed when the container is stopped and destroy is called.
type restoredState struct {
imageDir string
c *linuxContainer
}
func (r *restoredState) status() Status {
return Running
}
func (r *restoredState) transition(s containerState) error {
switch s.(type) {
case *stoppedState, *runningState:
return nil
}
return newStateTransitionError(r, s)
}
func (r *restoredState) destroy() error {
if _, err := os.Stat(filepath.Join(r.c.root, "checkpoint")); err != nil {
if !os.IsNotExist(err) {
return err
}
}
return destroy(r.c)
}
// loadedState is used whenever a container is restored, loaded, or setting additional
// processes inside and it should not be destroyed when it is exiting.
type loadedState struct {
c *linuxContainer
s Status
}
func (n *loadedState) status() Status {
return n.s
}
func (n *loadedState) transition(s containerState) error {
n.c.state = s
return nil
}
func (n *loadedState) destroy() error {
if err := n.c.refreshState(); err != nil {
return err
}
return n.c.state.destroy()
}

View File

@@ -1,13 +0,0 @@
package libcontainer
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/types"
)
type Stats struct {
Interfaces []*types.NetworkInterface
CgroupStats *cgroups.Stats
IntelRdtStats *intelrdt.Stats
}

View File

@@ -1,126 +0,0 @@
package libcontainer
import (
"encoding/json"
"errors"
"fmt"
"io"
"github.com/opencontainers/runc/libcontainer/utils"
)
type syncType string
// Constants that are used for synchronisation between the parent and child
// during container setup. They come in pairs (with procError being a generic
// response which is followed by an &initError).
//
// [ child ] <-> [ parent ]
//
// procHooks --> [run hooks]
// <-- procResume
//
// procReady --> [final setup]
// <-- procRun
//
// procSeccomp --> [pick up seccomp fd with pidfd_getfd()]
// <-- procSeccompDone
const (
procError syncType = "procError"
procReady syncType = "procReady"
procRun syncType = "procRun"
procHooks syncType = "procHooks"
procResume syncType = "procResume"
procSeccomp syncType = "procSeccomp"
procSeccompDone syncType = "procSeccompDone"
)
type syncT struct {
Type syncType `json:"type"`
Fd int `json:"fd"`
}
// initError is used to wrap errors for passing them via JSON,
// as encoding/json can't unmarshal into error type.
type initError struct {
Message string `json:"message,omitempty"`
}
func (i initError) Error() string {
return i.Message
}
// writeSync is used to write to a synchronisation pipe. An error is returned
// if there was a problem writing the payload.
func writeSync(pipe io.Writer, sync syncType) error {
return writeSyncWithFd(pipe, sync, -1)
}
// writeSyncWithFd is used to write to a synchronisation pipe. An error is
// returned if there was a problem writing the payload.
func writeSyncWithFd(pipe io.Writer, sync syncType, fd int) error {
if err := utils.WriteJSON(pipe, syncT{sync, fd}); err != nil {
return fmt.Errorf("writing syncT %q: %w", string(sync), err)
}
return nil
}
// readSync is used to read from a synchronisation pipe. An error is returned
// if we got an initError, the pipe was closed, or we got an unexpected flag.
func readSync(pipe io.Reader, expected syncType) error {
var procSync syncT
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
if errors.Is(err, io.EOF) {
return errors.New("parent closed synchronisation channel")
}
return fmt.Errorf("failed reading error from parent: %w", err)
}
if procSync.Type == procError {
var ierr initError
if err := json.NewDecoder(pipe).Decode(&ierr); err != nil {
return fmt.Errorf("failed reading error from parent: %w", err)
}
return &ierr
}
if procSync.Type != expected {
return errors.New("invalid synchronisation flag from parent")
}
return nil
}
// parseSync runs the given callback function on each syncT received from the
// child. It will return once io.EOF is returned from the given pipe.
func parseSync(pipe io.Reader, fn func(*syncT) error) error {
dec := json.NewDecoder(pipe)
for {
var sync syncT
if err := dec.Decode(&sync); err != nil {
if errors.Is(err, io.EOF) {
break
}
return err
}
// We handle this case outside fn for cleanliness reasons.
var ierr *initError
if sync.Type == procError {
if err := dec.Decode(&ierr); err != nil && !errors.Is(err, io.EOF) {
return fmt.Errorf("error decoding proc error from init: %w", err)
}
if ierr != nil {
return ierr
}
// Programmer error.
panic("No error following JSON procError payload.")
}
if err := fn(&sync); err != nil {
return err
}
}
return nil
}

View File

@@ -1,155 +0,0 @@
package types
import "github.com/opencontainers/runc/libcontainer/intelrdt"
// Event struct for encoding the event data to json.
type Event struct {
Type string `json:"type"`
ID string `json:"id"`
Data interface{} `json:"data,omitempty"`
}
// stats is the runc specific stats structure for stability when encoding and decoding stats.
type Stats struct {
CPU Cpu `json:"cpu"`
CPUSet CPUSet `json:"cpuset"`
Memory Memory `json:"memory"`
Pids Pids `json:"pids"`
Blkio Blkio `json:"blkio"`
Hugetlb map[string]Hugetlb `json:"hugetlb"`
IntelRdt IntelRdt `json:"intel_rdt"`
NetworkInterfaces []*NetworkInterface `json:"network_interfaces"`
}
type Hugetlb struct {
Usage uint64 `json:"usage,omitempty"`
Max uint64 `json:"max,omitempty"`
Failcnt uint64 `json:"failcnt"`
}
type BlkioEntry struct {
Major uint64 `json:"major,omitempty"`
Minor uint64 `json:"minor,omitempty"`
Op string `json:"op,omitempty"`
Value uint64 `json:"value,omitempty"`
}
type Blkio struct {
IoServiceBytesRecursive []BlkioEntry `json:"ioServiceBytesRecursive,omitempty"`
IoServicedRecursive []BlkioEntry `json:"ioServicedRecursive,omitempty"`
IoQueuedRecursive []BlkioEntry `json:"ioQueueRecursive,omitempty"`
IoServiceTimeRecursive []BlkioEntry `json:"ioServiceTimeRecursive,omitempty"`
IoWaitTimeRecursive []BlkioEntry `json:"ioWaitTimeRecursive,omitempty"`
IoMergedRecursive []BlkioEntry `json:"ioMergedRecursive,omitempty"`
IoTimeRecursive []BlkioEntry `json:"ioTimeRecursive,omitempty"`
SectorsRecursive []BlkioEntry `json:"sectorsRecursive,omitempty"`
}
type Pids struct {
Current uint64 `json:"current,omitempty"`
Limit uint64 `json:"limit,omitempty"`
}
type Throttling struct {
Periods uint64 `json:"periods,omitempty"`
ThrottledPeriods uint64 `json:"throttledPeriods,omitempty"`
ThrottledTime uint64 `json:"throttledTime,omitempty"`
}
type CpuUsage struct {
// Units: nanoseconds.
Total uint64 `json:"total,omitempty"`
Percpu []uint64 `json:"percpu,omitempty"`
PercpuKernel []uint64 `json:"percpu_kernel,omitempty"`
PercpuUser []uint64 `json:"percpu_user,omitempty"`
Kernel uint64 `json:"kernel"`
User uint64 `json:"user"`
}
type Cpu struct {
Usage CpuUsage `json:"usage,omitempty"`
Throttling Throttling `json:"throttling,omitempty"`
}
type CPUSet struct {
CPUs []uint16 `json:"cpus,omitempty"`
CPUExclusive uint64 `json:"cpu_exclusive"`
Mems []uint16 `json:"mems,omitempty"`
MemHardwall uint64 `json:"mem_hardwall"`
MemExclusive uint64 `json:"mem_exclusive"`
MemoryMigrate uint64 `json:"memory_migrate"`
MemorySpreadPage uint64 `json:"memory_spread_page"`
MemorySpreadSlab uint64 `json:"memory_spread_slab"`
MemoryPressure uint64 `json:"memory_pressure"`
SchedLoadBalance uint64 `json:"sched_load_balance"`
SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"`
}
type MemoryEntry struct {
Limit uint64 `json:"limit"`
Usage uint64 `json:"usage,omitempty"`
Max uint64 `json:"max,omitempty"`
Failcnt uint64 `json:"failcnt"`
}
type Memory struct {
Cache uint64 `json:"cache,omitempty"`
Usage MemoryEntry `json:"usage,omitempty"`
Swap MemoryEntry `json:"swap,omitempty"`
Kernel MemoryEntry `json:"kernel,omitempty"`
KernelTCP MemoryEntry `json:"kernelTCP,omitempty"`
Raw map[string]uint64 `json:"raw,omitempty"`
}
type L3CacheInfo struct {
CbmMask string `json:"cbm_mask,omitempty"`
MinCbmBits uint64 `json:"min_cbm_bits,omitempty"`
NumClosids uint64 `json:"num_closids,omitempty"`
}
type MemBwInfo struct {
BandwidthGran uint64 `json:"bandwidth_gran,omitempty"`
DelayLinear uint64 `json:"delay_linear,omitempty"`
MinBandwidth uint64 `json:"min_bandwidth,omitempty"`
NumClosids uint64 `json:"num_closids,omitempty"`
}
type IntelRdt struct {
// The read-only L3 cache information
L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"`
// The read-only L3 cache schema in root
L3CacheSchemaRoot string `json:"l3_cache_schema_root,omitempty"`
// The L3 cache schema in 'container_id' group
L3CacheSchema string `json:"l3_cache_schema,omitempty"`
// The read-only memory bandwidth information
MemBwInfo *MemBwInfo `json:"mem_bw_info,omitempty"`
// The read-only memory bandwidth schema in root
MemBwSchemaRoot string `json:"mem_bw_schema_root,omitempty"`
// The memory bandwidth schema in 'container_id' group
MemBwSchema string `json:"mem_bw_schema,omitempty"`
// The memory bandwidth monitoring statistics from NUMA nodes in 'container_id' group
MBMStats *[]intelrdt.MBMNumaNodeStats `json:"mbm_stats,omitempty"`
// The cache monitoring technology statistics from NUMA nodes in 'container_id' group
CMTStats *[]intelrdt.CMTNumaNodeStats `json:"cmt_stats,omitempty"`
}
type NetworkInterface struct {
// Name is the name of the network interface.
Name string
RxBytes uint64
RxPackets uint64
RxErrors uint64
RxDropped uint64
TxBytes uint64
TxPackets uint64
TxErrors uint64
TxDropped uint64
}