Mirror of https://github.com/optim-enterprises-bv/kubernetes.git (synced 2025-12-11 18:45:36 +00:00)
Update to latest cadvisor
Signed-off-by: Davanum Srinivas <davanum@gmail.com>
318 vendor/github.com/opencontainers/runc/libcontainer/README.md (generated, vendored)
@@ -1,318 +0,0 @@
# libcontainer

[Go Reference](https://pkg.go.dev/github.com/opencontainers/runc/libcontainer)

Libcontainer provides a native Go implementation for creating containers
with namespaces, cgroups, capabilities, and filesystem access controls.
It allows you to manage the lifecycle of the container, performing additional operations
after the container is created.

#### Container

A container is a self-contained execution environment that shares the kernel of the
host system and which is (optionally) isolated from other containers in the system.

#### Using libcontainer

Because containers are spawned in a two-step process, you will need a binary that
will be executed as the init process for the container. In libcontainer, we use
the current binary (/proc/self/exe) as the init process, invoked with the
argument "init". We call the first step "bootstrap", so you always need an "init"
function as the entry point of the bootstrap.

In addition to the Go init function, the early-stage bootstrap is handled by importing
[nsenter](https://github.com/opencontainers/runc/blob/master/libcontainer/nsenter/README.md).

```go
import (
    _ "github.com/opencontainers/runc/libcontainer/nsenter"
)

func init() {
    if len(os.Args) > 1 && os.Args[1] == "init" {
        runtime.GOMAXPROCS(1)
        runtime.LockOSThread()
        factory, _ := libcontainer.New("")
        if err := factory.StartInitialization(); err != nil {
            logrus.Fatal(err)
        }
        panic("--this line should have never been executed, congratulations--")
    }
}
```

Then to create a container you first have to initialize an instance of a factory
that will handle the creation and initialization for a container.

```go
factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init"))
if err != nil {
    logrus.Fatal(err)
    return
}
```

Once you have an instance of the factory, you can create a configuration
struct describing how the container is to be created. A sample would look similar to this:

```go
defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
var devices []*configs.DeviceRule
for _, device := range specconv.AllowedDevices {
    devices = append(devices, &device.Rule)
}
config := &configs.Config{
    Rootfs: "/your/path/to/rootfs",
    Capabilities: &configs.Capabilities{
        Bounding: []string{
            "CAP_CHOWN",
            "CAP_DAC_OVERRIDE",
            "CAP_FSETID",
            "CAP_FOWNER",
            "CAP_MKNOD",
            "CAP_NET_RAW",
            "CAP_SETGID",
            "CAP_SETUID",
            "CAP_SETFCAP",
            "CAP_SETPCAP",
            "CAP_NET_BIND_SERVICE",
            "CAP_SYS_CHROOT",
            "CAP_KILL",
            "CAP_AUDIT_WRITE",
        },
        Effective: []string{
            "CAP_CHOWN",
            "CAP_DAC_OVERRIDE",
            "CAP_FSETID",
            "CAP_FOWNER",
            "CAP_MKNOD",
            "CAP_NET_RAW",
            "CAP_SETGID",
            "CAP_SETUID",
            "CAP_SETFCAP",
            "CAP_SETPCAP",
            "CAP_NET_BIND_SERVICE",
            "CAP_SYS_CHROOT",
            "CAP_KILL",
            "CAP_AUDIT_WRITE",
        },
        Permitted: []string{
            "CAP_CHOWN",
            "CAP_DAC_OVERRIDE",
            "CAP_FSETID",
            "CAP_FOWNER",
            "CAP_MKNOD",
            "CAP_NET_RAW",
            "CAP_SETGID",
            "CAP_SETUID",
            "CAP_SETFCAP",
            "CAP_SETPCAP",
            "CAP_NET_BIND_SERVICE",
            "CAP_SYS_CHROOT",
            "CAP_KILL",
            "CAP_AUDIT_WRITE",
        },
        Ambient: []string{
            "CAP_CHOWN",
            "CAP_DAC_OVERRIDE",
            "CAP_FSETID",
            "CAP_FOWNER",
            "CAP_MKNOD",
            "CAP_NET_RAW",
            "CAP_SETGID",
            "CAP_SETUID",
            "CAP_SETFCAP",
            "CAP_SETPCAP",
            "CAP_NET_BIND_SERVICE",
            "CAP_SYS_CHROOT",
            "CAP_KILL",
            "CAP_AUDIT_WRITE",
        },
    },
    Namespaces: configs.Namespaces([]configs.Namespace{
        {Type: configs.NEWNS},
        {Type: configs.NEWUTS},
        {Type: configs.NEWIPC},
        {Type: configs.NEWPID},
        {Type: configs.NEWUSER},
        {Type: configs.NEWNET},
        {Type: configs.NEWCGROUP},
    }),
    Cgroups: &configs.Cgroup{
        Name:   "test-container",
        Parent: "system",
        Resources: &configs.Resources{
            MemorySwappiness: nil,
            Devices:          devices,
        },
    },
    MaskPaths: []string{
        "/proc/kcore",
        "/sys/firmware",
    },
    ReadonlyPaths: []string{
        "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
    },
    Devices:  specconv.AllowedDevices,
    Hostname: "testing",
    Mounts: []*configs.Mount{
        {
            Source:      "proc",
            Destination: "/proc",
            Device:      "proc",
            Flags:       defaultMountFlags,
        },
        {
            Source:      "tmpfs",
            Destination: "/dev",
            Device:      "tmpfs",
            Flags:       unix.MS_NOSUID | unix.MS_STRICTATIME,
            Data:        "mode=755",
        },
        {
            Source:      "devpts",
            Destination: "/dev/pts",
            Device:      "devpts",
            Flags:       unix.MS_NOSUID | unix.MS_NOEXEC,
            Data:        "newinstance,ptmxmode=0666,mode=0620,gid=5",
        },
        {
            Device:      "tmpfs",
            Source:      "shm",
            Destination: "/dev/shm",
            Data:        "mode=1777,size=65536k",
            Flags:       defaultMountFlags,
        },
        {
            Source:      "mqueue",
            Destination: "/dev/mqueue",
            Device:      "mqueue",
            Flags:       defaultMountFlags,
        },
        {
            Source:      "sysfs",
            Destination: "/sys",
            Device:      "sysfs",
            Flags:       defaultMountFlags | unix.MS_RDONLY,
        },
    },
    UidMappings: []configs.IDMap{
        {
            ContainerID: 0,
            HostID:      1000,
            Size:        65536,
        },
    },
    GidMappings: []configs.IDMap{
        {
            ContainerID: 0,
            HostID:      1000,
            Size:        65536,
        },
    },
    Networks: []*configs.Network{
        {
            Type:    "loopback",
            Address: "127.0.0.1/0",
            Gateway: "localhost",
        },
    },
    Rlimits: []configs.Rlimit{
        {
            Type: unix.RLIMIT_NOFILE,
            Hard: uint64(1025),
            Soft: uint64(1025),
        },
    },
}
```

Once you have the configuration populated you can create a container:

```go
container, err := factory.Create("container-id", config)
if err != nil {
    logrus.Fatal(err)
    return
}
```

To spawn bash as the initial process inside the container and have the
process's pid returned in order to wait on, signal, or kill the process:

```go
process := &libcontainer.Process{
    Args:   []string{"/bin/bash"},
    Env:    []string{"PATH=/bin"},
    User:   "daemon",
    Stdin:  os.Stdin,
    Stdout: os.Stdout,
    Stderr: os.Stderr,
    Init:   true,
}

err := container.Run(process)
if err != nil {
    container.Destroy()
    logrus.Fatal(err)
    return
}

// wait for the process to finish.
_, err = process.Wait()
if err != nil {
    logrus.Fatal(err)
}

// destroy the container.
container.Destroy()
```

Additional ways to interact with a running container are:

```go
// return all the pids for all processes running inside the container.
processes, err := container.Processes()

// get detailed cpu, memory, io, and network statistics for the container and
// its processes.
stats, err := container.Stats()

// pause all processes inside the container.
container.Pause()

// resume all paused processes.
container.Resume()

// send signal to container's init process.
container.Signal(signal, false)

// update container resource constraints.
container.Set(config)

// get current status of the container.
status, err := container.Status()

// get current container's state information.
state, err := container.State()
```


#### Checkpoint & Restore

libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers.
This lets you save the state of a process running inside a container to disk, and then restore
that state into a new process, on the same machine or on another machine.

`criu` version 1.5.2 or higher is required to use checkpoint and restore.
If you don't already have `criu` installed, you can build it from source, following the
[online instructions](http://criu.org/Installation). `criu` is also installed in the docker image
generated when building libcontainer with docker.

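Checkpoint and restore are not demonstrated in this README; as a rough illustration only, the sketch below uses the `Checkpoint`/`Restore` methods of the Linux container object together with the `CriuOpts` struct defined in `criu_opts_linux.go` (field names are taken from that file; the directory paths are made up):

```go
// Checkpoint the running container to disk, then restore it later (possibly
// on another machine that has the same rootfs available).
opts := &libcontainer.CriuOpts{
    ImagesDirectory: "/var/lib/container/checkpoint", // where CRIU image files go
    WorkDirectory:   "/var/lib/container/criu-work",  // CRIU logs and pidfiles
    LeaveRunning:    false,                           // stop the container after the dump
}
if err := container.Checkpoint(opts); err != nil {
    logrus.Fatal(err)
}

process := &libcontainer.Process{Init: true}
if err := container.Restore(process, opts); err != nil {
    logrus.Fatal(err)
}
```
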
## Copyright and license

Code and documentation copyright 2014 Docker, Inc.
The code and documentation are released under the [Apache 2.0 license](../LICENSE).
The documentation is also released under the Creative Commons Attribution 4.0 International License.
You may obtain a copy of the license, titled CC-BY-4.0, at http://creativecommons.org/licenses/by/4.0/.
465 vendor/github.com/opencontainers/runc/libcontainer/SPEC.md (generated, vendored)
@@ -1,465 +0,0 @@
## Container Specification - v1

This is the standard configuration for version 1 containers. It includes
namespaces, standard filesystem setup, a default Linux capability set, and
information about resource reservations. It also has information about any
populated environment settings for the processes running inside a container.

Along with the configuration of how a container is created, the standard also
discusses actions that can be performed on a container to manage and inspect
information about the processes running inside.

The v1 profile is meant to be able to accommodate the majority of applications
with a strong security configuration.

### System Requirements and Compatibility

Minimum requirements:
* Kernel version - 3.10 recommended; 2.6.2x minimum (with backported patches)
* Mounted cgroups with each subsystem in its own hierarchy


### Namespaces

| Flag            | Enabled |
| --------------- | ------- |
| CLONE_NEWPID    | 1       |
| CLONE_NEWUTS    | 1       |
| CLONE_NEWIPC    | 1       |
| CLONE_NEWNET    | 1       |
| CLONE_NEWNS     | 1       |
| CLONE_NEWUSER   | 1       |
| CLONE_NEWCGROUP | 1       |

Namespaces are created for the container via the `unshare` syscall (see the sketch below).

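As a rough illustration (not part of this spec), the following Go sketch unshares the same set of namespaces for the calling process; the flag constants come from `golang.org/x/sys/unix`:

```go
package main

import (
    "log"

    "golang.org/x/sys/unix"
)

func main() {
    // Unshare the namespace set listed in the table above. CLONE_NEWPID only
    // takes effect for children of this process, and an unprivileged caller
    // normally needs CLONE_NEWUSER in the same call for the rest to succeed.
    flags := unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC |
        unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWUSER |
        unix.CLONE_NEWCGROUP
    if err := unix.Unshare(flags); err != nil {
        log.Fatalf("unshare: %v", err)
    }
}
```
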
### Filesystem

A root filesystem must be provided to a container for execution. The container
will use this root filesystem (rootfs) to jail and spawn processes inside, where
the binaries and system libraries are local to that directory. Any binaries
to be executed must be contained within this rootfs.

Mounts that happen inside the container are automatically cleaned up when the
container exits, as the mount namespace is destroyed and the kernel will
unmount all the mounts that were set up within that namespace.

For a container to execute properly there are certain filesystems that
are required to be mounted within the rootfs that the runtime will set up.

| Path        | Type   | Flags                                  | Data                                     |
| ----------- | ------ | -------------------------------------- | ---------------------------------------- |
| /proc       | proc   | MS_NOEXEC,MS_NOSUID,MS_NODEV           |                                          |
| /dev        | tmpfs  | MS_NOEXEC,MS_STRICTATIME               | mode=755                                 |
| /dev/shm    | tmpfs  | MS_NOEXEC,MS_NOSUID,MS_NODEV           | mode=1777,size=65536k                    |
| /dev/mqueue | mqueue | MS_NOEXEC,MS_NOSUID,MS_NODEV           |                                          |
| /dev/pts    | devpts | MS_NOEXEC,MS_NOSUID                    | newinstance,ptmxmode=0666,mode=620,gid=5 |
| /sys        | sysfs  | MS_NOEXEC,MS_NOSUID,MS_NODEV,MS_RDONLY |                                          |


After a container's filesystems are mounted within the newly created
mount namespace, `/dev` will need to be populated with a set of device nodes.
It is expected that a rootfs does not need to have any device nodes specified
for `/dev` within the rootfs, as the container will set up the correct devices
that are required for executing a container's process.

| Path         | Mode | Access |
| ------------ | ---- | ------ |
| /dev/null    | 0666 | rwm    |
| /dev/zero    | 0666 | rwm    |
| /dev/full    | 0666 | rwm    |
| /dev/tty     | 0666 | rwm    |
| /dev/random  | 0666 | rwm    |
| /dev/urandom | 0666 | rwm    |


**ptmx**
`/dev/ptmx` will need to be a symlink to the host's `/dev/ptmx` within
the container.

The use of a pseudo TTY is optional within a container and the runtime should support both cases.
If a pseudo TTY is provided to the container, `/dev/console` will need to be
set up by binding the console into `/dev/` after it has been populated and mounted
in tmpfs.

| Source          | Destination  | UID GID | Mode | Type |
| --------------- | ------------ | ------- | ---- | ---- |
| *pty host path* | /dev/console | 0 0     | 0600 | bind |


After `/dev/null` has been set up, we check for any external links between
the container's io, STDIN, STDOUT, STDERR. If the container's io is pointing
to `/dev/null` outside the container, we close and `dup2` the `/dev/null`
that is local to the container's rootfs.


After the container has `/proc` mounted, a few standard symlinks are set up
within `/dev/` for the io.

| Source          | Destination |
| --------------- | ----------- |
| /proc/self/fd   | /dev/fd     |
| /proc/self/fd/0 | /dev/stdin  |
| /proc/self/fd/1 | /dev/stdout |
| /proc/self/fd/2 | /dev/stderr |

A `pivot_root` is used to change the root for the process, effectively
jailing the process inside the rootfs.

```c
put_old = mkdir(...);
pivot_root(rootfs, put_old);
chdir("/");
unmount(put_old, MS_DETACH);
rmdir(put_old);
```

For containers running with a rootfs inside `ramfs`, an `MS_MOVE` combined
with a `chroot` is required, as `pivot_root` is not supported in `ramfs`.

```c
mount(rootfs, "/", NULL, MS_MOVE, NULL);
chroot(".");
chdir("/");
```

The `umask` is set back to `0022` after the filesystem setup has been completed.

### Resources

Cgroups are used to handle resource allocation for containers. This includes
system resources like cpu, memory, and device access.

| Subsystem  | Enabled |
| ---------- | ------- |
| devices    | 1       |
| memory     | 1       |
| cpu        | 1       |
| cpuacct    | 1       |
| cpuset     | 1       |
| blkio      | 1       |
| perf_event | 1       |
| freezer    | 1       |
| hugetlb    | 1       |
| pids       | 1       |


All cgroup subsystems are joined so that statistics can be collected from
each of the subsystems. Freezer does not expose any stats but is joined
so that containers can be paused and resumed.

The parent process of the container's init must place the init pid inside
the correct cgroups before the initialization begins. This is done so
that no processes or threads escape the cgroups. This synchronization is
done via a pipe (specified in the runtime section below) on which the container's
init process blocks until the parent finishes setup; a rough sketch follows.

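The following Go fragment is an illustrative sketch of that handshake, not the actual runc code; the cgroup path is made up and the assumed imports are `os`, `os/exec`, `strconv`, and `log`:

```go
// Parent side: create the sync pipe, start the bootstrap binary, move the
// child into its cgroup, and only then release it.
r, w, err := os.Pipe()
if err != nil {
    log.Fatal(err)
}
cmd := exec.Command("/proc/self/exe", "init")
cmd.ExtraFiles = []*os.File{r} // the child sees the read end as fd 3
if err := cmd.Start(); err != nil {
    log.Fatal(err)
}
// Write the init pid into the cgroup *before* letting it continue, so no
// process or thread it creates can escape the cgroup.
pid := []byte(strconv.Itoa(cmd.Process.Pid))
if err := os.WriteFile("/sys/fs/cgroup/mycontainer/cgroup.procs", pid, 0o644); err != nil {
    log.Fatal(err)
}
w.Close() // the child's blocking read on fd 3 now returns and init proceeds
```
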
### IntelRdt

Newer Intel Xeon platforms support Resource Director Technology (RDT).
Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are
two sub-features of RDT.

Cache Allocation Technology (CAT) provides a way for the software to restrict
cache allocation to a defined 'subset' of L3 cache which may be overlapping
with other 'subsets'. The different subsets are identified by class of
service (CLOS) and each CLOS has a capacity bitmask (CBM).

Memory Bandwidth Allocation (MBA) provides indirect and approximate throttling
of memory bandwidth for the software. A user controls the resource by
indicating the percentage of maximum memory bandwidth, or a memory bandwidth limit
in MBps units if the MBA Software Controller is enabled.

It can be used to handle L3 cache and memory bandwidth resource allocation
for containers if the hardware and kernel support the Intel RDT CAT and MBA features.

In Linux kernel 4.10 or newer, the interface is defined and exposed via the
"resource control" filesystem, which is a "cgroup-like" interface.

Compared with cgroups, it has a similar process management lifecycle and
interfaces in a container, but unlike the cgroups hierarchy, it has a single-level
filesystem layout.

The CAT and MBA features were introduced in Linux kernels 4.10 and 4.12, respectively,
via the "resource control" filesystem.

Intel RDT "resource control" filesystem hierarchy:
```
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
|   |-- L3
|   |   |-- cbm_mask
|   |   |-- min_cbm_bits
|   |   |-- num_closids
|   |-- MB
|       |-- bandwidth_gran
|       |-- delay_linear
|       |-- min_bandwidth
|       |-- num_closids
|-- ...
|-- schemata
|-- tasks
|-- <container_id>
    |-- ...
    |-- schemata
    |-- tasks
```

For runc, we can make use of the `tasks` and `schemata` configuration for L3
cache and memory bandwidth resource constraints.

The file `tasks` has a list of tasks that belong to this group (e.g., the
"<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent.

The file `schemata` has a list of all the resources available to this group.
Each resource (L3 cache, memory bandwidth) has its own line and format.

L3 cache schema:
It has allocation bitmasks/values for the L3 cache on each socket, which
contain the L3 cache id and capacity bitmask (CBM).
```
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
```
For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0",
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.

A valid L3 cache CBM is a *contiguous bit set*, and the number of bits that can
be set is bounded by the maximum CBM width. The maximum number of bits in the
CBM varies among supported Intel CPU models, and the kernel checks validity when
writing. For example, the default value 0xfffff in the root group indicates that
the CBM is 20 bits wide, which maps to the entire L3 cache capacity. Some valid
CBM values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00, etc.

Memory bandwidth schema:
It has allocation values for memory bandwidth on each socket, which contain
the L3 cache id and memory bandwidth.
```
Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
```
For example, on a two-socket machine, the schema line could be "MB:0=20;1=70".

The minimum bandwidth percentage value for each CPU model is predefined and
can be looked up through "info/MB/min_bandwidth". The bandwidth granularity
that is allocated is also dependent on the CPU model and can be looked up at
"info/MB/bandwidth_gran". The available bandwidth control steps are:
min_bw + N * bw_gran. Intermediate values are rounded to the next control
step available on the hardware.

If the MBA Software Controller is enabled through the mount option "-o mba_MBps":
```
mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl
```
we can specify memory bandwidth in "MBps" (megabytes per second) units
instead of percentages. The kernel underneath uses a software feedback
mechanism, or "Software Controller", which reads the actual bandwidth using
MBM counters and adjusts the memory bandwidth percentages to ensure
"actual memory bandwidth < user-specified memory bandwidth".

For example, on a two-socket machine, the schema line could be
"MB:0=5000;1=7000", which means a 5000 MBps memory bandwidth limit on socket 0
and a 7000 MBps memory bandwidth limit on socket 1.

For more information about the Intel RDT kernel interface, see
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt

```
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10%
with a memory bandwidth granularity of 10%.

Tasks inside the container only have access to the "upper" 7/11 of L3 cache
on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a
maximum memory bandwidth of 20% on socket 0 and 70% on socket 1.

"linux": {
    "intelRdt": {
        "closID": "guaranteed_group",
        "l3CacheSchema": "L3:0=7f0;1=1f",
        "memBwSchema": "MB:0=20;1=70"
    }
}
```

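As a concrete illustration of the `schemata`/`tasks` mechanism described above, here is a minimal Go sketch that applies the example values by hand. It is illustrative only: it assumes `/sys/fs/resctrl` is mounted, the group directory already exists, and `initPid` holds the container init's PID; assumed imports are `os`, `strconv`, and `log`.

```go
group := "/sys/fs/resctrl/guaranteed_group"

// One line per resource, in the same format the spec text above describes.
schemata := "L3:0=7f0;1=1f\nMB:0=20;1=70\n"
if err := os.WriteFile(group+"/schemata", []byte(schemata), 0o644); err != nil {
    log.Fatal(err)
}

// Writing a task ID to "tasks" moves it into this group (and out of its
// previous one); children it forks afterwards inherit the group.
if err := os.WriteFile(group+"/tasks", []byte(strconv.Itoa(initPid)), 0o644); err != nil {
    log.Fatal(err)
}
```
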
### Security

The standard set of Linux capabilities that are set in a container
provides a good default for security and flexibility for the applications.


| Capability           | Enabled |
| -------------------- | ------- |
| CAP_NET_RAW          | 1       |
| CAP_NET_BIND_SERVICE | 1       |
| CAP_AUDIT_READ       | 1       |
| CAP_AUDIT_WRITE      | 1       |
| CAP_DAC_OVERRIDE     | 1       |
| CAP_SETFCAP          | 1       |
| CAP_SETPCAP          | 1       |
| CAP_SETGID           | 1       |
| CAP_SETUID           | 1       |
| CAP_MKNOD            | 1       |
| CAP_CHOWN            | 1       |
| CAP_FOWNER           | 1       |
| CAP_FSETID           | 1       |
| CAP_KILL             | 1       |
| CAP_SYS_CHROOT       | 1       |
| CAP_NET_BROADCAST    | 0       |
| CAP_SYS_MODULE       | 0       |
| CAP_SYS_RAWIO        | 0       |
| CAP_SYS_PACCT        | 0       |
| CAP_SYS_ADMIN        | 0       |
| CAP_SYS_NICE         | 0       |
| CAP_SYS_RESOURCE     | 0       |
| CAP_SYS_TIME         | 0       |
| CAP_SYS_TTY_CONFIG   | 0       |
| CAP_AUDIT_CONTROL    | 0       |
| CAP_MAC_OVERRIDE     | 0       |
| CAP_MAC_ADMIN        | 0       |
| CAP_NET_ADMIN        | 0       |
| CAP_SYSLOG           | 0       |
| CAP_DAC_READ_SEARCH  | 0       |
| CAP_LINUX_IMMUTABLE  | 0       |
| CAP_IPC_LOCK         | 0       |
| CAP_IPC_OWNER        | 0       |
| CAP_SYS_PTRACE       | 0       |
| CAP_SYS_BOOT         | 0       |
| CAP_LEASE            | 0       |
| CAP_WAKE_ALARM       | 0       |
| CAP_BLOCK_SUSPEND    | 0       |


Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor)
and [selinux](http://selinuxproject.org/page/Main_Page) can be used with
the containers. A container should support setting an apparmor profile or
selinux process and mount labels if provided in the configuration.

Standard apparmor profile:
```c
#include <tunables/global>
profile <profile_name> flags=(attach_disconnected,mediate_deleted) {
  #include <abstractions/base>
  network,
  capability,
  file,
  umount,

  deny @{PROC}/sys/fs/** wklx,
  deny @{PROC}/sysrq-trigger rwklx,
  deny @{PROC}/mem rwklx,
  deny @{PROC}/kmem rwklx,
  deny @{PROC}/sys/kernel/[^s][^h][^m]* wklx,
  deny @{PROC}/sys/kernel/*/** wklx,

  deny mount,

  deny /sys/[^f]*/** wklx,
  deny /sys/f[^s]*/** wklx,
  deny /sys/fs/[^c]*/** wklx,
  deny /sys/fs/c[^g]*/** wklx,
  deny /sys/fs/cg[^r]*/** wklx,
  deny /sys/firmware/efi/efivars/** rwklx,
  deny /sys/kernel/security/** rwklx,
}
```

*TODO: seccomp work is being done to find a good default config*

### Runtime and Init Process

During container creation the parent process needs to talk to the container's init
process and have a form of synchronization. This is accomplished by creating
a pipe that is passed to the container's init. When the init process first spawns
it will block on its side of the pipe until the parent closes its side. This
gives the parent time to place the new process inside a cgroup hierarchy
and/or write any uid/gid mappings required for user namespaces.
The pipe is passed to the init process via FD 3.

The application consuming libcontainer should be compiled statically. libcontainer
does not define any init process, and the arguments provided are used to `exec` the
process inside the application. There should be no long-running init within the
container spec.

If a pseudo tty is provided to a container, it will open and `dup2` the console
as the container's STDIN, STDOUT, and STDERR, as well as mounting the console
as `/dev/console`.

An extra set of mounts is provided to a container and set up for use. A container's
rootfs can contain some non-portable files that can cause side effects during
execution of a process. These files are usually created and populated with
container-specific information by the runtime.

**Extra runtime files:**
* /etc/hosts
* /etc/resolv.conf
* /etc/hostname
* /etc/localtime


#### Defaults

There are a few defaults that can be overridden by users; in their absence,
these apply to processes within a container.

| Type                | Value                          |
| ------------------- | ------------------------------ |
| Parent Death Signal | SIGKILL                        |
| UID                 | 0                              |
| GID                 | 0                              |
| GROUPS              | 0, NULL                        |
| CWD                 | "/"                            |
| $HOME               | Current user's home dir or "/" |
| Readonly rootfs     | false                          |
| Pseudo TTY          | false                          |


## Actions

After a container is created there is a standard set of actions that can
be done to the container. These actions are part of the public API for
a container.

| Action         | Description                                                         |
| -------------- | ------------------------------------------------------------------- |
| Get processes  | Return all the pids for processes running inside a container        |
| Get Stats      | Return resource statistics for the container as a whole             |
| Wait           | Waits on the container's init process (pid 1)                       |
| Wait Process   | Wait on any of the container's processes, returning the exit status |
| Destroy        | Kill the container's init process and remove any filesystem state   |
| Signal         | Send a signal to the container's init process                       |
| Signal Process | Send a signal to any of the container's processes                   |
| Pause          | Pause all processes inside the container                            |
| Resume         | Resume all processes inside the container if paused                 |
| Exec           | Execute a new process inside of the container (requires setns)      |
| Set            | Set up configs of the container after it is created                 |

### Execute a new process inside of a running container

A user can execute a new process inside a running container. Any binaries to be
executed must be accessible within the container's rootfs.

The started process will run inside the container's rootfs. Any changes
made by the process to the container's filesystem will persist after the
process finishes executing.

The started process will join all the container's existing namespaces. When the
container is paused, the process will also be paused, and will resume when
the container is unpaused. The started process will only run when the container's
primary process (PID 1) is running, and will not be restarted when the container
is restarted.

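With the libcontainer API shown in the README, executing an additional process comes down to running a `Process` with `Init: false`; a minimal sketch (the command and environment are illustrative):

```go
// Exec an additional process inside the already-running container.
// Init: false means this process joins the existing container instead of
// acting as its init.
ps := &libcontainer.Process{
    Args:   []string{"/bin/ps", "aux"},
    Env:    []string{"PATH=/bin:/usr/bin"},
    Stdin:  os.Stdin,
    Stdout: os.Stdout,
    Stderr: os.Stderr,
    Init:   false,
}
if err := container.Run(ps); err != nil {
    logrus.Fatal(err)
}
if _, err := ps.Wait(); err != nil {
    logrus.Fatal(err)
}
```
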
#### Planned additions

The started process will have its own cgroups nested inside the container's
cgroups. This is used for process tracking and optionally resource allocation
handling for the new process. The freezer cgroup is required; the rest of the
cgroups are optional. The process executor must place its pid inside the correct
cgroups before starting the process. This is done so that no child processes or
threads can escape the cgroups.

When the process is stopped, the process executor will try (in a best-effort way)
to stop all its children and remove the sub-cgroups.
16 vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go (generated, vendored)
@@ -1,16 +0,0 @@
package apparmor

import "errors"

var (
    // IsEnabled returns true if apparmor is enabled for the host.
    IsEnabled = isEnabled

    // ApplyProfile will apply the profile with the specified name to the process after
    // the next exec. It is only supported on Linux and produces an ErrApparmorNotEnabled
    // on other platforms.
    ApplyProfile = applyProfile

    // ErrApparmorNotEnabled indicates that AppArmor is not enabled or not supported.
    ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported")
)
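The exported `IsEnabled`/`ApplyProfile` pair above is the whole public surface of this package; a minimal usage sketch (the profile name is illustrative):

```go
import (
    "github.com/opencontainers/runc/libcontainer/apparmor"
    "github.com/sirupsen/logrus"
)

func applyAppArmor() {
    // Only attempt to set a profile when the host actually has AppArmor enabled.
    if !apparmor.IsEnabled() {
        return
    }
    // The profile takes effect at the next exec of this process.
    if err := apparmor.ApplyProfile("docker-default"); err != nil {
        logrus.Warnf("failed to apply AppArmor profile: %v", err)
    }
}
```
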
68 vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor_linux.go (generated, vendored)
@@ -1,68 +0,0 @@
package apparmor

import (
    "errors"
    "fmt"
    "os"
    "sync"

    "github.com/opencontainers/runc/libcontainer/utils"
)

var (
    appArmorEnabled bool
    checkAppArmor   sync.Once
)

// isEnabled returns true if apparmor is enabled for the host.
func isEnabled() bool {
    checkAppArmor.Do(func() {
        if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil {
            buf, err := os.ReadFile("/sys/module/apparmor/parameters/enabled")
            appArmorEnabled = err == nil && len(buf) > 1 && buf[0] == 'Y'
        }
    })
    return appArmorEnabled
}

func setProcAttr(attr, value string) error {
    // Under AppArmor you can only change your own attr, so use /proc/self/
    // instead of /proc/<tid>/ like libapparmor does
    attrPath := "/proc/self/attr/apparmor/" + attr
    if _, err := os.Stat(attrPath); errors.Is(err, os.ErrNotExist) {
        // fall back to the old convention
        attrPath = "/proc/self/attr/" + attr
    }

    f, err := os.OpenFile(attrPath, os.O_WRONLY, 0)
    if err != nil {
        return err
    }
    defer f.Close()

    if err := utils.EnsureProcHandle(f); err != nil {
        return err
    }

    _, err = f.WriteString(value)
    return err
}

// changeOnExec reimplements aa_change_onexec from libapparmor in Go
func changeOnExec(name string) error {
    if err := setProcAttr("exec", "exec "+name); err != nil {
        return fmt.Errorf("apparmor failed to apply profile: %w", err)
    }
    return nil
}

// applyProfile will apply the profile with the specified name to the process after
// the next exec. It is only supported on Linux and produces an error on other
// platforms.
func applyProfile(name string) error {
    if name == "" {
        return nil
    }

    return changeOnExec(name)
}
15 vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor_unsupported.go (generated, vendored)
@@ -1,15 +0,0 @@
//go:build !linux
// +build !linux

package apparmor

func isEnabled() bool {
    return false
}

func applyProfile(name string) error {
    if name != "" {
        return ErrApparmorNotEnabled
    }
    return nil
}
123 vendor/github.com/opencontainers/runc/libcontainer/capabilities/capabilities.go (generated, vendored)
@@ -1,123 +0,0 @@
//go:build linux
// +build linux

package capabilities

import (
    "sort"
    "strings"

    "github.com/opencontainers/runc/libcontainer/configs"
    "github.com/sirupsen/logrus"
    "github.com/syndtr/gocapability/capability"
)

const allCapabilityTypes = capability.CAPS | capability.BOUNDING | capability.AMBIENT

var (
    capabilityMap map[string]capability.Cap
    capTypes      = []capability.CapType{
        capability.BOUNDING,
        capability.PERMITTED,
        capability.INHERITABLE,
        capability.EFFECTIVE,
        capability.AMBIENT,
    }
)

func init() {
    capabilityMap = make(map[string]capability.Cap, capability.CAP_LAST_CAP+1)
    for _, c := range capability.List() {
        if c > capability.CAP_LAST_CAP {
            continue
        }
        capabilityMap["CAP_"+strings.ToUpper(c.String())] = c
    }
}

// KnownCapabilities returns the list of the known capabilities.
// Used by `runc features`.
func KnownCapabilities() []string {
    list := capability.List()
    res := make([]string, len(list))
    for i, c := range list {
        res[i] = "CAP_" + strings.ToUpper(c.String())
    }
    return res
}

// New creates a new Caps from the given Capabilities config. Unknown Capabilities
// or Capabilities that are unavailable in the current environment are ignored,
// printing a warning instead.
func New(capConfig *configs.Capabilities) (*Caps, error) {
    var (
        err error
        c   Caps
    )

    unknownCaps := make(map[string]struct{})
    c.caps = map[capability.CapType][]capability.Cap{
        capability.BOUNDING:    capSlice(capConfig.Bounding, unknownCaps),
        capability.EFFECTIVE:   capSlice(capConfig.Effective, unknownCaps),
        capability.INHERITABLE: capSlice(capConfig.Inheritable, unknownCaps),
        capability.PERMITTED:   capSlice(capConfig.Permitted, unknownCaps),
        capability.AMBIENT:     capSlice(capConfig.Ambient, unknownCaps),
    }
    if c.pid, err = capability.NewPid2(0); err != nil {
        return nil, err
    }
    if err = c.pid.Load(); err != nil {
        return nil, err
    }
    if len(unknownCaps) > 0 {
        logrus.Warn("ignoring unknown or unavailable capabilities: ", mapKeys(unknownCaps))
    }
    return &c, nil
}

// capSlice converts the slice of capability names in caps, to their numeric
// equivalent, and returns them as a slice. Unknown or unavailable capabilities
// are not returned, but appended to unknownCaps.
func capSlice(caps []string, unknownCaps map[string]struct{}) []capability.Cap {
    var out []capability.Cap
    for _, c := range caps {
        if v, ok := capabilityMap[c]; !ok {
            unknownCaps[c] = struct{}{}
        } else {
            out = append(out, v)
        }
    }
    return out
}

// mapKeys returns the keys of input in sorted order
func mapKeys(input map[string]struct{}) []string {
    var keys []string
    for c := range input {
        keys = append(keys, c)
    }
    sort.Strings(keys)
    return keys
}

// Caps holds the capabilities for a container.
type Caps struct {
    pid  capability.Capabilities
    caps map[capability.CapType][]capability.Cap
}

// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist.
func (c *Caps) ApplyBoundingSet() error {
    c.pid.Clear(capability.BOUNDING)
    c.pid.Set(capability.BOUNDING, c.caps[capability.BOUNDING]...)
    return c.pid.Apply(capability.BOUNDING)
}

// Apply sets all the capabilities for the current process in the config.
func (c *Caps) ApplyCaps() error {
    c.pid.Clear(allCapabilityTypes)
    for _, g := range capTypes {
        c.pid.Set(g, c.caps[g]...)
    }
    return c.pid.Apply(allCapabilityTypes)
}
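A minimal usage sketch of the API above (not part of the vendored file): turn a `configs.Capabilities` block, such as the one in the README sample config, into applied capability sets for the current process.

```go
caps, err := capabilities.New(config.Capabilities)
if err != nil {
    logrus.Fatal(err)
}
// Drop everything outside the configured bounding set first, then apply the
// effective/permitted/inheritable/ambient sets in one call.
if err := caps.ApplyBoundingSet(); err != nil {
    logrus.Fatal(err)
}
if err := caps.ApplyCaps(); err != nil {
    logrus.Fatal(err)
}
```
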
@@ -1,4 +0,0 @@
//go:build !linux
// +build !linux

package capabilities
86 vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go (generated, vendored)
@@ -1,86 +0,0 @@
package validate

import (
    "errors"
    "fmt"
    "strings"

    "github.com/opencontainers/runc/libcontainer/configs"
)

// rootlessEUID makes sure that the config can be applied when runc
// is being executed as a non-root user (euid != 0) in the current user namespace.
func (v *ConfigValidator) rootlessEUID(config *configs.Config) error {
    if !config.RootlessEUID {
        return nil
    }
    if err := rootlessEUIDMappings(config); err != nil {
        return err
    }
    if err := rootlessEUIDMount(config); err != nil {
        return err
    }

    // XXX: We currently can't verify the user config at all, because
    //      configs.Config doesn't store the user-related configs. So this
    //      has to be verified by setupUser() in init_linux.go.

    return nil
}

func rootlessEUIDMappings(config *configs.Config) error {
    if !config.Namespaces.Contains(configs.NEWUSER) {
        return errors.New("rootless container requires user namespaces")
    }
    // We only require mappings if we are not joining another userns.
    if path := config.Namespaces.PathOf(configs.NEWUSER); path == "" {
        if len(config.UidMappings) == 0 {
            return errors.New("rootless containers requires at least one UID mapping")
        }
        if len(config.GidMappings) == 0 {
            return errors.New("rootless containers requires at least one GID mapping")
        }
    }
    return nil
}

// rootlessEUIDMount verifies that the user isn't trying to set up any mounts they don't have
// the rights to do. In addition, it makes sure that no mount has a `uid=` or
// `gid=` option that doesn't resolve to root.
func rootlessEUIDMount(config *configs.Config) error {
    // XXX: We could whitelist allowed devices at this point, but I'm not
    //      convinced that's a good idea. The kernel is the best arbiter of
    //      access control.

    for _, mount := range config.Mounts {
        // Check that the options list doesn't contain any uid= or gid= entries
        // that don't resolve to root.
        for _, opt := range strings.Split(mount.Data, ",") {
            if strings.HasPrefix(opt, "uid=") {
                var uid int
                n, err := fmt.Sscanf(opt, "uid=%d", &uid)
                if n != 1 || err != nil {
                    // Ignore unknown mount options.
                    continue
                }
                if _, err := config.HostUID(uid); err != nil {
                    return fmt.Errorf("cannot specify uid=%d mount option for rootless container: %w", uid, err)
                }
            }

            if strings.HasPrefix(opt, "gid=") {
                var gid int
                n, err := fmt.Sscanf(opt, "gid=%d", &gid)
                if n != 1 || err != nil {
                    // Ignore unknown mount options.
                    continue
                }
                if _, err := config.HostGID(gid); err != nil {
                    return fmt.Errorf("cannot specify gid=%d mount option for rootless container: %w", gid, err)
                }
            }
        }
    }

    return nil
}
306 vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go (generated, vendored)
@@ -1,306 +0,0 @@
package validate

import (
    "errors"
    "fmt"
    "os"
    "path/filepath"
    "strings"
    "sync"

    "github.com/opencontainers/runc/libcontainer/cgroups"
    "github.com/opencontainers/runc/libcontainer/configs"
    "github.com/opencontainers/runc/libcontainer/intelrdt"
    selinux "github.com/opencontainers/selinux/go-selinux"
    "github.com/sirupsen/logrus"
    "golang.org/x/sys/unix"
)

type Validator interface {
    Validate(*configs.Config) error
}

func New() Validator {
    return &ConfigValidator{}
}

type ConfigValidator struct{}

type check func(config *configs.Config) error

func (v *ConfigValidator) Validate(config *configs.Config) error {
    checks := []check{
        v.cgroups,
        v.rootfs,
        v.network,
        v.hostname,
        v.security,
        v.usernamespace,
        v.cgroupnamespace,
        v.sysctl,
        v.intelrdt,
        v.rootlessEUID,
    }
    for _, c := range checks {
        if err := c(config); err != nil {
            return err
        }
    }
    // Relaxed validation rules for backward compatibility
    warns := []check{
        v.mounts, // TODO (runc v1.x.x): make this an error instead of a warning
    }
    for _, c := range warns {
        if err := c(config); err != nil {
            logrus.WithError(err).Warn("invalid configuration")
        }
    }
    return nil
}

// rootfs validates if the rootfs is an absolute path and is not a symlink
// to the container's root filesystem.
func (v *ConfigValidator) rootfs(config *configs.Config) error {
    if _, err := os.Stat(config.Rootfs); err != nil {
        return fmt.Errorf("invalid rootfs: %w", err)
    }
    cleaned, err := filepath.Abs(config.Rootfs)
    if err != nil {
        return fmt.Errorf("invalid rootfs: %w", err)
    }
    if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil {
        return fmt.Errorf("invalid rootfs: %w", err)
    }
    if filepath.Clean(config.Rootfs) != cleaned {
        return errors.New("invalid rootfs: not an absolute path, or a symlink")
    }
    return nil
}

func (v *ConfigValidator) network(config *configs.Config) error {
    if !config.Namespaces.Contains(configs.NEWNET) {
        if len(config.Networks) > 0 || len(config.Routes) > 0 {
            return errors.New("unable to apply network settings without a private NET namespace")
        }
    }
    return nil
}

func (v *ConfigValidator) hostname(config *configs.Config) error {
    if config.Hostname != "" && !config.Namespaces.Contains(configs.NEWUTS) {
        return errors.New("unable to set hostname without a private UTS namespace")
    }
    return nil
}

func (v *ConfigValidator) security(config *configs.Config) error {
    // restrict sys without mount namespace
    if (len(config.MaskPaths) > 0 || len(config.ReadonlyPaths) > 0) &&
        !config.Namespaces.Contains(configs.NEWNS) {
        return errors.New("unable to restrict sys entries without a private MNT namespace")
    }
    if config.ProcessLabel != "" && !selinux.GetEnabled() {
        return errors.New("selinux label is specified in config, but selinux is disabled or not supported")
    }

    return nil
}

func (v *ConfigValidator) usernamespace(config *configs.Config) error {
    if config.Namespaces.Contains(configs.NEWUSER) {
        if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
            return errors.New("user namespaces aren't enabled in the kernel")
        }
        hasPath := config.Namespaces.PathOf(configs.NEWUSER) != ""
        hasMappings := config.UidMappings != nil || config.GidMappings != nil
        if !hasPath && !hasMappings {
            return errors.New("user namespaces enabled, but no namespace path to join nor mappings to apply specified")
        }
        // The hasPath && hasMappings validation case is handled in specconv --
        // we cache the mappings in Config during specconv in the hasPath case,
        // so we cannot do that validation here.
    } else {
        if config.UidMappings != nil || config.GidMappings != nil {
            return errors.New("user namespace mappings specified, but user namespace isn't enabled in the config")
        }
    }
    return nil
}

func (v *ConfigValidator) cgroupnamespace(config *configs.Config) error {
    if config.Namespaces.Contains(configs.NEWCGROUP) {
        if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
            return errors.New("cgroup namespaces aren't enabled in the kernel")
        }
    }
    return nil
}

// convertSysctlVariableToDotsSeparator can return sysctl variables in dots separator format.
// The '/' separator is also accepted in place of a '.'.
// Convert the sysctl variables to dots separator format for validation.
// More info: sysctl(8), sysctl.d(5).
//
// For example:
// Input sysctl variable "net/ipv4/conf/eno2.100.rp_filter"
// will return the converted value "net.ipv4.conf.eno2/100.rp_filter"
func convertSysctlVariableToDotsSeparator(val string) string {
    if val == "" {
        return val
    }
    firstSepIndex := strings.IndexAny(val, "./")
    if firstSepIndex == -1 || val[firstSepIndex] == '.' {
        return val
    }

    f := func(r rune) rune {
        switch r {
        case '.':
            return '/'
        case '/':
            return '.'
        }
        return r
    }
    return strings.Map(f, val)
}

// sysctl validates that the specified sysctl keys are valid or not.
// /proc/sys isn't completely namespaced and depending on which namespaces
// are specified, a subset of sysctls are permitted.
func (v *ConfigValidator) sysctl(config *configs.Config) error {
    validSysctlMap := map[string]bool{
        "kernel.msgmax":          true,
        "kernel.msgmnb":          true,
        "kernel.msgmni":          true,
        "kernel.sem":             true,
        "kernel.shmall":          true,
        "kernel.shmmax":          true,
        "kernel.shmmni":          true,
        "kernel.shm_rmid_forced": true,
    }

    var (
        netOnce    sync.Once
        hostnet    bool
        hostnetErr error
    )

    for s := range config.Sysctl {
        s := convertSysctlVariableToDotsSeparator(s)
        if validSysctlMap[s] || strings.HasPrefix(s, "fs.mqueue.") {
            if config.Namespaces.Contains(configs.NEWIPC) {
                continue
            } else {
                return fmt.Errorf("sysctl %q is not allowed in the hosts ipc namespace", s)
            }
        }
        if strings.HasPrefix(s, "net.") {
            // Is container using host netns?
            // Here "host" means "current", not "initial".
            netOnce.Do(func() {
                if !config.Namespaces.Contains(configs.NEWNET) {
                    hostnet = true
                    return
                }
                path := config.Namespaces.PathOf(configs.NEWNET)
                if path == "" {
                    // own netns, so hostnet = false
                    return
                }
                hostnet, hostnetErr = isHostNetNS(path)
            })
            if hostnetErr != nil {
                return fmt.Errorf("invalid netns path: %w", hostnetErr)
            }
            if hostnet {
                return fmt.Errorf("sysctl %q not allowed in host network namespace", s)
            }
            continue
        }
        if config.Namespaces.Contains(configs.NEWUTS) {
            switch s {
            case "kernel.domainname":
                // This is namespaced and there's no explicit OCI field for it.
                continue
            case "kernel.hostname":
                // This is namespaced but there's a conflicting (dedicated) OCI field for it.
                return fmt.Errorf("sysctl %q is not allowed as it conflicts with the OCI %q field", s, "hostname")
            }
        }
        return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s)
    }

    return nil
}

func (v *ConfigValidator) intelrdt(config *configs.Config) error {
    if config.IntelRdt != nil {
        if config.IntelRdt.ClosID == "." || config.IntelRdt.ClosID == ".." || strings.Contains(config.IntelRdt.ClosID, "/") {
            return fmt.Errorf("invalid intelRdt.ClosID %q", config.IntelRdt.ClosID)
        }

        if !intelrdt.IsCATEnabled() && config.IntelRdt.L3CacheSchema != "" {
            return errors.New("intelRdt.l3CacheSchema is specified in config, but Intel RDT/CAT is not enabled")
        }
        if !intelrdt.IsMBAEnabled() && config.IntelRdt.MemBwSchema != "" {
            return errors.New("intelRdt.memBwSchema is specified in config, but Intel RDT/MBA is not enabled")
        }
    }

    return nil
}

func (v *ConfigValidator) cgroups(config *configs.Config) error {
    c := config.Cgroups
    if c == nil {
        return nil
    }

    if (c.Name != "" || c.Parent != "") && c.Path != "" {
        return fmt.Errorf("cgroup: either Path or Name and Parent should be used, got %+v", c)
    }

    r := c.Resources
    if r == nil {
        return nil
    }

    if !cgroups.IsCgroup2UnifiedMode() && r.Unified != nil {
        return cgroups.ErrV1NoUnified
    }

    if cgroups.IsCgroup2UnifiedMode() {
        _, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
        if err != nil {
            return err
        }
    }

    return nil
}

func (v *ConfigValidator) mounts(config *configs.Config) error {
    for _, m := range config.Mounts {
        if !filepath.IsAbs(m.Destination) {
            return fmt.Errorf("invalid mount %+v: mount destination not absolute", m)
        }
    }

    return nil
}

func isHostNetNS(path string) (bool, error) {
    const currentProcessNetns = "/proc/self/ns/net"

    var st1, st2 unix.Stat_t

    if err := unix.Stat(currentProcessNetns, &st1); err != nil {
        return false, &os.PathError{Op: "stat", Path: currentProcessNetns, Err: err}
    }
    if err := unix.Stat(path, &st2); err != nil {
        return false, &os.PathError{Op: "stat", Path: path, Err: err}
    }

    return (st1.Dev == st2.Dev) && (st1.Ino == st2.Ino), nil
}
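Callers typically run the validator right before container creation; a minimal usage sketch against the API above:

```go
// Validate a configs.Config (such as the sample in the README) before handing
// it to the factory. Hard failures are returned as errors; relaxed checks are
// only logged as warnings by Validate itself.
if err := validate.New().Validate(config); err != nil {
    logrus.Fatalf("invalid container config: %v", err)
}
```
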
41 vendor/github.com/opencontainers/runc/libcontainer/console_linux.go (generated, vendored)
@@ -1,41 +0,0 @@
package libcontainer

import (
    "os"

    "golang.org/x/sys/unix"
)

// mountConsole initializes the console inside the rootfs, mounting with the specified mount label
// and applying the correct ownership of the console.
func mountConsole(slavePath string) error {
    oldMask := unix.Umask(0o000)
    defer unix.Umask(oldMask)
    f, err := os.Create("/dev/console")
    if err != nil && !os.IsExist(err) {
        return err
    }
    if f != nil {
        f.Close()
    }
    return mount(slavePath, "/dev/console", "", "bind", unix.MS_BIND, "")
}

// dupStdio opens the slavePath for the console and dups the fds to the current
// process's stdio, fd 0,1,2.
func dupStdio(slavePath string) error {
    fd, err := unix.Open(slavePath, unix.O_RDWR, 0)
    if err != nil {
        return &os.PathError{
            Op:   "open",
            Path: slavePath,
            Err:  err,
        }
    }
    for _, i := range []int{0, 1, 2} {
        if err := unix.Dup3(fd, i, 0); err != nil {
            return err
        }
    }
    return nil
}
130 vendor/github.com/opencontainers/runc/libcontainer/container.go (generated, vendored)
@@ -1,130 +0,0 @@
// Package libcontainer provides a native Go implementation for creating containers
// with namespaces, cgroups, capabilities, and filesystem access controls.
// It allows you to manage the lifecycle of the container performing additional operations
// after the container is created.
package libcontainer

import (
    "os"
    "time"

    "github.com/opencontainers/runc/libcontainer/configs"
    "github.com/opencontainers/runtime-spec/specs-go"
)

// Status is the status of a container.
type Status int

const (
    // Created is the status that denotes the container exists but has not been run yet.
    Created Status = iota
    // Running is the status that denotes the container exists and is running.
    Running
    // Pausing is the status that denotes the container exists, it is in the process of being paused.
    Pausing
    // Paused is the status that denotes the container exists, but all its processes are paused.
    Paused
    // Stopped is the status that denotes the container does not have a created or running process.
    Stopped
)

func (s Status) String() string {
    switch s {
    case Created:
        return "created"
    case Running:
        return "running"
    case Pausing:
        return "pausing"
    case Paused:
        return "paused"
    case Stopped:
        return "stopped"
    default:
        return "unknown"
    }
}

// BaseState represents the platform agnostic pieces relating to a
// running container's state
type BaseState struct {
    // ID is the container ID.
    ID string `json:"id"`

    // InitProcessPid is the init process id in the parent namespace.
    InitProcessPid int `json:"init_process_pid"`

    // InitProcessStartTime is the init process start time in clock cycles since boot time.
    InitProcessStartTime uint64 `json:"init_process_start"`

    // Created is the unix timestamp for the creation time of the container in UTC
    Created time.Time `json:"created"`

    // Config is the container's configuration.
    Config configs.Config `json:"config"`
}

// BaseContainer is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
// was not found. BaseContainer includes methods that are platform agnostic.
type BaseContainer interface {
    // Returns the ID of the container
    ID() string

    // Returns the current status of the container.
    Status() (Status, error)

    // State returns the current container's state information.
    State() (*State, error)

    // OCIState returns the current container's state information.
    OCIState() (*specs.State, error)

    // Returns the current config of the container.
    Config() configs.Config

    // Returns the PIDs inside this container. The PIDs are in the namespace of the calling process.
    //
    // Some of the returned PIDs may no longer refer to processes in the Container, unless
    // the Container state is PAUSED in which case every PID in the slice is valid.
    Processes() ([]int, error)

    // Returns statistics for the container.
    Stats() (*Stats, error)

    // Set resources of container as configured
    //
    // We can use this to change resources when containers are running.
    //
    Set(config configs.Config) error

    // Start a process inside the container. Returns error if process fails to
    // start. You can track process lifecycle with passed Process structure.
    Start(process *Process) (err error)

    // Run immediately starts the process inside the container. Returns error if process
    // fails to start. It does not block waiting for the exec fifo after start returns but
    // opens the fifo after start returns.
    Run(process *Process) (err error)

    // Destroys the container, if its in a valid state, after killing any
    // remaining running processes.
    //
    // Any event registrations are removed before the container is destroyed.
    // No error is returned if the container is already destroyed.
    //
    // Running containers must first be stopped using Signal(..).
    // Paused containers must first be resumed using Resume(..).
    Destroy() error

    // Signal sends the provided signal code to the container's initial process.
    //
    // If all is specified the signal is sent to all processes in the container
    // including the initial process.
    Signal(s os.Signal, all bool) error

    // Exec signals the container to exec the users process at the end of the init.
    Exec() error
}
2267
vendor/github.com/opencontainers/runc/libcontainer/container_linux.go
generated
vendored
File diff suppressed because it is too large
34
vendor/github.com/opencontainers/runc/libcontainer/criu_opts_linux.go
generated
vendored
@@ -1,34 +0,0 @@
package libcontainer

import criu "github.com/checkpoint-restore/go-criu/v5/rpc"

type CriuPageServerInfo struct {
	Address string // IP address of CRIU page server
	Port    int32  // port number of CRIU page server
}

type VethPairName struct {
	ContainerInterfaceName string
	HostInterfaceName      string
}

type CriuOpts struct {
	ImagesDirectory         string             // directory for storing image files
	WorkDirectory           string             // directory to cd and write logs/pidfiles/stats to
	ParentImage             string             // directory for storing parent image files in pre-dump and dump
	LeaveRunning            bool               // leave container in running state after checkpoint
	TcpEstablished          bool               // checkpoint/restore established TCP connections
	ExternalUnixConnections bool               // allow external unix connections
	ShellJob                bool               // allow to dump and restore shell jobs
	FileLocks               bool               // handle file locks, for safety
	PreDump                 bool               // call criu predump to perform iterative checkpoint
	PageServer              CriuPageServerInfo // allow to dump to criu page server
	VethPairs               []VethPairName     // pass the veth to criu when restore
	ManageCgroupsMode       criu.CriuCgMode    // dump or restore cgroup mode
	EmptyNs                 uint32             // don't c/r properties for namespace from this mask
	AutoDedup               bool               // auto deduplication for incremental dumps
	LazyPages               bool               // restore memory pages lazily using userfaultfd
	StatusFd                int                // fd for feedback when lazy server is ready
	LsmProfile              string             // LSM profile used to restore the container
	LsmMountContext         string             // LSM mount context value to use during restore
}
17
vendor/github.com/opencontainers/runc/libcontainer/eaccess_go119.go
generated
vendored
@@ -1,17 +0,0 @@
//go:build !go1.20
// +build !go1.20

package libcontainer

import "golang.org/x/sys/unix"

func eaccess(path string) error {
	// This check is similar to access(2) with X_OK except for
	// setuid/setgid binaries where it checks against the effective
	// (rather than real) uid and gid. It is not needed in go 1.20
	// and beyond and will be removed later.

	// Relies on code added in https://go-review.googlesource.com/c/sys/+/468877
	// and older CLs linked from there.
	return unix.Faccessat(unix.AT_FDCWD, path, unix.X_OK, unix.AT_EACCESS)
}
10
vendor/github.com/opencontainers/runc/libcontainer/eaccess_stub.go
generated
vendored
@@ -1,10 +0,0 @@
//go:build go1.20

package libcontainer

func eaccess(path string) error {
	// Not needed in Go 1.20+ as the functionality is already in there
	// (added by https://go.dev/cl/416115, https://go.dev/cl/414824,
	// and fixed in Go 1.20.2 by https://go.dev/cl/469956).
	return nil
}
13
vendor/github.com/opencontainers/runc/libcontainer/error.go
generated
vendored
@@ -1,13 +0,0 @@
package libcontainer

import "errors"

var (
	ErrExist      = errors.New("container with given ID already exists")
	ErrInvalidID  = errors.New("invalid container ID format")
	ErrNotExist   = errors.New("container does not exist")
	ErrPaused     = errors.New("container paused")
	ErrRunning    = errors.New("container still running")
	ErrNotRunning = errors.New("container not running")
	ErrNotPaused  = errors.New("container not paused")
)
30
vendor/github.com/opencontainers/runc/libcontainer/factory.go
generated
vendored
@@ -1,30 +0,0 @@
package libcontainer

import (
	"github.com/opencontainers/runc/libcontainer/configs"
)

type Factory interface {
	// Creates a new container with the given id and starts the initial process inside it.
	// id must be a string containing only letters, digits and underscores and must contain
	// between 1 and 1024 characters, inclusive.
	//
	// The id must not already be in use by an existing container. Containers created using
	// a factory with the same path (and filesystem) must have distinct ids.
	//
	// Returns the new container with a running process.
	//
	// On error, any partially created container parts are cleaned up (the operation is atomic).
	Create(id string, config *configs.Config) (Container, error)

	// Load takes an ID for an existing container and returns the container information
	// from the state. This presents a read only view of the container.
	Load(id string) (Container, error)

	// StartInitialization is an internal API to libcontainer used during the reexec of the
	// container.
	StartInitialization() error

	// Type returns info string about factory type (e.g. lxc, libcontainer...)
	Type() string
}
402
vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go
generated
vendored
@@ -1,402 +0,0 @@
package libcontainer

import (
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"runtime/debug"
	"strconv"

	securejoin "github.com/cyphar/filepath-securejoin"
	"github.com/moby/sys/mountinfo"
	"golang.org/x/sys/unix"

	"github.com/opencontainers/runc/libcontainer/cgroups/manager"
	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/configs/validate"
	"github.com/opencontainers/runc/libcontainer/intelrdt"
	"github.com/opencontainers/runc/libcontainer/utils"
	"github.com/sirupsen/logrus"
)

const (
	stateFilename    = "state.json"
	execFifoFilename = "exec.fifo"
)

var idRegex = regexp.MustCompile(`^[\w+-\.]+$`)

// InitArgs returns an options func to configure a LinuxFactory with the
// provided init binary path and arguments.
func InitArgs(args ...string) func(*LinuxFactory) error {
	return func(l *LinuxFactory) (err error) {
		if len(args) > 0 {
			// Resolve relative paths to ensure that its available
			// after directory changes.
			if args[0], err = filepath.Abs(args[0]); err != nil {
				// The only error returned from filepath.Abs is
				// the one from os.Getwd, i.e. a system error.
				return err
			}
		}

		l.InitArgs = args
		return nil
	}
}

// TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs.
func TmpfsRoot(l *LinuxFactory) error {
	mounted, err := mountinfo.Mounted(l.Root)
	if err != nil {
		return err
	}
	if !mounted {
		if err := mount("tmpfs", l.Root, "", "tmpfs", 0, ""); err != nil {
			return err
		}
	}
	return nil
}

// CriuPath returns an option func to configure a LinuxFactory with the
// provided criupath
func CriuPath(criupath string) func(*LinuxFactory) error {
	return func(l *LinuxFactory) error {
		l.CriuPath = criupath
		return nil
	}
}

// New returns a linux based container factory based in the root directory and
// configures the factory with the provided option funcs.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
	if root != "" {
		if err := os.MkdirAll(root, 0o700); err != nil {
			return nil, err
		}
	}
	l := &LinuxFactory{
		Root:      root,
		InitPath:  "/proc/self/exe",
		InitArgs:  []string{os.Args[0], "init"},
		Validator: validate.New(),
		CriuPath:  "criu",
	}

	for _, opt := range options {
		if opt == nil {
			continue
		}
		if err := opt(l); err != nil {
			return nil, err
		}
	}
	return l, nil
}

// LinuxFactory implements the default factory interface for linux based systems.
type LinuxFactory struct {
	// Root directory for the factory to store state.
	Root string

	// InitPath is the path for calling the init responsibilities for spawning
	// a container.
	InitPath string

	// InitArgs are arguments for calling the init responsibilities for spawning
	// a container.
	InitArgs []string

	// CriuPath is the path to the criu binary used for checkpoint and restore of
	// containers.
	CriuPath string

	// New{u,g}idmapPath is the path to the binaries used for mapping with
	// rootless containers.
	NewuidmapPath string
	NewgidmapPath string

	// Validator provides validation to container configurations.
	Validator validate.Validator
}

func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
	if l.Root == "" {
		return nil, errors.New("root not set")
	}
	if err := l.validateID(id); err != nil {
		return nil, err
	}
	if err := l.Validator.Validate(config); err != nil {
		return nil, err
	}
	containerRoot, err := securejoin.SecureJoin(l.Root, id)
	if err != nil {
		return nil, err
	}
	if _, err := os.Stat(containerRoot); err == nil {
		return nil, ErrExist
	} else if !os.IsNotExist(err) {
		return nil, err
	}

	cm, err := manager.New(config.Cgroups)
	if err != nil {
		return nil, err
	}

	// Check that cgroup does not exist or empty (no processes).
	// Note for cgroup v1 this check is not thorough, as there are multiple
	// separate hierarchies, while both Exists() and GetAllPids() only use
	// one for "devices" controller (assuming others are the same, which is
	// probably true in almost all scenarios). Checking all the hierarchies
	// would be too expensive.
	if cm.Exists() {
		pids, err := cm.GetAllPids()
		// Reading PIDs can race with cgroups removal, so ignore ENOENT and ENODEV.
		if err != nil && !errors.Is(err, os.ErrNotExist) && !errors.Is(err, unix.ENODEV) {
			return nil, fmt.Errorf("unable to get cgroup PIDs: %w", err)
		}
		if len(pids) != 0 {
			if config.Cgroups.Systemd {
				// systemd cgroup driver can't add a pid to an
				// existing systemd unit and will return an
				// error anyway, so let's error out early.
				return nil, fmt.Errorf("container's cgroup is not empty: %d process(es) found", len(pids))
			}
			// TODO: return an error.
			logrus.Warnf("container's cgroup is not empty: %d process(es) found", len(pids))
			logrus.Warn("DEPRECATED: running container in a non-empty cgroup won't be supported in runc 1.2; https://github.com/opencontainers/runc/issues/3132")
		}
	}

	// Check that cgroup is not frozen. Do not use Exists() here
	// since in cgroup v1 it only checks "devices" controller.
	st, err := cm.GetFreezerState()
	if err != nil {
		return nil, fmt.Errorf("unable to get cgroup freezer state: %w", err)
	}
	if st == configs.Frozen {
		return nil, errors.New("container's cgroup unexpectedly frozen")
	}

	if err := os.MkdirAll(containerRoot, 0o711); err != nil {
		return nil, err
	}
	if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil {
		return nil, err
	}
	c := &linuxContainer{
		id:              id,
		root:            containerRoot,
		config:          config,
		initPath:        l.InitPath,
		initArgs:        l.InitArgs,
		criuPath:        l.CriuPath,
		newuidmapPath:   l.NewuidmapPath,
		newgidmapPath:   l.NewgidmapPath,
		cgroupManager:   cm,
		intelRdtManager: intelrdt.NewManager(config, id, ""),
	}
	c.state = &stoppedState{c: c}
	return c, nil
}

func (l *LinuxFactory) Load(id string) (Container, error) {
	if l.Root == "" {
		return nil, errors.New("root not set")
	}
	// when load, we need to check id is valid or not.
	if err := l.validateID(id); err != nil {
		return nil, err
	}
	containerRoot, err := securejoin.SecureJoin(l.Root, id)
	if err != nil {
		return nil, err
	}
	state, err := l.loadState(containerRoot)
	if err != nil {
		return nil, err
	}
	r := &nonChildProcess{
		processPid:       state.InitProcessPid,
		processStartTime: state.InitProcessStartTime,
		fds:              state.ExternalDescriptors,
	}
	cm, err := manager.NewWithPaths(state.Config.Cgroups, state.CgroupPaths)
	if err != nil {
		return nil, err
	}
	c := &linuxContainer{
		initProcess:          r,
		initProcessStartTime: state.InitProcessStartTime,
		id:                   id,
		config:               &state.Config,
		initPath:             l.InitPath,
		initArgs:             l.InitArgs,
		criuPath:             l.CriuPath,
		newuidmapPath:        l.NewuidmapPath,
		newgidmapPath:        l.NewgidmapPath,
		cgroupManager:        cm,
		intelRdtManager:      intelrdt.NewManager(&state.Config, id, state.IntelRdtPath),
		root:                 containerRoot,
		created:              state.Created,
	}
	c.state = &loadedState{c: c}
	if err := c.refreshState(); err != nil {
		return nil, err
	}
	return c, nil
}

func (l *LinuxFactory) Type() string {
	return "libcontainer"
}

// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally
func (l *LinuxFactory) StartInitialization() (err error) {
	// Get the INITPIPE.
	envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE")
	pipefd, err := strconv.Atoi(envInitPipe)
	if err != nil {
		err = fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE: %w", err)
		logrus.Error(err)
		return err
	}
	pipe := os.NewFile(uintptr(pipefd), "pipe")
	defer pipe.Close()

	defer func() {
		// We have an error during the initialization of the container's init,
		// send it back to the parent process in the form of an initError.
		if werr := writeSync(pipe, procError); werr != nil {
			fmt.Fprintln(os.Stderr, err)
			return
		}
		if werr := utils.WriteJSON(pipe, &initError{Message: err.Error()}); werr != nil {
			fmt.Fprintln(os.Stderr, err)
			return
		}
	}()

	// Only init processes have FIFOFD.
	fifofd := -1
	envInitType := os.Getenv("_LIBCONTAINER_INITTYPE")
	it := initType(envInitType)
	if it == initStandard {
		envFifoFd := os.Getenv("_LIBCONTAINER_FIFOFD")
		if fifofd, err = strconv.Atoi(envFifoFd); err != nil {
			return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD: %w", err)
		}
	}

	var consoleSocket *os.File
	if envConsole := os.Getenv("_LIBCONTAINER_CONSOLE"); envConsole != "" {
		console, err := strconv.Atoi(envConsole)
		if err != nil {
			return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE: %w", err)
		}
		consoleSocket = os.NewFile(uintptr(console), "console-socket")
		defer consoleSocket.Close()
	}

	logPipeFdStr := os.Getenv("_LIBCONTAINER_LOGPIPE")
	logPipeFd, err := strconv.Atoi(logPipeFdStr)
	if err != nil {
		return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE: %w", err)
	}

	// Get mount files (O_PATH).
	mountFds, err := parseMountFds()
	if err != nil {
		return err
	}

	// clear the current process's environment to clean any libcontainer
	// specific env vars.
	os.Clearenv()

	defer func() {
		if e := recover(); e != nil {
			if ee, ok := e.(error); ok {
				err = fmt.Errorf("panic from initialization: %w, %s", ee, debug.Stack())
			} else {
				err = fmt.Errorf("panic from initialization: %v, %s", e, debug.Stack())
			}
		}
	}()

	i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd, mountFds)
	if err != nil {
		return err
	}

	// If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
	return i.Init()
}

func (l *LinuxFactory) loadState(root string) (*State, error) {
	stateFilePath, err := securejoin.SecureJoin(root, stateFilename)
	if err != nil {
		return nil, err
	}
	f, err := os.Open(stateFilePath)
	if err != nil {
		if os.IsNotExist(err) {
			return nil, ErrNotExist
		}
		return nil, err
	}
	defer f.Close()
	var state *State
	if err := json.NewDecoder(f).Decode(&state); err != nil {
		return nil, err
	}
	return state, nil
}

func (l *LinuxFactory) validateID(id string) error {
	if !idRegex.MatchString(id) || string(os.PathSeparator)+id != utils.CleanPath(string(os.PathSeparator)+id) {
		return ErrInvalidID
	}

	return nil
}

// NewuidmapPath returns an option func to configure a LinuxFactory with the
// provided ..
func NewuidmapPath(newuidmapPath string) func(*LinuxFactory) error {
	return func(l *LinuxFactory) error {
		l.NewuidmapPath = newuidmapPath
		return nil
	}
}

// NewgidmapPath returns an option func to configure a LinuxFactory with the
// provided ..
func NewgidmapPath(newgidmapPath string) func(*LinuxFactory) error {
	return func(l *LinuxFactory) error {
		l.NewgidmapPath = newgidmapPath
		return nil
	}
}

func parseMountFds() ([]int, error) {
	fdsJson := os.Getenv("_LIBCONTAINER_MOUNT_FDS")
	if fdsJson == "" {
		// Always return the nil slice if no fd is present.
		return nil, nil
	}

	var mountFds []int
	if err := json.Unmarshal([]byte(fdsJson), &mountFds); err != nil {
		return nil, fmt.Errorf("Error unmarshalling _LIBCONTAINER_MOUNT_FDS: %w", err)
	}

	return mountFds, nil
}
641
vendor/github.com/opencontainers/runc/libcontainer/init_linux.go
generated
vendored
@@ -1,641 +0,0 @@
package libcontainer

import (
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net"
	"os"
	"path/filepath"
	"strings"
	"syscall"
	"unsafe"

	"github.com/containerd/console"
	"github.com/opencontainers/runtime-spec/specs-go"
	"github.com/sirupsen/logrus"
	"github.com/vishvananda/netlink"
	"golang.org/x/sys/unix"

	"github.com/opencontainers/runc/libcontainer/capabilities"
	"github.com/opencontainers/runc/libcontainer/cgroups"
	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/system"
	"github.com/opencontainers/runc/libcontainer/user"
	"github.com/opencontainers/runc/libcontainer/utils"
)

type initType string

const (
	initSetns    initType = "setns"
	initStandard initType = "standard"
)

type pid struct {
	Pid           int `json:"stage2_pid"`
	PidFirstChild int `json:"stage1_pid"`
}

// network is an internal struct used to setup container networks.
type network struct {
	configs.Network

	// TempVethPeerName is a unique temporary veth peer name that was placed into
	// the container's namespace.
	TempVethPeerName string `json:"temp_veth_peer_name"`
}

// initConfig is used for transferring parameters from Exec() to Init()
type initConfig struct {
	Args             []string              `json:"args"`
	Env              []string              `json:"env"`
	Cwd              string                `json:"cwd"`
	Capabilities     *configs.Capabilities `json:"capabilities"`
	ProcessLabel     string                `json:"process_label"`
	AppArmorProfile  string                `json:"apparmor_profile"`
	NoNewPrivileges  bool                  `json:"no_new_privileges"`
	User             string                `json:"user"`
	AdditionalGroups []string              `json:"additional_groups"`
	Config           *configs.Config       `json:"config"`
	Networks         []*network            `json:"network"`
	PassedFilesCount int                   `json:"passed_files_count"`
	ContainerId      string                `json:"containerid"`
	Rlimits          []configs.Rlimit      `json:"rlimits"`
	CreateConsole    bool                  `json:"create_console"`
	ConsoleWidth     uint16                `json:"console_width"`
	ConsoleHeight    uint16                `json:"console_height"`
	RootlessEUID     bool                  `json:"rootless_euid,omitempty"`
	RootlessCgroups  bool                  `json:"rootless_cgroups,omitempty"`
	SpecState        *specs.State          `json:"spec_state,omitempty"`
	Cgroup2Path      string                `json:"cgroup2_path,omitempty"`
}

type initer interface {
	Init() error
}

func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds []int) (initer, error) {
	var config *initConfig
	if err := json.NewDecoder(pipe).Decode(&config); err != nil {
		return nil, err
	}
	if err := populateProcessEnvironment(config.Env); err != nil {
		return nil, err
	}

	// Clean the RLIMIT_NOFILE cache in go runtime.
	// Issue: https://github.com/opencontainers/runc/issues/4195
	maybeClearRlimitNofileCache(config.Rlimits)

	switch t {
	case initSetns:
		// mountFds must be nil in this case. We don't mount while doing runc exec.
		if mountFds != nil {
			return nil, errors.New("mountFds must be nil. Can't mount while doing runc exec.")
		}

		return &linuxSetnsInit{
			pipe:          pipe,
			consoleSocket: consoleSocket,
			config:        config,
			logFd:         logFd,
		}, nil
	case initStandard:
		return &linuxStandardInit{
			pipe:          pipe,
			consoleSocket: consoleSocket,
			parentPid:     unix.Getppid(),
			config:        config,
			fifoFd:        fifoFd,
			logFd:         logFd,
			mountFds:      mountFds,
		}, nil
	}
	return nil, fmt.Errorf("unknown init type %q", t)
}

// populateProcessEnvironment loads the provided environment variables into the
// current processes's environment.
func populateProcessEnvironment(env []string) error {
	for _, pair := range env {
		p := strings.SplitN(pair, "=", 2)
		if len(p) < 2 {
			return errors.New("invalid environment variable: missing '='")
		}
		name, val := p[0], p[1]
		if name == "" {
			return errors.New("invalid environment variable: name cannot be empty")
		}
		if strings.IndexByte(name, 0) >= 0 {
			return fmt.Errorf("invalid environment variable %q: name contains nul byte (\\x00)", name)
		}
		if strings.IndexByte(val, 0) >= 0 {
			return fmt.Errorf("invalid environment variable %q: value contains nul byte (\\x00)", name)
		}
		if err := os.Setenv(name, val); err != nil {
			return err
		}
	}
	return nil
}

// verifyCwd ensures that the current directory is actually inside the mount
// namespace root of the current process.
func verifyCwd() error {
	// getcwd(2) on Linux detects if cwd is outside of the rootfs of the
	// current mount namespace root, and in that case prefixes "(unreachable)"
	// to the returned string. glibc's getcwd(3) and Go's Getwd() both detect
	// when this happens and return ENOENT rather than returning a non-absolute
	// path. In both cases we can therefore easily detect if we have an invalid
	// cwd by checking the return value of getcwd(3). See getcwd(3) for more
	// details, and CVE-2024-21626 for the security issue that motivated this
	// check.
	//
	// We have to use unix.Getwd() here because os.Getwd() has a workaround for
	// $PWD which involves doing stat(.), which can fail if the current
	// directory is inaccessible to the container process.
	if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) {
		return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected")
	} else if err != nil {
		return fmt.Errorf("failed to verify if current working directory is safe: %w", err)
	} else if !filepath.IsAbs(wd) {
		// We shouldn't ever hit this, but check just in case.
		return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd)
	}
	return nil
}

// finalizeNamespace drops the caps, sets the correct user
// and working dir, and closes any leaked file descriptors
// before executing the command inside the namespace
func finalizeNamespace(config *initConfig) error {
	// Ensure that all unwanted fds we may have accidentally
	// inherited are marked close-on-exec so they stay out of the
	// container
	if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
		return fmt.Errorf("error closing exec fds: %w", err)
	}

	// we only do chdir if it's specified
	doChdir := config.Cwd != ""
	if doChdir {
		// First, attempt the chdir before setting up the user.
		// This could allow us to access a directory that the user running runc can access
		// but the container user cannot.
		err := unix.Chdir(config.Cwd)
		switch {
		case err == nil:
			doChdir = false
		case os.IsPermission(err):
			// If we hit an EPERM, we should attempt again after setting up user.
			// This will allow us to successfully chdir if the container user has access
			// to the directory, but the user running runc does not.
			// This is useful in cases where the cwd is also a volume that's been chowned to the container user.
		default:
			return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
		}
	}

	caps := &configs.Capabilities{}
	if config.Capabilities != nil {
		caps = config.Capabilities
	} else if config.Config.Capabilities != nil {
		caps = config.Config.Capabilities
	}
	w, err := capabilities.New(caps)
	if err != nil {
		return err
	}
	// drop capabilities in bounding set before changing user
	if err := w.ApplyBoundingSet(); err != nil {
		return fmt.Errorf("unable to apply bounding set: %w", err)
	}
	// preserve existing capabilities while we change users
	if err := system.SetKeepCaps(); err != nil {
		return fmt.Errorf("unable to set keep caps: %w", err)
	}
	if err := setupUser(config); err != nil {
		return fmt.Errorf("unable to setup user: %w", err)
	}
	// Change working directory AFTER the user has been set up, if we haven't done it yet.
	if doChdir {
		if err := unix.Chdir(config.Cwd); err != nil {
			return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
		}
	}
	// Make sure our final working directory is inside the container.
	if err := verifyCwd(); err != nil {
		return err
	}
	if err := system.ClearKeepCaps(); err != nil {
		return fmt.Errorf("unable to clear keep caps: %w", err)
	}
	if err := w.ApplyCaps(); err != nil {
		return fmt.Errorf("unable to apply caps: %w", err)
	}
	return nil
}

// setupConsole sets up the console from inside the container, and sends the
// master pty fd to the config.Pipe (using cmsg). This is done to ensure that
// consoles are scoped to a container properly (see runc#814 and the many
// issues related to that). This has to be run *after* we've pivoted to the new
// rootfs (and the users' configuration is entirely set up).
func setupConsole(socket *os.File, config *initConfig, mount bool) error {
	defer socket.Close()
	// At this point, /dev/ptmx points to something that we would expect. We
	// used to change the owner of the slave path, but since the /dev/pts mount
	// can have gid=X set (at the users' option). So touching the owner of the
	// slave PTY is not necessary, as the kernel will handle that for us. Note
	// however, that setupUser (specifically fixStdioPermissions) *will* change
	// the UID owner of the console to be the user the process will run as (so
	// they can actually control their console).

	pty, slavePath, err := console.NewPty()
	if err != nil {
		return err
	}

	// After we return from here, we don't need the console anymore.
	defer pty.Close()

	if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 {
		err = pty.Resize(console.WinSize{
			Height: config.ConsoleHeight,
			Width:  config.ConsoleWidth,
		})
		if err != nil {
			return err
		}
	}

	// Mount the console inside our rootfs.
	if mount {
		if err := mountConsole(slavePath); err != nil {
			return err
		}
	}
	// While we can access console.master, using the API is a good idea.
	if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil {
		return err
	}
	// Now, dup over all the things.
	return dupStdio(slavePath)
}

// syncParentReady sends to the given pipe a JSON payload which indicates that
// the init is ready to Exec the child process. It then waits for the parent to
// indicate that it is cleared to Exec.
func syncParentReady(pipe io.ReadWriter) error {
	// Tell parent.
	if err := writeSync(pipe, procReady); err != nil {
		return err
	}

	// Wait for parent to give the all-clear.
	return readSync(pipe, procRun)
}

// syncParentHooks sends to the given pipe a JSON payload which indicates that
// the parent should execute pre-start hooks. It then waits for the parent to
// indicate that it is cleared to resume.
func syncParentHooks(pipe io.ReadWriter) error {
	// Tell parent.
	if err := writeSync(pipe, procHooks); err != nil {
		return err
	}

	// Wait for parent to give the all-clear.
	return readSync(pipe, procResume)
}

// syncParentSeccomp sends to the given pipe a JSON payload which
// indicates that the parent should pick up the seccomp fd with pidfd_getfd()
// and send it to the seccomp agent over a unix socket. It then waits for
// the parent to indicate that it is cleared to resume and closes the seccompFd.
// If the seccompFd is -1, there isn't anything to sync with the parent, so it
// returns no error.
func syncParentSeccomp(pipe io.ReadWriter, seccompFd int) error {
	if seccompFd == -1 {
		return nil
	}

	// Tell parent.
	if err := writeSyncWithFd(pipe, procSeccomp, seccompFd); err != nil {
		unix.Close(seccompFd)
		return err
	}

	// Wait for parent to give the all-clear.
	if err := readSync(pipe, procSeccompDone); err != nil {
		unix.Close(seccompFd)
		return fmt.Errorf("sync parent seccomp: %w", err)
	}

	if err := unix.Close(seccompFd); err != nil {
		return fmt.Errorf("close seccomp fd: %w", err)
	}

	return nil
}

// setupUser changes the groups, gid, and uid for the user inside the container
func setupUser(config *initConfig) error {
	// Set up defaults.
	defaultExecUser := user.ExecUser{
		Uid:  0,
		Gid:  0,
		Home: "/",
	}

	passwdPath, err := user.GetPasswdPath()
	if err != nil {
		return err
	}

	groupPath, err := user.GetGroupPath()
	if err != nil {
		return err
	}

	execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
	if err != nil {
		return err
	}

	var addGroups []int
	if len(config.AdditionalGroups) > 0 {
		addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath)
		if err != nil {
			return err
		}
	}

	// Rather than just erroring out later in setuid(2) and setgid(2), check
	// that the user is mapped here.
	if _, err := config.Config.HostUID(execUser.Uid); err != nil {
		return errors.New("cannot set uid to unmapped user in user namespace")
	}
	if _, err := config.Config.HostGID(execUser.Gid); err != nil {
		return errors.New("cannot set gid to unmapped user in user namespace")
	}

	if config.RootlessEUID {
		// We cannot set any additional groups in a rootless container and thus
		// we bail if the user asked us to do so. TODO: We currently can't do
		// this check earlier, but if libcontainer.Process.User was typesafe
		// this might work.
		if len(addGroups) > 0 {
			return errors.New("cannot set any additional groups in a rootless container")
		}
	}

	// Before we change to the container's user make sure that the processes
	// STDIO is correctly owned by the user that we are switching to.
	if err := fixStdioPermissions(execUser); err != nil {
		return err
	}

	setgroups, err := os.ReadFile("/proc/self/setgroups")
	if err != nil && !os.IsNotExist(err) {
		return err
	}

	// This isn't allowed in an unprivileged user namespace since Linux 3.19.
	// There's nothing we can do about /etc/group entries, so we silently
	// ignore setting groups here (since the user didn't explicitly ask us to
	// set the group).
	allowSupGroups := !config.RootlessEUID && string(bytes.TrimSpace(setgroups)) != "deny"

	if allowSupGroups {
		suppGroups := append(execUser.Sgids, addGroups...)
		if err := unix.Setgroups(suppGroups); err != nil {
			return &os.SyscallError{Syscall: "setgroups", Err: err}
		}
	}

	if err := system.Setgid(execUser.Gid); err != nil {
		return err
	}
	if err := system.Setuid(execUser.Uid); err != nil {
		return err
	}

	// if we didn't get HOME already, set it based on the user's HOME
	if envHome := os.Getenv("HOME"); envHome == "" {
		if err := os.Setenv("HOME", execUser.Home); err != nil {
			return err
		}
	}
	return nil
}

// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user.
// The ownership needs to match because it is created outside of the container and needs to be
// localized.
func fixStdioPermissions(u *user.ExecUser) error {
	var null unix.Stat_t
	if err := unix.Stat("/dev/null", &null); err != nil {
		return &os.PathError{Op: "stat", Path: "/dev/null", Err: err}
	}
	for _, file := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
		var s unix.Stat_t
		if err := unix.Fstat(int(file.Fd()), &s); err != nil {
			return &os.PathError{Op: "fstat", Path: file.Name(), Err: err}
		}

		// Skip chown if uid is already the one we want or any of the STDIO descriptors
		// were redirected to /dev/null.
		if int(s.Uid) == u.Uid || s.Rdev == null.Rdev {
			continue
		}

		// We only change the uid (as it is possible for the mount to
		// prefer a different gid, and there's no reason for us to change it).
		// The reason why we don't just leave the default uid=X mount setup is
		// that users expect to be able to actually use their console. Without
		// this code, you couldn't effectively run as a non-root user inside a
		// container and also have a console set up.
		if err := file.Chown(u.Uid, int(s.Gid)); err != nil {
			// If we've hit an EINVAL then s.Gid isn't mapped in the user
			// namespace. If we've hit an EPERM then the inode's current owner
			// is not mapped in our user namespace (in particular,
			// privileged_wrt_inode_uidgid() has failed). Read-only
			// /dev can result in EROFS error. In any case, it's
			// better for us to just not touch the stdio rather
			// than bail at this point.

			if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) {
				continue
			}
			return err
		}
	}
	return nil
}

// setupNetwork sets up and initializes any network interface inside the container.
func setupNetwork(config *initConfig) error {
	for _, config := range config.Networks {
		strategy, err := getStrategy(config.Type)
		if err != nil {
			return err
		}
		if err := strategy.initialize(config); err != nil {
			return err
		}
	}
	return nil
}

func setupRoute(config *configs.Config) error {
	for _, config := range config.Routes {
		_, dst, err := net.ParseCIDR(config.Destination)
		if err != nil {
			return err
		}
		src := net.ParseIP(config.Source)
		if src == nil {
			return fmt.Errorf("Invalid source for route: %s", config.Source)
		}
		gw := net.ParseIP(config.Gateway)
		if gw == nil {
			return fmt.Errorf("Invalid gateway for route: %s", config.Gateway)
		}
		l, err := netlink.LinkByName(config.InterfaceName)
		if err != nil {
			return err
		}
		route := &netlink.Route{
			Scope:     netlink.SCOPE_UNIVERSE,
			Dst:       dst,
			Src:       src,
			Gw:        gw,
			LinkIndex: l.Attrs().Index,
		}
		if err := netlink.RouteAdd(route); err != nil {
			return err
		}
	}
	return nil
}

func maybeClearRlimitNofileCache(limits []configs.Rlimit) {
	for _, rlimit := range limits {
		if rlimit.Type == syscall.RLIMIT_NOFILE {
			system.ClearRlimitNofileCache(&syscall.Rlimit{
				Cur: rlimit.Soft,
				Max: rlimit.Hard,
			})
			return
		}
	}
}

func setupRlimits(limits []configs.Rlimit, pid int) error {
	for _, rlimit := range limits {
		if err := unix.Prlimit(pid, rlimit.Type, &unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}, nil); err != nil {
			return fmt.Errorf("error setting rlimit type %v: %w", rlimit.Type, err)
		}
	}
	return nil
}

const _P_PID = 1

//nolint:structcheck,unused
type siginfo struct {
	si_signo int32
	si_errno int32
	si_code  int32
	// below here is a union; si_pid is the only field we use
	si_pid int32
	// Pad to 128 bytes as detailed in blockUntilWaitable
	pad [96]byte
}

// isWaitable returns true if the process has exited false otherwise.
// Its based off blockUntilWaitable in src/os/wait_waitid.go
func isWaitable(pid int) (bool, error) {
	si := &siginfo{}
	_, _, e := unix.Syscall6(unix.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), unix.WEXITED|unix.WNOWAIT|unix.WNOHANG, 0, 0)
	if e != 0 {
		return false, &os.SyscallError{Syscall: "waitid", Err: e}
	}

	return si.si_pid != 0, nil
}

// signalAllProcesses freezes then iterates over all the processes inside the
// manager's cgroups sending the signal s to them.
// If s is SIGKILL then it will wait for each process to exit.
// For all other signals it will check if the process is ready to report its
// exit status and only if it is will a wait be performed.
func signalAllProcesses(m cgroups.Manager, s os.Signal) error {
	var procs []*os.Process
	if err := m.Freeze(configs.Frozen); err != nil {
		logrus.Warn(err)
	}
	pids, err := m.GetAllPids()
	if err != nil {
		if err := m.Freeze(configs.Thawed); err != nil {
			logrus.Warn(err)
		}
		return err
	}
	for _, pid := range pids {
		p, err := os.FindProcess(pid)
		if err != nil {
			logrus.Warn(err)
			continue
		}
		procs = append(procs, p)
		if err := p.Signal(s); err != nil {
			logrus.Warn(err)
		}
	}
	if err := m.Freeze(configs.Thawed); err != nil {
		logrus.Warn(err)
	}

	subreaper, err := system.GetSubreaper()
	if err != nil {
		// The error here means that PR_GET_CHILD_SUBREAPER is not
		// supported because this code might run on a kernel older
		// than 3.4. We don't want to throw an error in that case,
		// and we simplify things, considering there is no subreaper
		// set.
		subreaper = 0
	}

	for _, p := range procs {
		if s != unix.SIGKILL {
			if ok, err := isWaitable(p.Pid); err != nil {
				if !errors.Is(err, unix.ECHILD) {
					logrus.Warn("signalAllProcesses: ", p.Pid, err)
				}
				continue
			} else if !ok {
				// Not ready to report so don't wait
				continue
			}
		}

		// In case a subreaper has been setup, this code must not
		// wait for the process. Otherwise, we cannot be sure the
		// current process will be reaped by the subreaper, while
		// the subreaper might be waiting for this process in order
		// to retrieve its exit code.
		if subreaper == 0 {
			if _, err := p.Wait(); err != nil {
				if !errors.Is(err, unix.ECHILD) {
					logrus.Warn("wait: ", err)
				}
			}
		}
	}
	return nil
}
45
vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go
generated
vendored
@@ -1,45 +0,0 @@
package keys

import (
	"errors"
	"fmt"
	"strconv"
	"strings"

	"golang.org/x/sys/unix"
)

type KeySerial uint32

func JoinSessionKeyring(name string) (KeySerial, error) {
	sessKeyID, err := unix.KeyctlJoinSessionKeyring(name)
	if err != nil {
		return 0, fmt.Errorf("unable to create session key: %w", err)
	}
	return KeySerial(sessKeyID), nil
}

// ModKeyringPerm modifies permissions on a keyring by reading the current permissions,
// anding the bits with the given mask (clearing permissions) and setting
// additional permission bits
func ModKeyringPerm(ringID KeySerial, mask, setbits uint32) error {
	dest, err := unix.KeyctlString(unix.KEYCTL_DESCRIBE, int(ringID))
	if err != nil {
		return err
	}

	res := strings.Split(dest, ";")
	if len(res) < 5 {
		return errors.New("Destination buffer for key description is too small")
	}

	// parse permissions
	perm64, err := strconv.ParseUint(res[3], 16, 32)
	if err != nil {
		return err
	}

	perm := (uint32(perm64) & mask) | setbits

	return unix.KeyctlSetperm(int(ringID), perm)
}
56
vendor/github.com/opencontainers/runc/libcontainer/logs/logs.go
generated
vendored
@@ -1,56 +0,0 @@
package logs

import (
	"bufio"
	"encoding/json"
	"io"

	"github.com/sirupsen/logrus"
)

func ForwardLogs(logPipe io.ReadCloser) chan error {
	done := make(chan error, 1)
	s := bufio.NewScanner(logPipe)

	logger := logrus.StandardLogger()
	if logger.ReportCaller {
		// Need a copy of the standard logger, but with ReportCaller
		// turned off, as the logs are merely forwarded and their
		// true source is not this file/line/function.
		logNoCaller := *logrus.StandardLogger()
		logNoCaller.ReportCaller = false
		logger = &logNoCaller
	}

	go func() {
		for s.Scan() {
			processEntry(s.Bytes(), logger)
		}
		if err := logPipe.Close(); err != nil {
			logrus.Errorf("error closing log source: %v", err)
		}
		// The only error we want to return is when reading from
		// logPipe has failed.
		done <- s.Err()
		close(done)
	}()

	return done
}

func processEntry(text []byte, logger *logrus.Logger) {
	if len(text) == 0 {
		return
	}

	var jl struct {
		Level logrus.Level `json:"level"`
		Msg   string       `json:"msg"`
	}
	if err := json.Unmarshal(text, &jl); err != nil {
		logrus.Errorf("failed to decode %q to json: %v", text, err)
		return
	}

	logger.Log(jl.Level, jl.Msg)
}
97
vendor/github.com/opencontainers/runc/libcontainer/message_linux.go
generated
vendored
@@ -1,97 +0,0 @@
package libcontainer

import (
	"fmt"
	"math"

	"github.com/vishvananda/netlink/nl"
	"golang.org/x/sys/unix"
)

// list of known message types we want to send to bootstrap program
// The number is randomly chosen to not conflict with known netlink types
const (
	InitMsg          uint16 = 62000
	CloneFlagsAttr   uint16 = 27281
	NsPathsAttr      uint16 = 27282
	UidmapAttr       uint16 = 27283
	GidmapAttr       uint16 = 27284
	SetgroupAttr     uint16 = 27285
	OomScoreAdjAttr  uint16 = 27286
	RootlessEUIDAttr uint16 = 27287
	UidmapPathAttr   uint16 = 27288
	GidmapPathAttr   uint16 = 27289
	MountSourcesAttr uint16 = 27290
)

type Int32msg struct {
	Type  uint16
	Value uint32
}

// Serialize serializes the message.
// Int32msg has the following representation
// | nlattr len | nlattr type |
// | uint32 value |
func (msg *Int32msg) Serialize() []byte {
	buf := make([]byte, msg.Len())
	native := nl.NativeEndian()
	native.PutUint16(buf[0:2], uint16(msg.Len()))
	native.PutUint16(buf[2:4], msg.Type)
	native.PutUint32(buf[4:8], msg.Value)
	return buf
}

func (msg *Int32msg) Len() int {
	return unix.NLA_HDRLEN + 4
}

// Bytemsg has the following representation
// | nlattr len | nlattr type |
// | value | pad |
type Bytemsg struct {
	Type  uint16
	Value []byte
}

func (msg *Bytemsg) Serialize() []byte {
	l := msg.Len()
	if l > math.MaxUint16 {
		// We cannot return nil nor an error here, so we panic with
		// a specific type instead, which is handled via recover in
		// bootstrapData.
		panic(netlinkError{fmt.Errorf("netlink: cannot serialize bytemsg of length %d (larger than UINT16_MAX)", l)})
	}
	buf := make([]byte, (l+unix.NLA_ALIGNTO-1) & ^(unix.NLA_ALIGNTO-1))
	native := nl.NativeEndian()
	native.PutUint16(buf[0:2], uint16(l))
	native.PutUint16(buf[2:4], msg.Type)
	copy(buf[4:], msg.Value)
	return buf
}

func (msg *Bytemsg) Len() int {
	return unix.NLA_HDRLEN + len(msg.Value) + 1 // null-terminated
}

type Boolmsg struct {
	Type  uint16
	Value bool
}

func (msg *Boolmsg) Serialize() []byte {
	buf := make([]byte, msg.Len())
	native := nl.NativeEndian()
	native.PutUint16(buf[0:2], uint16(msg.Len()))
	native.PutUint16(buf[2:4], msg.Type)
	if msg.Value {
		native.PutUint32(buf[4:8], uint32(1))
	} else {
		native.PutUint32(buf[4:8], uint32(0))
	}
	return buf
}

func (msg *Boolmsg) Len() int {
	return unix.NLA_HDRLEN + 4 // alignment
}
101
vendor/github.com/opencontainers/runc/libcontainer/mount_linux.go
generated
vendored
@@ -1,101 +0,0 @@
package libcontainer

import (
	"io/fs"
	"strconv"

	"golang.org/x/sys/unix"
)

// mountError holds an error from a failed mount or unmount operation.
type mountError struct {
	op     string
	source string
	target string
	procfd string
	flags  uintptr
	data   string
	err    error
}

// Error provides a string error representation.
func (e *mountError) Error() string {
	out := e.op + " "

	if e.source != "" {
		out += e.source + ":" + e.target
	} else {
		out += e.target
	}
	if e.procfd != "" {
		out += " (via " + e.procfd + ")"
	}

	if e.flags != uintptr(0) {
		out += ", flags: 0x" + strconv.FormatUint(uint64(e.flags), 16)
	}
	if e.data != "" {
		out += ", data: " + e.data
	}

	out += ": " + e.err.Error()
	return out
}

// Unwrap returns the underlying error.
// This is a convention used by Go 1.13+ standard library.
func (e *mountError) Unwrap() error {
	return e.err
}

// mount is a simple unix.Mount wrapper. If procfd is not empty, it is used
// instead of target (and the target is only used to add context to an error).
func mount(source, target, procfd, fstype string, flags uintptr, data string) error {
	dst := target
	if procfd != "" {
		dst = procfd
	}
	if err := unix.Mount(source, dst, fstype, flags, data); err != nil {
		return &mountError{
			op:     "mount",
			source: source,
			target: target,
			procfd: procfd,
			flags:  flags,
			data:   data,
			err:    err,
		}
	}
	return nil
}

// unmount is a simple unix.Unmount wrapper.
func unmount(target string, flags int) error {
	err := unix.Unmount(target, flags)
	if err != nil {
		return &mountError{
			op:     "unmount",
			target: target,
			flags:  uintptr(flags),
			err:    err,
		}
	}
	return nil
}

// syscallMode returns the syscall-specific mode bits from Go's portable mode bits.
// Copy from https://cs.opensource.google/go/go/+/refs/tags/go1.20.7:src/os/file_posix.go;l=61-75
func syscallMode(i fs.FileMode) (o uint32) {
	o |= uint32(i.Perm())
	if i&fs.ModeSetuid != 0 {
		o |= unix.S_ISUID
	}
	if i&fs.ModeSetgid != 0 {
		o |= unix.S_ISGID
	}
	if i&fs.ModeSticky != 0 {
		o |= unix.S_ISVTX
	}
	// No mapping for Go's ModeTemporary (plan9 only).
	return
}
100
vendor/github.com/opencontainers/runc/libcontainer/network_linux.go
generated
vendored
@@ -1,100 +0,0 @@
package libcontainer

import (
	"bytes"
	"fmt"
	"os"
	"path/filepath"
	"strconv"

	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/types"
	"github.com/vishvananda/netlink"
)

var strategies = map[string]networkStrategy{
	"loopback": &loopback{},
}

// networkStrategy represents a specific network configuration for
// a container's networking stack.
type networkStrategy interface {
	create(*network, int) error
	initialize(*network) error
	detach(*configs.Network) error
	attach(*configs.Network) error
}

// getStrategy returns the specific network strategy for the
// provided type.
func getStrategy(tpe string) (networkStrategy, error) {
	s, exists := strategies[tpe]
	if !exists {
		return nil, fmt.Errorf("unknown strategy type %q", tpe)
	}
	return s, nil
}

// getNetworkInterfaceStats returns the network statistics for the given network interface.
func getNetworkInterfaceStats(interfaceName string) (*types.NetworkInterface, error) {
	out := &types.NetworkInterface{Name: interfaceName}
	// This can happen if the network runtime information is missing - possible if the
	// container was created by an old version of libcontainer.
	if interfaceName == "" {
		return out, nil
	}
	type netStatsPair struct {
		// Where to write the output.
		Out *uint64
		// The network stats file to read.
		File string
	}
	// Ingress for the host veth comes from the container, so the tx_bytes stat on the
	// host veth is actually the number of bytes received by the container.
	netStats := []netStatsPair{
		{Out: &out.RxBytes, File: "tx_bytes"},
		{Out: &out.RxPackets, File: "tx_packets"},
		{Out: &out.RxErrors, File: "tx_errors"},
		{Out: &out.RxDropped, File: "tx_dropped"},

		{Out: &out.TxBytes, File: "rx_bytes"},
		{Out: &out.TxPackets, File: "rx_packets"},
		{Out: &out.TxErrors, File: "rx_errors"},
		{Out: &out.TxDropped, File: "rx_dropped"},
	}
	for _, netStat := range netStats {
		data, err := readSysfsNetworkStats(interfaceName, netStat.File)
		if err != nil {
			return nil, err
		}
		*(netStat.Out) = data
	}
	return out, nil
}

// readSysfsNetworkStats reads the specified statistic available under
// /sys/class/net/<EthInterface>/statistics.
func readSysfsNetworkStats(ethInterface, statsFile string) (uint64, error) {
	data, err := os.ReadFile(filepath.Join("/sys/class/net", ethInterface, "statistics", statsFile))
	if err != nil {
		return 0, err
	}
	return strconv.ParseUint(string(bytes.TrimSpace(data)), 10, 64)
}

// loopback is a network strategy that provides a basic loopback device.
type loopback struct{}

func (l *loopback) create(n *network, nspid int) error {
	return nil
}

func (l *loopback) initialize(config *network) error {
	return netlink.LinkSetUp(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: "lo"}})
}

func (l *loopback) attach(n *configs.Network) (err error) {
	return nil
}

func (l *loopback) detach(n *configs.Network) (err error) {
	return nil
}
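For reference, a minimal standalone sketch of the sysfs counter reading that getNetworkInterfaceStats builds on; the interface name "eth0" is only an example and must exist under /sys/class/net:

package main

import (
	"bytes"
	"fmt"
	"os"
	"path/filepath"
	"strconv"
)

// readCounter reads a single statistics file such as rx_bytes or tx_bytes
// for the given interface, the same way readSysfsNetworkStats does above.
func readCounter(iface, stat string) (uint64, error) {
	data, err := os.ReadFile(filepath.Join("/sys/class/net", iface, "statistics", stat))
	if err != nil {
		return 0, err
	}
	return strconv.ParseUint(string(bytes.TrimSpace(data)), 10, 64)
}

func main() {
	rx, err := readCounter("eth0", "rx_bytes")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Printf("eth0 received %d bytes\n", rx)
}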
84 vendor/github.com/opencontainers/runc/libcontainer/notify_linux.go (generated, vendored)
@@ -1,84 +0,0 @@
package libcontainer

import (
	"errors"
	"fmt"
	"os"
	"path/filepath"

	"golang.org/x/sys/unix"
)

type PressureLevel uint

const (
	LowPressure PressureLevel = iota
	MediumPressure
	CriticalPressure
)

func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct{}, error) {
	evFile, err := os.Open(filepath.Join(cgDir, evName))
	if err != nil {
		return nil, err
	}
	fd, err := unix.Eventfd(0, unix.EFD_CLOEXEC)
	if err != nil {
		evFile.Close()
		return nil, err
	}

	eventfd := os.NewFile(uintptr(fd), "eventfd")

	eventControlPath := filepath.Join(cgDir, "cgroup.event_control")
	data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg)
	if err := os.WriteFile(eventControlPath, []byte(data), 0o700); err != nil {
		eventfd.Close()
		evFile.Close()
		return nil, err
	}
	ch := make(chan struct{})
	go func() {
		defer func() {
			eventfd.Close()
			evFile.Close()
			close(ch)
		}()
		buf := make([]byte, 8)
		for {
			if _, err := eventfd.Read(buf); err != nil {
				return
			}
			// When a cgroup is destroyed, an event is sent to eventfd.
			// So if the control path is gone, return instead of notifying.
			if _, err := os.Lstat(eventControlPath); os.IsNotExist(err) {
				return
			}
			ch <- struct{}{}
		}
	}()
	return ch, nil
}

// notifyOnOOM returns a channel on which an OOM event can be expected;
// if the process dies without an OOM, the channel is closed.
func notifyOnOOM(dir string) (<-chan struct{}, error) {
	if dir == "" {
		return nil, errors.New("memory controller missing")
	}

	return registerMemoryEvent(dir, "memory.oom_control", "")
}

func notifyMemoryPressure(dir string, level PressureLevel) (<-chan struct{}, error) {
	if dir == "" {
		return nil, errors.New("memory controller missing")
	}

	if level > CriticalPressure {
		return nil, fmt.Errorf("invalid pressure level %d", level)
	}

	levelStr := []string{"low", "medium", "critical"}[level]
	return registerMemoryEvent(dir, "memory.pressure_level", levelStr)
}
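The cgroup v1 notification above boils down to: open the event file, create an eventfd, and register the pair in cgroup.event_control. A condensed standalone sketch of that handshake, assuming a cgroup v1 memory controller mounted at the example path:

package main

import (
	"encoding/binary"
	"fmt"
	"os"
	"path/filepath"

	"golang.org/x/sys/unix"
)

func main() {
	// Example path; requires a cgroup v1 memory controller.
	cgDir := "/sys/fs/cgroup/memory/mycontainer"

	evFile, err := os.Open(filepath.Join(cgDir, "memory.oom_control"))
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	defer evFile.Close()

	fd, err := unix.Eventfd(0, unix.EFD_CLOEXEC)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	eventfd := os.NewFile(uintptr(fd), "eventfd")
	defer eventfd.Close()

	// Register "<eventfd> <event file fd>" so the kernel signals OOM events to us.
	control := fmt.Sprintf("%d %d", eventfd.Fd(), evFile.Fd())
	if err := os.WriteFile(filepath.Join(cgDir, "cgroup.event_control"), []byte(control), 0o700); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}

	// Each event arrives as an 8-byte counter (host byte order; little-endian assumed here).
	buf := make([]byte, 8)
	if _, err := eventfd.Read(buf); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Println("OOM events observed:", binary.LittleEndian.Uint64(buf))
}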
80 vendor/github.com/opencontainers/runc/libcontainer/notify_v2_linux.go (generated, vendored)
@@ -1,80 +0,0 @@
package libcontainer

import (
	"fmt"
	"path/filepath"
	"unsafe"

	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"
)

func registerMemoryEventV2(cgDir, evName, cgEvName string) (<-chan struct{}, error) {
	fd, err := unix.InotifyInit()
	if err != nil {
		return nil, fmt.Errorf("unable to init inotify: %w", err)
	}
	// Watch for OOM kills (reported via evName).
	evFd, err := unix.InotifyAddWatch(fd, filepath.Join(cgDir, evName), unix.IN_MODIFY)
	if err != nil {
		unix.Close(fd)
		return nil, fmt.Errorf("unable to add inotify watch: %w", err)
	}
	// The cgroup filesystem emits no unix.IN_DELETE|unix.IN_DELETE_SELF events,
	// so also watch cgEvName to notice when all processes have exited.
	cgFd, err := unix.InotifyAddWatch(fd, filepath.Join(cgDir, cgEvName), unix.IN_MODIFY)
	if err != nil {
		unix.Close(fd)
		return nil, fmt.Errorf("unable to add inotify watch: %w", err)
	}
	ch := make(chan struct{})
	go func() {
		var (
			buffer [unix.SizeofInotifyEvent + unix.PathMax + 1]byte
			offset uint32
		)
		defer func() {
			unix.Close(fd)
			close(ch)
		}()

		for {
			n, err := unix.Read(fd, buffer[:])
			if err != nil {
				logrus.Warnf("unable to read event data from inotify, got error: %v", err)
				return
			}
			if n < unix.SizeofInotifyEvent {
				logrus.Warnf("we should read at least %d bytes from inotify, but got %d bytes.", unix.SizeofInotifyEvent, n)
				return
			}
			offset = 0
			for offset <= uint32(n-unix.SizeofInotifyEvent) {
				rawEvent := (*unix.InotifyEvent)(unsafe.Pointer(&buffer[offset]))
				offset += unix.SizeofInotifyEvent + rawEvent.Len
				if rawEvent.Mask&unix.IN_MODIFY != unix.IN_MODIFY {
					continue
				}
				switch int(rawEvent.Wd) {
				case evFd:
					oom, err := fscommon.GetValueByKey(cgDir, evName, "oom_kill")
					if err != nil || oom > 0 {
						ch <- struct{}{}
					}
				case cgFd:
					pids, err := fscommon.GetValueByKey(cgDir, cgEvName, "populated")
					if err != nil || pids == 0 {
						return
					}
				}
			}
		}
	}()
	return ch, nil
}

// notifyOnOOMV2 returns a channel on which an OOM event can be expected;
// if the process dies without an OOM, the channel is closed.
func notifyOnOOMV2(path string) (<-chan struct{}, error) {
	return registerMemoryEventV2(path, "memory.events", "cgroup.events")
}
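The fscommon.GetValueByKey call used by the v2 watcher above is a plain key/value scan of a flat cgroup file. A minimal standalone sketch of reading the oom_kill counter from memory.events; the cgroup path is an example:

package main

import (
	"bufio"
	"fmt"
	"os"
	"strconv"
	"strings"
)

// readKey scans a flat "key value" cgroup v2 file such as memory.events or
// cgroup.events and returns the numeric value for the requested key.
func readKey(path, key string) (uint64, error) {
	f, err := os.Open(path)
	if err != nil {
		return 0, err
	}
	defer f.Close()

	sc := bufio.NewScanner(f)
	for sc.Scan() {
		fields := strings.Fields(sc.Text())
		if len(fields) == 2 && fields[0] == key {
			return strconv.ParseUint(fields[1], 10, 64)
		}
	}
	if err := sc.Err(); err != nil {
		return 0, err
	}
	return 0, fmt.Errorf("key %q not found in %s", key, path)
}

func main() {
	oomKills, err := readKey("/sys/fs/cgroup/mycontainer/memory.events", "oom_kill")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Println("oom_kill:", oomKills)
}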
126 vendor/github.com/opencontainers/runc/libcontainer/process.go (generated, vendored)
@@ -1,126 +0,0 @@
package libcontainer

import (
	"errors"
	"io"
	"math"
	"os"

	"github.com/opencontainers/runc/libcontainer/configs"
)

var errInvalidProcess = errors.New("invalid process")

type processOperations interface {
	wait() (*os.ProcessState, error)
	signal(sig os.Signal) error
	pid() int
}

// Process specifies the configuration and IO for a process inside
// a container.
type Process struct {
	// The command to be run followed by any arguments.
	Args []string

	// Env specifies the environment variables for the process.
	Env []string

	// User will set the uid and gid of the executing process running inside the container
	// local to the container's user and group configuration.
	User string

	// AdditionalGroups specifies the gids that should be added to supplementary groups
	// in addition to those that the user belongs to.
	AdditionalGroups []string

	// Cwd will change the process's current working directory inside the container's rootfs.
	Cwd string

	// Stdin is a pointer to a reader which provides the standard input stream.
	Stdin io.Reader

	// Stdout is a pointer to a writer which receives the standard output stream.
	Stdout io.Writer

	// Stderr is a pointer to a writer which receives the standard error stream.
	Stderr io.Writer

	// ExtraFiles specifies additional open files to be inherited by the container.
	ExtraFiles []*os.File

	// Initial sizing for the console.
	ConsoleWidth  uint16
	ConsoleHeight uint16

	// Capabilities specify the capabilities to keep when executing the process inside the container.
	// All capabilities not specified will be dropped from the process's capability mask.
	Capabilities *configs.Capabilities

	// AppArmorProfile specifies the profile to apply to the process and is
	// changed at the time the process is execed.
	AppArmorProfile string

	// Label specifies the label to apply to the process. It is commonly used by SELinux.
	Label string

	// NoNewPrivileges controls whether processes can gain additional privileges.
	NoNewPrivileges *bool

	// Rlimits specifies the resource limits, such as max open files, to set in the container.
	// If Rlimits are not set, the container will inherit rlimits from the parent process.
	Rlimits []configs.Rlimit

	// ConsoleSocket provides the masterfd console.
	ConsoleSocket *os.File

	// Init specifies whether the process is the first process in the container.
	Init bool

	ops processOperations

	LogLevel string

	// SubCgroupPaths specifies sub-cgroups to run the process in.
	// Map keys are controller names, map values are paths (relative to
	// container's top-level cgroup).
	//
	// If empty, the default top-level container's cgroup is used.
	//
	// For cgroup v2, the only key allowed is "".
	SubCgroupPaths map[string]string
}

// Wait waits for the process to exit.
// Wait releases any resources associated with the Process.
func (p Process) Wait() (*os.ProcessState, error) {
	if p.ops == nil {
		return nil, errInvalidProcess
	}
	return p.ops.wait()
}

// Pid returns the process ID.
func (p Process) Pid() (int, error) {
	// math.MinInt32 is returned here because it is an invalid value
	// for the kill() system call.
	if p.ops == nil {
		return math.MinInt32, errInvalidProcess
	}
	return p.ops.pid(), nil
}

// Signal sends a signal to the Process.
func (p Process) Signal(sig os.Signal) error {
	if p.ops == nil {
		return errInvalidProcess
	}
	return p.ops.signal(sig)
}

// IO holds the process's STDIO.
type IO struct {
	Stdin  io.WriteCloser
	Stdout io.ReadCloser
	Stderr io.ReadCloser
}
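For orientation, a minimal sketch of how a caller typically fills in a Process value; actually running it requires a container created through the factory API (for example via container.Run), which is outside this file, and the module import is assumed to be available in go.mod:

package main

import (
	"fmt"
	"os"

	"github.com/opencontainers/runc/libcontainer"
)

func main() {
	// Example values only; a real caller wires this into container.Run(proc)
	// and then waits on it with proc.Wait().
	proc := &libcontainer.Process{
		Args:   []string{"/bin/sh", "-c", "echo hello"},
		Env:    []string{"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"},
		User:   "root",
		Cwd:    "/",
		Stdin:  os.Stdin,
		Stdout: os.Stdout,
		Stderr: os.Stderr,
		Init:   true,
	}
	fmt.Println("configured process args:", proc.Args)
}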
823 vendor/github.com/opencontainers/runc/libcontainer/process_linux.go (generated, vendored)
@@ -1,823 +0,0 @@
|
||||
package libcontainer
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/cgroups"
|
||||
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
"github.com/opencontainers/runc/libcontainer/intelrdt"
|
||||
"github.com/opencontainers/runc/libcontainer/logs"
|
||||
"github.com/opencontainers/runc/libcontainer/system"
|
||||
"github.com/opencontainers/runc/libcontainer/utils"
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/sirupsen/logrus"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
type parentProcess interface {
|
||||
// pid returns the pid for the running process.
|
||||
pid() int
|
||||
|
||||
// start starts the process execution.
|
||||
start() error
|
||||
|
||||
// send a SIGKILL to the process and wait for the exit.
|
||||
terminate() error
|
||||
|
||||
// wait waits on the process returning the process state.
|
||||
wait() (*os.ProcessState, error)
|
||||
|
||||
// startTime returns the process start time.
|
||||
startTime() (uint64, error)
|
||||
signal(os.Signal) error
|
||||
externalDescriptors() []string
|
||||
setExternalDescriptors(fds []string)
|
||||
forwardChildLogs() chan error
|
||||
}
|
||||
|
||||
type filePair struct {
|
||||
parent *os.File
|
||||
child *os.File
|
||||
}
|
||||
|
||||
type setnsProcess struct {
|
||||
cmd *exec.Cmd
|
||||
messageSockPair filePair
|
||||
logFilePair filePair
|
||||
cgroupPaths map[string]string
|
||||
rootlessCgroups bool
|
||||
manager cgroups.Manager
|
||||
intelRdtPath string
|
||||
config *initConfig
|
||||
fds []string
|
||||
process *Process
|
||||
bootstrapData io.Reader
|
||||
initProcessPid int
|
||||
}
|
||||
|
||||
func (p *setnsProcess) startTime() (uint64, error) {
|
||||
stat, err := system.Stat(p.pid())
|
||||
return stat.StartTime, err
|
||||
}
|
||||
|
||||
func (p *setnsProcess) signal(sig os.Signal) error {
|
||||
s, ok := sig.(unix.Signal)
|
||||
if !ok {
|
||||
return errors.New("os: unsupported signal type")
|
||||
}
|
||||
return unix.Kill(p.pid(), s)
|
||||
}
|
||||
|
||||
func (p *setnsProcess) start() (retErr error) {
|
||||
defer p.messageSockPair.parent.Close()
|
||||
// get the "before" value of oom kill count
|
||||
oom, _ := p.manager.OOMKillCount()
|
||||
err := p.cmd.Start()
|
||||
// close the write-side of the pipes (controlled by child)
|
||||
p.messageSockPair.child.Close()
|
||||
p.logFilePair.child.Close()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error starting setns process: %w", err)
|
||||
}
|
||||
|
||||
waitInit := initWaiter(p.messageSockPair.parent)
|
||||
defer func() {
|
||||
if retErr != nil {
|
||||
if newOom, err := p.manager.OOMKillCount(); err == nil && newOom != oom {
|
||||
// Someone in this cgroup was killed, this _might_ be us.
|
||||
retErr = fmt.Errorf("%w (possibly OOM-killed)", retErr)
|
||||
}
|
||||
werr := <-waitInit
|
||||
if werr != nil {
|
||||
logrus.WithError(werr).Warn()
|
||||
}
|
||||
err := ignoreTerminateErrors(p.terminate())
|
||||
if err != nil {
|
||||
logrus.WithError(err).Warn("unable to terminate setnsProcess")
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
if p.bootstrapData != nil {
|
||||
if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
|
||||
return fmt.Errorf("error copying bootstrap data to pipe: %w", err)
|
||||
}
|
||||
}
|
||||
err = <-waitInit
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := p.execSetns(); err != nil {
|
||||
return fmt.Errorf("error executing setns process: %w", err)
|
||||
}
|
||||
for _, path := range p.cgroupPaths {
|
||||
if err := cgroups.WriteCgroupProc(path, p.pid()); err != nil && !p.rootlessCgroups {
|
||||
// On cgroup v2 + nesting + domain controllers, WriteCgroupProc may fail with EBUSY.
|
||||
// https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643
|
||||
// Try to join the cgroup of InitProcessPid.
|
||||
if cgroups.IsCgroup2UnifiedMode() && p.initProcessPid != 0 {
|
||||
initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid)
|
||||
initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile)
|
||||
if initCgErr == nil {
|
||||
if initCgPath, ok := initCg[""]; ok {
|
||||
initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath)
|
||||
logrus.Debugf("adding pid %d to cgroups %v failed (%v), attempting to join %q (obtained from %s)",
|
||||
p.pid(), p.cgroupPaths, err, initCg, initCgDirpath)
|
||||
// NOTE: initCgDirPath is not guaranteed to exist because we didn't pause the container.
|
||||
err = cgroups.WriteCgroupProc(initCgDirpath, p.pid())
|
||||
}
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
|
||||
}
|
||||
}
|
||||
}
|
||||
if p.intelRdtPath != "" {
|
||||
// if Intel RDT "resource control" filesystem path exists
|
||||
_, err := os.Stat(p.intelRdtPath)
|
||||
if err == nil {
|
||||
if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil {
|
||||
return fmt.Errorf("error adding pid %d to Intel RDT: %w", p.pid(), err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err := utils.WriteJSON(p.messageSockPair.parent, p.config); err != nil {
|
||||
return fmt.Errorf("error writing config to pipe: %w", err)
|
||||
}
|
||||
|
||||
ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
|
||||
switch sync.Type {
|
||||
case procReady:
|
||||
// Set rlimits, this has to be done here because we lose permissions
|
||||
// to raise the limits once we enter a user-namespace
|
||||
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
|
||||
return fmt.Errorf("error setting rlimits for ready process: %w", err)
|
||||
}
|
||||
|
||||
// Sync with child.
|
||||
return writeSync(p.messageSockPair.parent, procRun)
|
||||
case procHooks:
|
||||
// This shouldn't happen.
|
||||
panic("unexpected procHooks in setns")
|
||||
case procSeccomp:
|
||||
if p.config.Config.Seccomp.ListenerPath == "" {
|
||||
return errors.New("listenerPath is not set")
|
||||
}
|
||||
|
||||
seccompFd, err := recvSeccompFd(uintptr(p.pid()), uintptr(sync.Fd))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer unix.Close(seccompFd)
|
||||
|
||||
bundle, annotations := utils.Annotations(p.config.Config.Labels)
|
||||
containerProcessState := &specs.ContainerProcessState{
|
||||
Version: specs.Version,
|
||||
Fds: []string{specs.SeccompFdName},
|
||||
Pid: p.cmd.Process.Pid,
|
||||
Metadata: p.config.Config.Seccomp.ListenerMetadata,
|
||||
State: specs.State{
|
||||
Version: specs.Version,
|
||||
ID: p.config.ContainerId,
|
||||
Status: specs.StateRunning,
|
||||
Pid: p.initProcessPid,
|
||||
Bundle: bundle,
|
||||
Annotations: annotations,
|
||||
},
|
||||
}
|
||||
if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath,
|
||||
containerProcessState, seccompFd); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Sync with child.
|
||||
if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
default:
|
||||
return errors.New("invalid JSON payload from child")
|
||||
}
|
||||
})
|
||||
|
||||
if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
|
||||
return &os.PathError{Op: "shutdown", Path: "(init pipe)", Err: err}
|
||||
}
|
||||
// Must be done after Shutdown so the child will exit and we can wait for it.
|
||||
if ierr != nil {
|
||||
_, _ = p.wait()
|
||||
return ierr
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// execSetns runs the process that executes C code to perform the setns calls
|
||||
// because setns support requires the C process to fork off a child and perform the setns
|
||||
// before the go runtime boots, we wait on the process to die and receive the child's pid
|
||||
// over the provided pipe.
|
||||
func (p *setnsProcess) execSetns() error {
|
||||
status, err := p.cmd.Process.Wait()
|
||||
if err != nil {
|
||||
_ = p.cmd.Wait()
|
||||
return fmt.Errorf("error waiting on setns process to finish: %w", err)
|
||||
}
|
||||
if !status.Success() {
|
||||
_ = p.cmd.Wait()
|
||||
return &exec.ExitError{ProcessState: status}
|
||||
}
|
||||
var pid *pid
|
||||
if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil {
|
||||
_ = p.cmd.Wait()
|
||||
return fmt.Errorf("error reading pid from init pipe: %w", err)
|
||||
}
|
||||
|
||||
// Clean up the zombie parent process
|
||||
// On Unix systems FindProcess always succeeds.
|
||||
firstChildProcess, _ := os.FindProcess(pid.PidFirstChild)
|
||||
|
||||
// Ignore the error in case the child has already been reaped for any reason
|
||||
_, _ = firstChildProcess.Wait()
|
||||
|
||||
process, err := os.FindProcess(pid.Pid)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
p.cmd.Process = process
|
||||
p.process.ops = p
|
||||
return nil
|
||||
}
|
||||
|
||||
// terminate sends a SIGKILL to the forked process for the setns routine then waits to
|
||||
// avoid the process becoming a zombie.
|
||||
func (p *setnsProcess) terminate() error {
|
||||
if p.cmd.Process == nil {
|
||||
return nil
|
||||
}
|
||||
err := p.cmd.Process.Kill()
|
||||
if _, werr := p.wait(); err == nil {
|
||||
err = werr
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func (p *setnsProcess) wait() (*os.ProcessState, error) {
|
||||
err := p.cmd.Wait()
|
||||
|
||||
// Return actual ProcessState even on Wait error
|
||||
return p.cmd.ProcessState, err
|
||||
}
|
||||
|
||||
func (p *setnsProcess) pid() int {
|
||||
return p.cmd.Process.Pid
|
||||
}
|
||||
|
||||
func (p *setnsProcess) externalDescriptors() []string {
|
||||
return p.fds
|
||||
}
|
||||
|
||||
func (p *setnsProcess) setExternalDescriptors(newFds []string) {
|
||||
p.fds = newFds
|
||||
}
|
||||
|
||||
func (p *setnsProcess) forwardChildLogs() chan error {
|
||||
return logs.ForwardLogs(p.logFilePair.parent)
|
||||
}
|
||||
|
||||
type initProcess struct {
|
||||
cmd *exec.Cmd
|
||||
messageSockPair filePair
|
||||
logFilePair filePair
|
||||
config *initConfig
|
||||
manager cgroups.Manager
|
||||
intelRdtManager *intelrdt.Manager
|
||||
container *linuxContainer
|
||||
fds []string
|
||||
process *Process
|
||||
bootstrapData io.Reader
|
||||
sharePidns bool
|
||||
}
|
||||
|
||||
func (p *initProcess) pid() int {
|
||||
return p.cmd.Process.Pid
|
||||
}
|
||||
|
||||
func (p *initProcess) externalDescriptors() []string {
|
||||
return p.fds
|
||||
}
|
||||
|
||||
// getChildPid receives the final child's pid over the provided pipe.
|
||||
func (p *initProcess) getChildPid() (int, error) {
|
||||
var pid pid
|
||||
if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil {
|
||||
_ = p.cmd.Wait()
|
||||
return -1, err
|
||||
}
|
||||
|
||||
// Clean up the zombie parent process
|
||||
// On Unix systems FindProcess always succeeds.
|
||||
firstChildProcess, _ := os.FindProcess(pid.PidFirstChild)
|
||||
|
||||
// Ignore the error in case the child has already been reaped for any reason
|
||||
_, _ = firstChildProcess.Wait()
|
||||
|
||||
return pid.Pid, nil
|
||||
}
|
||||
|
||||
func (p *initProcess) waitForChildExit(childPid int) error {
|
||||
status, err := p.cmd.Process.Wait()
|
||||
if err != nil {
|
||||
_ = p.cmd.Wait()
|
||||
return err
|
||||
}
|
||||
if !status.Success() {
|
||||
_ = p.cmd.Wait()
|
||||
return &exec.ExitError{ProcessState: status}
|
||||
}
|
||||
|
||||
process, err := os.FindProcess(childPid)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
p.cmd.Process = process
|
||||
p.process.ops = p
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *initProcess) start() (retErr error) {
|
||||
defer p.messageSockPair.parent.Close() //nolint: errcheck
|
||||
err := p.cmd.Start()
|
||||
p.process.ops = p
|
||||
// close the write-side of the pipes (controlled by child)
|
||||
_ = p.messageSockPair.child.Close()
|
||||
_ = p.logFilePair.child.Close()
|
||||
if err != nil {
|
||||
p.process.ops = nil
|
||||
return fmt.Errorf("unable to start init: %w", err)
|
||||
}
|
||||
|
||||
waitInit := initWaiter(p.messageSockPair.parent)
|
||||
defer func() {
|
||||
if retErr != nil {
|
||||
// Find out if init is killed by the kernel's OOM killer.
|
||||
// Get the count before killing init as otherwise cgroup
|
||||
// might be removed by systemd.
|
||||
oom, err := p.manager.OOMKillCount()
|
||||
if err != nil {
|
||||
logrus.WithError(err).Warn("unable to get oom kill count")
|
||||
} else if oom > 0 {
|
||||
// Does not matter what the particular error was,
|
||||
// its cause is most probably OOM, so report that.
|
||||
const oomError = "container init was OOM-killed (memory limit too low?)"
|
||||
|
||||
if logrus.GetLevel() >= logrus.DebugLevel {
|
||||
// Only show the original error if debug is set,
|
||||
// as it is not generally very useful.
|
||||
retErr = fmt.Errorf(oomError+": %w", retErr)
|
||||
} else {
|
||||
retErr = errors.New(oomError)
|
||||
}
|
||||
}
|
||||
|
||||
werr := <-waitInit
|
||||
if werr != nil {
|
||||
logrus.WithError(werr).Warn()
|
||||
}
|
||||
|
||||
// Terminate the process to ensure we can remove cgroups.
|
||||
if err := ignoreTerminateErrors(p.terminate()); err != nil {
|
||||
logrus.WithError(err).Warn("unable to terminate initProcess")
|
||||
}
|
||||
|
||||
_ = p.manager.Destroy()
|
||||
if p.intelRdtManager != nil {
|
||||
_ = p.intelRdtManager.Destroy()
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// Do this before syncing with child so that no children can escape the
|
||||
// cgroup. We don't need to worry about not doing this and not being root
|
||||
// because we'd be using the rootless cgroup manager in that case.
|
||||
if err := p.manager.Apply(p.pid()); err != nil {
|
||||
return fmt.Errorf("unable to apply cgroup configuration: %w", err)
|
||||
}
|
||||
if p.intelRdtManager != nil {
|
||||
if err := p.intelRdtManager.Apply(p.pid()); err != nil {
|
||||
return fmt.Errorf("unable to apply Intel RDT configuration: %w", err)
|
||||
}
|
||||
}
|
||||
if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
|
||||
return fmt.Errorf("can't copy bootstrap data to pipe: %w", err)
|
||||
}
|
||||
err = <-waitInit
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
childPid, err := p.getChildPid()
|
||||
if err != nil {
|
||||
return fmt.Errorf("can't get final child's PID from pipe: %w", err)
|
||||
}
|
||||
|
||||
// Save the standard descriptor names before the container process
|
||||
// can potentially move them (e.g., via dup2()). If we don't do this now,
|
||||
// we won't know at checkpoint time which file descriptor to look up.
|
||||
fds, err := getPipeFds(childPid)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error getting pipe fds for pid %d: %w", childPid, err)
|
||||
}
|
||||
p.setExternalDescriptors(fds)
|
||||
|
||||
// Wait for our first child to exit
|
||||
if err := p.waitForChildExit(childPid); err != nil {
|
||||
return fmt.Errorf("error waiting for our first child to exit: %w", err)
|
||||
}
|
||||
|
||||
if err := p.createNetworkInterfaces(); err != nil {
|
||||
return fmt.Errorf("error creating network interfaces: %w", err)
|
||||
}
|
||||
if err := p.updateSpecState(); err != nil {
|
||||
return fmt.Errorf("error updating spec state: %w", err)
|
||||
}
|
||||
if err := p.sendConfig(); err != nil {
|
||||
return fmt.Errorf("error sending config to init process: %w", err)
|
||||
}
|
||||
var (
|
||||
sentRun bool
|
||||
sentResume bool
|
||||
)
|
||||
|
||||
ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
|
||||
switch sync.Type {
|
||||
case procSeccomp:
|
||||
if p.config.Config.Seccomp.ListenerPath == "" {
|
||||
return errors.New("listenerPath is not set")
|
||||
}
|
||||
|
||||
seccompFd, err := recvSeccompFd(uintptr(childPid), uintptr(sync.Fd))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer unix.Close(seccompFd)
|
||||
|
||||
s, err := p.container.currentOCIState()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// initProcessStartTime hasn't been set yet.
|
||||
s.Pid = p.cmd.Process.Pid
|
||||
s.Status = specs.StateCreating
|
||||
containerProcessState := &specs.ContainerProcessState{
|
||||
Version: specs.Version,
|
||||
Fds: []string{specs.SeccompFdName},
|
||||
Pid: s.Pid,
|
||||
Metadata: p.config.Config.Seccomp.ListenerMetadata,
|
||||
State: *s,
|
||||
}
|
||||
if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath,
|
||||
containerProcessState, seccompFd); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Sync with child.
|
||||
if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
|
||||
return err
|
||||
}
|
||||
case procReady:
|
||||
// Set rlimits, this has to be done here because we lose permissions
|
||||
// to raise the limits once we enter a user-namespace
|
||||
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
|
||||
return fmt.Errorf("error setting rlimits for ready process: %w", err)
|
||||
}
|
||||
// call prestart and CreateRuntime hooks
|
||||
if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
|
||||
// Setup cgroup before the hook, so that the prestart and CreateRuntime hook could apply cgroup permissions.
|
||||
if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil {
|
||||
return fmt.Errorf("error setting cgroup config for ready process: %w", err)
|
||||
}
|
||||
if p.intelRdtManager != nil {
|
||||
if err := p.intelRdtManager.Set(p.config.Config); err != nil {
|
||||
return fmt.Errorf("error setting Intel RDT config for ready process: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
if len(p.config.Config.Hooks) != 0 {
|
||||
s, err := p.container.currentOCIState()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// initProcessStartTime hasn't been set yet.
|
||||
s.Pid = p.cmd.Process.Pid
|
||||
s.Status = specs.StateCreating
|
||||
hooks := p.config.Config.Hooks
|
||||
|
||||
if err := hooks[configs.Prestart].RunHooks(s); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// generate a timestamp indicating when the container was started
|
||||
p.container.created = time.Now().UTC()
|
||||
p.container.state = &createdState{
|
||||
c: p.container,
|
||||
}
|
||||
|
||||
// NOTE: If the procRun state has been synced and the
|
||||
// runc-create process has been killed for some reason,
|
||||
// the runc-init[2:stage] process will be leaky. And
|
||||
// the runc command also fails to parse root directory
|
||||
// because the container doesn't have state.json.
|
||||
//
|
||||
// In order to cleanup the runc-init[2:stage] by
|
||||
// runc-delete/stop, we should store the status before
|
||||
// procRun sync.
|
||||
state, uerr := p.container.updateState(p)
|
||||
if uerr != nil {
|
||||
return fmt.Errorf("unable to store init state: %w", err)
|
||||
}
|
||||
p.container.initProcessStartTime = state.InitProcessStartTime
|
||||
|
||||
// Sync with child.
|
||||
if err := writeSync(p.messageSockPair.parent, procRun); err != nil {
|
||||
return err
|
||||
}
|
||||
sentRun = true
|
||||
case procHooks:
|
||||
// Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
|
||||
if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil {
|
||||
return fmt.Errorf("error setting cgroup config for procHooks process: %w", err)
|
||||
}
|
||||
if p.intelRdtManager != nil {
|
||||
if err := p.intelRdtManager.Set(p.config.Config); err != nil {
|
||||
return fmt.Errorf("error setting Intel RDT config for procHooks process: %w", err)
|
||||
}
|
||||
}
|
||||
if len(p.config.Config.Hooks) != 0 {
|
||||
s, err := p.container.currentOCIState()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// initProcessStartTime hasn't been set yet.
|
||||
s.Pid = p.cmd.Process.Pid
|
||||
s.Status = specs.StateCreating
|
||||
hooks := p.config.Config.Hooks
|
||||
|
||||
if err := hooks[configs.Prestart].RunHooks(s); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
// Sync with child.
|
||||
if err := writeSync(p.messageSockPair.parent, procResume); err != nil {
|
||||
return err
|
||||
}
|
||||
sentResume = true
|
||||
default:
|
||||
return errors.New("invalid JSON payload from child")
|
||||
}
|
||||
|
||||
return nil
|
||||
})
|
||||
|
||||
if !sentRun {
|
||||
return fmt.Errorf("error during container init: %w", ierr)
|
||||
}
|
||||
if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
|
||||
return errors.New("could not synchronise after executing prestart and CreateRuntime hooks with container process")
|
||||
}
|
||||
if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
|
||||
return &os.PathError{Op: "shutdown", Path: "(init pipe)", Err: err}
|
||||
}
|
||||
|
||||
// Must be done after Shutdown so the child will exit and we can wait for it.
|
||||
if ierr != nil {
|
||||
_, _ = p.wait()
|
||||
return ierr
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *initProcess) wait() (*os.ProcessState, error) {
|
||||
err := p.cmd.Wait()
|
||||
// we should kill all processes in cgroup when init is died if we use host PID namespace
|
||||
if p.sharePidns {
|
||||
_ = signalAllProcesses(p.manager, unix.SIGKILL)
|
||||
}
|
||||
return p.cmd.ProcessState, err
|
||||
}
|
||||
|
||||
func (p *initProcess) terminate() error {
|
||||
if p.cmd.Process == nil {
|
||||
return nil
|
||||
}
|
||||
err := p.cmd.Process.Kill()
|
||||
if _, werr := p.wait(); err == nil {
|
||||
err = werr
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func (p *initProcess) startTime() (uint64, error) {
|
||||
stat, err := system.Stat(p.pid())
|
||||
return stat.StartTime, err
|
||||
}
|
||||
|
||||
func (p *initProcess) updateSpecState() error {
|
||||
s, err := p.container.currentOCIState()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
p.config.SpecState = s
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *initProcess) sendConfig() error {
|
||||
// send the config to the container's init process, we don't use JSON Encode
|
||||
// here because there might be a problem in JSON decoder in some cases, see:
|
||||
// https://github.com/docker/docker/issues/14203#issuecomment-174177790
|
||||
return utils.WriteJSON(p.messageSockPair.parent, p.config)
|
||||
}
|
||||
|
||||
func (p *initProcess) createNetworkInterfaces() error {
|
||||
for _, config := range p.config.Config.Networks {
|
||||
strategy, err := getStrategy(config.Type)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
n := &network{
|
||||
Network: *config,
|
||||
}
|
||||
if err := strategy.create(n, p.pid()); err != nil {
|
||||
return err
|
||||
}
|
||||
p.config.Networks = append(p.config.Networks, n)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *initProcess) signal(sig os.Signal) error {
|
||||
s, ok := sig.(unix.Signal)
|
||||
if !ok {
|
||||
return errors.New("os: unsupported signal type")
|
||||
}
|
||||
return unix.Kill(p.pid(), s)
|
||||
}
|
||||
|
||||
func (p *initProcess) setExternalDescriptors(newFds []string) {
|
||||
p.fds = newFds
|
||||
}
|
||||
|
||||
func (p *initProcess) forwardChildLogs() chan error {
|
||||
return logs.ForwardLogs(p.logFilePair.parent)
|
||||
}
|
||||
|
||||
func recvSeccompFd(childPid, childFd uintptr) (int, error) {
	pidfd, _, errno := unix.Syscall(unix.SYS_PIDFD_OPEN, childPid, 0, 0)
	if errno != 0 {
		return -1, fmt.Errorf("performing SYS_PIDFD_OPEN syscall: %w", errno)
	}
	defer unix.Close(int(pidfd))

	seccompFd, _, errno := unix.Syscall(unix.SYS_PIDFD_GETFD, pidfd, childFd, 0)
	if errno != 0 {
		return -1, fmt.Errorf("performing SYS_PIDFD_GETFD syscall: %w", errno)
	}

	return int(seccompFd), nil
}
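The fd-plucking trick above is generic: pidfd_open returns a handle to the target process and pidfd_getfd duplicates one of its descriptors into the caller. A standalone sketch against the current process (duplicating our own stdin, which normally needs no extra privileges, though ptrace restrictions can still apply):

package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

func main() {
	// Open a pidfd for ourselves; in runc this would be the child's pid.
	pidfd, _, errno := unix.Syscall(unix.SYS_PIDFD_OPEN, uintptr(os.Getpid()), 0, 0)
	if errno != 0 {
		fmt.Fprintln(os.Stderr, "pidfd_open:", errno)
		os.Exit(1)
	}
	defer unix.Close(int(pidfd))

	// Duplicate fd 0 (stdin) of the target process into this process.
	newFd, _, errno := unix.Syscall(unix.SYS_PIDFD_GETFD, pidfd, 0, 0)
	if errno != 0 {
		fmt.Fprintln(os.Stderr, "pidfd_getfd:", errno)
		os.Exit(1)
	}
	defer unix.Close(int(newFd))

	fmt.Println("duplicated stdin as fd", newFd)
}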
|
||||
func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, fd int) error {
|
||||
conn, err := net.Dial("unix", listenerPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to connect with seccomp agent specified in the seccomp profile: %w", err)
|
||||
}
|
||||
|
||||
socket, err := conn.(*net.UnixConn).File()
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot get seccomp socket: %w", err)
|
||||
}
|
||||
defer socket.Close()
|
||||
|
||||
b, err := json.Marshal(state)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot marshall seccomp state: %w", err)
|
||||
}
|
||||
|
||||
err = utils.SendFds(socket, b, fd)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot send seccomp fd to %s: %w", listenerPath, err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func getPipeFds(pid int) ([]string, error) {
|
||||
fds := make([]string, 3)
|
||||
|
||||
dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
|
||||
for i := 0; i < 3; i++ {
|
||||
// XXX: This breaks if the path is not a valid symlink (which can
|
||||
// happen in certain particularly unlucky mount namespace setups).
|
||||
f := filepath.Join(dirPath, strconv.Itoa(i))
|
||||
target, err := os.Readlink(f)
|
||||
if err != nil {
|
||||
// Ignore permission errors, for rootless containers and other
|
||||
// non-dumpable processes. if we can't get the fd for a particular
|
||||
// file, there's not much we can do.
|
||||
if os.IsPermission(err) {
|
||||
continue
|
||||
}
|
||||
return fds, err
|
||||
}
|
||||
fds[i] = target
|
||||
}
|
||||
return fds, nil
|
||||
}
|
||||
|
||||
// InitializeIO creates pipes for use with the process's stdio and returns the
|
||||
// opposite side for each. Do not use this if you want to have a pseudoterminal
|
||||
// set up for you by libcontainer (TODO: fix that too).
|
||||
// TODO: This is mostly unnecessary, and should be handled by clients.
|
||||
func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
|
||||
var fds []uintptr
|
||||
i = &IO{}
|
||||
// cleanup in case of an error
|
||||
defer func() {
|
||||
if err != nil {
|
||||
for _, fd := range fds {
|
||||
_ = unix.Close(int(fd))
|
||||
}
|
||||
}
|
||||
}()
|
||||
// STDIN
|
||||
r, w, err := os.Pipe()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
fds = append(fds, r.Fd(), w.Fd())
|
||||
p.Stdin, i.Stdin = r, w
|
||||
// STDOUT
|
||||
if r, w, err = os.Pipe(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
fds = append(fds, r.Fd(), w.Fd())
|
||||
p.Stdout, i.Stdout = w, r
|
||||
// STDERR
|
||||
if r, w, err = os.Pipe(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
fds = append(fds, r.Fd(), w.Fd())
|
||||
p.Stderr, i.Stderr = w, r
|
||||
// change ownership of the pipes in case we are in a user namespace
|
||||
for _, fd := range fds {
|
||||
if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil {
|
||||
return nil, &os.PathError{Op: "fchown", Path: "fd " + strconv.Itoa(int(fd)), Err: err}
|
||||
}
|
||||
}
|
||||
return i, nil
|
||||
}
|
||||
|
||||
// initWaiter returns a channel to wait on for making sure
|
||||
// runc init has finished the initial setup.
|
||||
func initWaiter(r io.Reader) chan error {
|
||||
ch := make(chan error, 1)
|
||||
go func() {
|
||||
defer close(ch)
|
||||
|
||||
inited := make([]byte, 1)
|
||||
n, err := r.Read(inited)
|
||||
if err == nil {
|
||||
if n < 1 {
|
||||
err = errors.New("short read")
|
||||
} else if inited[0] != 0 {
|
||||
err = fmt.Errorf("unexpected %d != 0", inited[0])
|
||||
} else {
|
||||
ch <- nil
|
||||
return
|
||||
}
|
||||
}
|
||||
ch <- fmt.Errorf("waiting for init preliminary setup: %w", err)
|
||||
}()
|
||||
|
||||
return ch
|
||||
}
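The handshake initWaiter implements is simply "read one byte and require it to be zero". A self-contained sketch of both ends of that protocol over an in-memory pipe; the writer goroutine stands in for the child that signals its early setup is done:

package main

import (
	"errors"
	"fmt"
	"io"
)

// waitForSetup mirrors the reader side: one zero byte means setup finished.
func waitForSetup(r io.Reader) error {
	buf := make([]byte, 1)
	n, err := r.Read(buf)
	if err != nil {
		return fmt.Errorf("waiting for setup: %w", err)
	}
	if n < 1 {
		return errors.New("short read")
	}
	if buf[0] != 0 {
		return fmt.Errorf("unexpected %d != 0", buf[0])
	}
	return nil
}

func main() {
	r, w := io.Pipe()

	// Writer side: write the single zero byte and close the pipe.
	go func() {
		w.Write([]byte{0})
		w.Close()
	}()

	if err := waitForSetup(r); err != nil {
		fmt.Println("setup failed:", err)
		return
	}
	fmt.Println("setup complete")
}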
|
||||
128 vendor/github.com/opencontainers/runc/libcontainer/restored_process.go (generated, vendored)
@@ -1,128 +0,0 @@
|
||||
package libcontainer
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"os/exec"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/system"
|
||||
)
|
||||
|
||||
func newRestoredProcess(cmd *exec.Cmd, fds []string) (*restoredProcess, error) {
|
||||
var err error
|
||||
pid := cmd.Process.Pid
|
||||
stat, err := system.Stat(pid)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &restoredProcess{
|
||||
cmd: cmd,
|
||||
processStartTime: stat.StartTime,
|
||||
fds: fds,
|
||||
}, nil
|
||||
}
|
||||
|
||||
type restoredProcess struct {
|
||||
cmd *exec.Cmd
|
||||
processStartTime uint64
|
||||
fds []string
|
||||
}
|
||||
|
||||
func (p *restoredProcess) start() error {
|
||||
return errors.New("restored process cannot be started")
|
||||
}
|
||||
|
||||
func (p *restoredProcess) pid() int {
|
||||
return p.cmd.Process.Pid
|
||||
}
|
||||
|
||||
func (p *restoredProcess) terminate() error {
|
||||
err := p.cmd.Process.Kill()
|
||||
if _, werr := p.wait(); err == nil {
|
||||
err = werr
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func (p *restoredProcess) wait() (*os.ProcessState, error) {
|
||||
// TODO: how do we wait on the actual process?
|
||||
// maybe use --exec-cmd in criu
|
||||
err := p.cmd.Wait()
|
||||
if err != nil {
|
||||
var exitErr *exec.ExitError
|
||||
if !errors.As(err, &exitErr) {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
st := p.cmd.ProcessState
|
||||
return st, nil
|
||||
}
|
||||
|
||||
func (p *restoredProcess) startTime() (uint64, error) {
|
||||
return p.processStartTime, nil
|
||||
}
|
||||
|
||||
func (p *restoredProcess) signal(s os.Signal) error {
|
||||
return p.cmd.Process.Signal(s)
|
||||
}
|
||||
|
||||
func (p *restoredProcess) externalDescriptors() []string {
|
||||
return p.fds
|
||||
}
|
||||
|
||||
func (p *restoredProcess) setExternalDescriptors(newFds []string) {
|
||||
p.fds = newFds
|
||||
}
|
||||
|
||||
func (p *restoredProcess) forwardChildLogs() chan error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// nonChildProcess represents a process where the calling process is not
|
||||
// the parent process. This process is created when a factory loads a container from
|
||||
// a persisted state.
|
||||
type nonChildProcess struct {
|
||||
processPid int
|
||||
processStartTime uint64
|
||||
fds []string
|
||||
}
|
||||
|
||||
func (p *nonChildProcess) start() error {
|
||||
return errors.New("restored process cannot be started")
|
||||
}
|
||||
|
||||
func (p *nonChildProcess) pid() int {
|
||||
return p.processPid
|
||||
}
|
||||
|
||||
func (p *nonChildProcess) terminate() error {
|
||||
return errors.New("restored process cannot be terminated")
|
||||
}
|
||||
|
||||
func (p *nonChildProcess) wait() (*os.ProcessState, error) {
|
||||
return nil, errors.New("restored process cannot be waited on")
|
||||
}
|
||||
|
||||
func (p *nonChildProcess) startTime() (uint64, error) {
|
||||
return p.processStartTime, nil
|
||||
}
|
||||
|
||||
func (p *nonChildProcess) signal(s os.Signal) error {
|
||||
proc, err := os.FindProcess(p.processPid)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return proc.Signal(s)
|
||||
}
|
||||
|
||||
func (p *nonChildProcess) externalDescriptors() []string {
|
||||
return p.fds
|
||||
}
|
||||
|
||||
func (p *nonChildProcess) setExternalDescriptors(newFds []string) {
|
||||
p.fds = newFds
|
||||
}
|
||||
|
||||
func (p *nonChildProcess) forwardChildLogs() chan error {
|
||||
return nil
|
||||
}
|
||||
1155 vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go (generated, vendored)
File diff suppressed because it is too large
113 vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go (generated, vendored)
@@ -1,113 +0,0 @@
|
||||
package seccomp
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
)
|
||||
|
||||
var operators = map[string]configs.Operator{
|
||||
"SCMP_CMP_NE": configs.NotEqualTo,
|
||||
"SCMP_CMP_LT": configs.LessThan,
|
||||
"SCMP_CMP_LE": configs.LessThanOrEqualTo,
|
||||
"SCMP_CMP_EQ": configs.EqualTo,
|
||||
"SCMP_CMP_GE": configs.GreaterThanOrEqualTo,
|
||||
"SCMP_CMP_GT": configs.GreaterThan,
|
||||
"SCMP_CMP_MASKED_EQ": configs.MaskEqualTo,
|
||||
}
|
||||
|
||||
// KnownOperators returns the list of the known operations.
|
||||
// Used by `runc features`.
|
||||
func KnownOperators() []string {
|
||||
var res []string
|
||||
for k := range operators {
|
||||
res = append(res, k)
|
||||
}
|
||||
sort.Strings(res)
|
||||
return res
|
||||
}
|
||||
|
||||
var actions = map[string]configs.Action{
|
||||
"SCMP_ACT_KILL": configs.Kill,
|
||||
"SCMP_ACT_ERRNO": configs.Errno,
|
||||
"SCMP_ACT_TRAP": configs.Trap,
|
||||
"SCMP_ACT_ALLOW": configs.Allow,
|
||||
"SCMP_ACT_TRACE": configs.Trace,
|
||||
"SCMP_ACT_LOG": configs.Log,
|
||||
"SCMP_ACT_NOTIFY": configs.Notify,
|
||||
"SCMP_ACT_KILL_THREAD": configs.KillThread,
|
||||
"SCMP_ACT_KILL_PROCESS": configs.KillProcess,
|
||||
}
|
||||
|
||||
// KnownActions returns the list of the known actions.
|
||||
// Used by `runc features`.
|
||||
func KnownActions() []string {
|
||||
var res []string
|
||||
for k := range actions {
|
||||
res = append(res, k)
|
||||
}
|
||||
sort.Strings(res)
|
||||
return res
|
||||
}
|
||||
|
||||
var archs = map[string]string{
|
||||
"SCMP_ARCH_X86": "x86",
|
||||
"SCMP_ARCH_X86_64": "amd64",
|
||||
"SCMP_ARCH_X32": "x32",
|
||||
"SCMP_ARCH_ARM": "arm",
|
||||
"SCMP_ARCH_AARCH64": "arm64",
|
||||
"SCMP_ARCH_MIPS": "mips",
|
||||
"SCMP_ARCH_MIPS64": "mips64",
|
||||
"SCMP_ARCH_MIPS64N32": "mips64n32",
|
||||
"SCMP_ARCH_MIPSEL": "mipsel",
|
||||
"SCMP_ARCH_MIPSEL64": "mipsel64",
|
||||
"SCMP_ARCH_MIPSEL64N32": "mipsel64n32",
|
||||
"SCMP_ARCH_PPC": "ppc",
|
||||
"SCMP_ARCH_PPC64": "ppc64",
|
||||
"SCMP_ARCH_PPC64LE": "ppc64le",
|
||||
"SCMP_ARCH_RISCV64": "riscv64",
|
||||
"SCMP_ARCH_S390": "s390",
|
||||
"SCMP_ARCH_S390X": "s390x",
|
||||
}
|
||||
|
||||
// KnownArchs returns the list of the known archs.
|
||||
// Used by `runc features`.
|
||||
func KnownArchs() []string {
|
||||
var res []string
|
||||
for k := range archs {
|
||||
res = append(res, k)
|
||||
}
|
||||
sort.Strings(res)
|
||||
return res
|
||||
}
|
||||
|
||||
// ConvertStringToOperator converts a string into a Seccomp comparison operator.
|
||||
// Comparison operators use the names they are assigned by Libseccomp's header.
|
||||
// Attempting to convert a string that is not a valid operator results in an
|
||||
// error.
|
||||
func ConvertStringToOperator(in string) (configs.Operator, error) {
|
||||
if op, ok := operators[in]; ok {
|
||||
return op, nil
|
||||
}
|
||||
return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in)
|
||||
}
|
||||
|
||||
// ConvertStringToAction converts a string into a Seccomp rule match action.
|
||||
// Actions use the names they are assigned in Libseccomp's header.
|
||||
// Attempting to convert a string that is not a valid action results in an
|
||||
// error.
|
||||
func ConvertStringToAction(in string) (configs.Action, error) {
|
||||
if act, ok := actions[in]; ok {
|
||||
return act, nil
|
||||
}
|
||||
return 0, fmt.Errorf("string %s is not a valid action for seccomp", in)
|
||||
}
|
||||
|
||||
// ConvertStringToArch converts a string into a Seccomp comparison arch.
|
||||
func ConvertStringToArch(in string) (string, error) {
|
||||
if arch, ok := archs[in]; ok {
|
||||
return arch, nil
|
||||
}
|
||||
return "", fmt.Errorf("string %s is not a valid arch for seccomp", in)
|
||||
}
|
||||
721 vendor/github.com/opencontainers/runc/libcontainer/seccomp/patchbpf/enosys_linux.go (generated, vendored)
@@ -1,721 +0,0 @@
|
||||
//go:build cgo && seccomp
|
||||
// +build cgo,seccomp
|
||||
|
||||
package patchbpf
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"runtime"
|
||||
"unsafe"
|
||||
|
||||
libseccomp "github.com/seccomp/libseccomp-golang"
|
||||
"github.com/sirupsen/logrus"
|
||||
"golang.org/x/net/bpf"
|
||||
"golang.org/x/sys/unix"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
"github.com/opencontainers/runc/libcontainer/utils"
|
||||
)
|
||||
|
||||
// #cgo pkg-config: libseccomp
|
||||
/*
|
||||
#include <errno.h>
|
||||
#include <stdint.h>
|
||||
#include <seccomp.h>
|
||||
#include <linux/seccomp.h>
|
||||
|
||||
const uint32_t C_ACT_ERRNO_ENOSYS = SCMP_ACT_ERRNO(ENOSYS);
|
||||
|
||||
// Copied from <linux/seccomp.h>.
|
||||
|
||||
#ifndef SECCOMP_SET_MODE_FILTER
|
||||
# define SECCOMP_SET_MODE_FILTER 1
|
||||
#endif
|
||||
const uintptr_t C_SET_MODE_FILTER = SECCOMP_SET_MODE_FILTER;
|
||||
|
||||
#ifndef SECCOMP_FILTER_FLAG_LOG
|
||||
# define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
|
||||
#endif
|
||||
const uintptr_t C_FILTER_FLAG_LOG = SECCOMP_FILTER_FLAG_LOG;
|
||||
|
||||
#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
|
||||
# define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
|
||||
#endif
|
||||
const uintptr_t C_FILTER_FLAG_NEW_LISTENER = SECCOMP_FILTER_FLAG_NEW_LISTENER;
|
||||
|
||||
#ifndef AUDIT_ARCH_RISCV64
|
||||
#ifndef EM_RISCV
|
||||
#define EM_RISCV 243
|
||||
#endif
|
||||
#define AUDIT_ARCH_RISCV64 (EM_RISCV|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
|
||||
#endif
|
||||
|
||||
// We use the AUDIT_ARCH_* values because those are the ones used by the kernel
|
||||
// and SCMP_ARCH_* sometimes has fake values (such as SCMP_ARCH_X32). But we
|
||||
// use <seccomp.h> so we get libseccomp's fallback definitions of AUDIT_ARCH_*.
|
||||
|
||||
const uint32_t C_AUDIT_ARCH_I386 = AUDIT_ARCH_I386;
|
||||
const uint32_t C_AUDIT_ARCH_X86_64 = AUDIT_ARCH_X86_64;
|
||||
const uint32_t C_AUDIT_ARCH_ARM = AUDIT_ARCH_ARM;
|
||||
const uint32_t C_AUDIT_ARCH_AARCH64 = AUDIT_ARCH_AARCH64;
|
||||
const uint32_t C_AUDIT_ARCH_MIPS = AUDIT_ARCH_MIPS;
|
||||
const uint32_t C_AUDIT_ARCH_MIPS64 = AUDIT_ARCH_MIPS64;
|
||||
const uint32_t C_AUDIT_ARCH_MIPS64N32 = AUDIT_ARCH_MIPS64N32;
|
||||
const uint32_t C_AUDIT_ARCH_MIPSEL = AUDIT_ARCH_MIPSEL;
|
||||
const uint32_t C_AUDIT_ARCH_MIPSEL64 = AUDIT_ARCH_MIPSEL64;
|
||||
const uint32_t C_AUDIT_ARCH_MIPSEL64N32 = AUDIT_ARCH_MIPSEL64N32;
|
||||
const uint32_t C_AUDIT_ARCH_PPC = AUDIT_ARCH_PPC;
|
||||
const uint32_t C_AUDIT_ARCH_PPC64 = AUDIT_ARCH_PPC64;
|
||||
const uint32_t C_AUDIT_ARCH_PPC64LE = AUDIT_ARCH_PPC64LE;
|
||||
const uint32_t C_AUDIT_ARCH_S390 = AUDIT_ARCH_S390;
|
||||
const uint32_t C_AUDIT_ARCH_S390X = AUDIT_ARCH_S390X;
|
||||
const uint32_t C_AUDIT_ARCH_RISCV64 = AUDIT_ARCH_RISCV64;
|
||||
*/
|
||||
import "C"
|
||||
|
||||
var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS)
|
||||
|
||||
// This syscall is used for multiplexing "large" syscalls on s390(x). Unknown
|
||||
// syscalls will end up with this syscall number, so we need to explicitly
|
||||
// return -ENOSYS for this syscall on those architectures.
|
||||
const s390xMultiplexSyscall libseccomp.ScmpSyscall = 0
|
||||
|
||||
func isAllowAction(action configs.Action) bool {
|
||||
switch action {
|
||||
// Trace is considered an "allow" action because a good tracer should
|
||||
// support future syscalls (by handling -ENOSYS on its own), and giving
|
||||
// -ENOSYS will be disruptive for emulation.
|
||||
case configs.Allow, configs.Log, configs.Trace:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func parseProgram(rdr io.Reader) ([]bpf.RawInstruction, error) {
|
||||
var program []bpf.RawInstruction
|
||||
loop:
|
||||
for {
|
||||
// Read the next instruction. We have to use NativeEndian because
|
||||
// seccomp_export_bpf outputs the program in *host* endian-ness.
|
||||
var insn unix.SockFilter
|
||||
if err := binary.Read(rdr, utils.NativeEndian, &insn); err != nil {
|
||||
if errors.Is(err, io.EOF) {
|
||||
// Parsing complete.
|
||||
break loop
|
||||
}
|
||||
if errors.Is(err, io.ErrUnexpectedEOF) {
|
||||
// Parsing stopped mid-instruction.
|
||||
return nil, fmt.Errorf("program parsing halted mid-instruction: %w", err)
|
||||
}
|
||||
// All other errors.
|
||||
return nil, fmt.Errorf("error parsing instructions: %w", err)
|
||||
}
|
||||
program = append(program, bpf.RawInstruction{
|
||||
Op: insn.Code,
|
||||
Jt: insn.Jt,
|
||||
Jf: insn.Jf,
|
||||
K: insn.K,
|
||||
})
|
||||
}
|
||||
return program, nil
|
||||
}
|
||||
|
||||
func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error) {
|
||||
rdr, wtr, err := os.Pipe()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error creating scratch pipe: %w", err)
|
||||
}
|
||||
defer wtr.Close()
|
||||
defer rdr.Close()
|
||||
|
||||
readerBuffer := new(bytes.Buffer)
|
||||
errChan := make(chan error, 1)
|
||||
go func() {
|
||||
_, err := io.Copy(readerBuffer, rdr)
|
||||
errChan <- err
|
||||
close(errChan)
|
||||
}()
|
||||
|
||||
if err := filter.ExportBPF(wtr); err != nil {
|
||||
return nil, fmt.Errorf("error exporting BPF: %w", err)
|
||||
}
|
||||
// Close so that the reader actually gets EOF.
|
||||
_ = wtr.Close()
|
||||
|
||||
if copyErr := <-errChan; copyErr != nil {
|
||||
return nil, fmt.Errorf("error reading from ExportBPF pipe: %w", copyErr)
|
||||
}
|
||||
|
||||
// Parse the instructions.
|
||||
rawProgram, err := parseProgram(readerBuffer)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parsing generated BPF filter: %w", err)
|
||||
}
|
||||
program, ok := bpf.Disassemble(rawProgram)
|
||||
if !ok {
|
||||
return nil, errors.New("could not disassemble entire BPF filter")
|
||||
}
|
||||
return program, nil
|
||||
}
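disassembleFilter above leans on golang.org/x/net/bpf to turn raw instructions back into a symbolic program. A tiny self-contained round trip with that package; the two-instruction filter is made up for the example and is not one runc generates:

package main

import (
	"fmt"
	"os"

	"golang.org/x/net/bpf"
)

func main() {
	// A trivial program: load the 32-bit word at offset 0, then return 0.
	prog := []bpf.Instruction{
		bpf.LoadAbsolute{Off: 0, Size: 4},
		bpf.RetConstant{Val: 0},
	}

	raw, err := bpf.Assemble(prog)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}

	back, ok := bpf.Disassemble(raw)
	fmt.Println("fully disassembled:", ok)
	for _, insn := range back {
		fmt.Printf("%v\n", insn)
	}
}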
|
||||
|
||||
type linuxAuditArch uint32
|
||||
|
||||
const invalidArch linuxAuditArch = 0
|
||||
|
||||
func scmpArchToAuditArch(arch libseccomp.ScmpArch) (linuxAuditArch, error) {
|
||||
switch arch {
|
||||
case libseccomp.ArchNative:
|
||||
// Convert to actual native architecture.
|
||||
arch, err := libseccomp.GetNativeArch()
|
||||
if err != nil {
|
||||
return invalidArch, fmt.Errorf("unable to get native arch: %w", err)
|
||||
}
|
||||
return scmpArchToAuditArch(arch)
|
||||
case libseccomp.ArchX86:
|
||||
return linuxAuditArch(C.C_AUDIT_ARCH_I386), nil
|
||||
case libseccomp.ArchAMD64, libseccomp.ArchX32:
|
||||
// NOTE: x32 is treated like x86_64 except all x32 syscalls have the
|
||||
// 30th bit of the syscall number set to indicate that it's not a
|
||||
// normal x86_64 syscall.
|
||||
return linuxAuditArch(C.C_AUDIT_ARCH_X86_64), nil
|
||||
case libseccomp.ArchARM:
|
||||
return linuxAuditArch(C.C_AUDIT_ARCH_ARM), nil
|
||||
case libseccomp.ArchARM64:
|
||||
return linuxAuditArch(C.C_AUDIT_ARCH_AARCH64), nil
|
||||
case libseccomp.ArchMIPS:
|
||||
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS), nil
|
||||
case libseccomp.ArchMIPS64:
|
||||
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64), nil
|
||||
case libseccomp.ArchMIPS64N32:
|
||||
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64N32), nil
|
||||
case libseccomp.ArchMIPSEL:
|
||||
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL), nil
|
||||
case libseccomp.ArchMIPSEL64:
|
||||
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64), nil
|
||||
case libseccomp.ArchMIPSEL64N32:
|
||||
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
|
||||
case libseccomp.ArchPPC:
|
||||
return linuxAuditArch(C.C_AUDIT_ARCH_PPC), nil
|
||||
case libseccomp.ArchPPC64:
|
||||
return linuxAuditArch(C.C_AUDIT_ARCH_PPC64), nil
|
||||
case libseccomp.ArchPPC64LE:
|
||||
return linuxAuditArch(C.C_AUDIT_ARCH_PPC64LE), nil
|
||||
case libseccomp.ArchS390:
|
||||
return linuxAuditArch(C.C_AUDIT_ARCH_S390), nil
|
||||
case libseccomp.ArchS390X:
|
||||
return linuxAuditArch(C.C_AUDIT_ARCH_S390X), nil
|
||||
case libseccomp.ArchRISCV64:
|
||||
return linuxAuditArch(C.C_AUDIT_ARCH_RISCV64), nil
|
||||
default:
|
||||
return invalidArch, fmt.Errorf("unknown architecture: %v", arch)
|
||||
}
|
||||
}
|
||||
|
||||
type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall

// Figure out largest syscall number referenced in the filter for each
// architecture. We will be generating code based on the native architecture
// representation, but SCMP_ARCH_X32 means we have to track cases where the
// same architecture has different largest syscalls based on the mode.
func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
	scmpArchs := make(map[libseccomp.ScmpArch]struct{})
	for _, ociArch := range config.Architectures {
		arch, err := libseccomp.GetArchFromString(ociArch)
		if err != nil {
			return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err)
		}
		scmpArchs[arch] = struct{}{}
	}
	// On architectures like ppc64le, Docker inexplicably doesn't include the
	// native architecture in the architecture list which results in no
	// architectures being present in the list at all (rendering the ENOSYS
	// stub a no-op). So, always include the native architecture.
	if nativeScmpArch, err := libseccomp.GetNativeArch(); err != nil {
		return nil, fmt.Errorf("unable to get native arch: %w", err)
	} else if _, ok := scmpArchs[nativeScmpArch]; !ok {
		logrus.Debugf("seccomp: adding implied native architecture %v to config set", nativeScmpArch)
		scmpArchs[nativeScmpArch] = struct{}{}
	}
	logrus.Debugf("seccomp: configured architecture set: %s", scmpArchs)

	// Only loop over architectures which are present in the filter. Any other
	// architectures will get the libseccomp bad architecture action anyway.
	lastSyscalls := make(lastSyscallMap)
	for arch := range scmpArchs {
		auditArch, err := scmpArchToAuditArch(arch)
		if err != nil {
			return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err)
		}

		if _, ok := lastSyscalls[auditArch]; !ok {
			lastSyscalls[auditArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
		}
		if _, ok := lastSyscalls[auditArch][arch]; ok {
			// Because of ArchNative we may hit the same entry multiple times.
			// Just skip it if we've seen this (linuxAuditArch, ScmpArch)
			// combination before.
			continue
		}

		// Find the largest syscall in the filter for this architecture.
		var largestSyscall libseccomp.ScmpSyscall
		for _, rule := range config.Syscalls {
			sysno, err := libseccomp.GetSyscallFromNameByArch(rule.Name, arch)
			if err != nil {
				// Ignore unknown syscalls.
				continue
			}
			if sysno > largestSyscall {
				largestSyscall = sysno
			}
		}
		if largestSyscall != 0 {
			logrus.Debugf("seccomp: largest syscall number for arch %v is %v", arch, largestSyscall)
			lastSyscalls[auditArch][arch] = largestSyscall
		} else {
			logrus.Warnf("could not find any syscalls for arch %v", arch)
			delete(lastSyscalls[auditArch], arch)
		}
	}
	return lastSyscalls, nil
}
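// Illustrative sketch, not part of the original file: on an x86_64 host whose
// profile lists both amd64 and x32, findLastSyscalls yields a map shaped like
// the literal below. The syscall numbers here are made-up placeholders.
var exampleLastSyscalls = lastSyscallMap{
	linuxAuditArch(C.C_AUDIT_ARCH_X86_64): {
		libseccomp.ArchAMD64: libseccomp.ScmpSyscall(435),
		libseccomp.ArchX32:   libseccomp.ScmpSyscall(547),
	},
}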
// FIXME FIXME FIXME
//
// This solution is less than ideal. In the future it would be great to have
// per-arch information about which syscalls were added in which kernel
// versions so we can create far more accurate filter rules (handling holes in
// the syscall table and determining -ENOSYS requirements based on kernel
// minimum version alone).
//
// This implementation can in principle cause issues with syscalls like
// close_range(2) which were added out-of-order in the syscall table between
// kernel releases.
func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) {
	// A jump-table for each linuxAuditArch used to generate the initial
	// conditional jumps -- measured from the *END* of the program so they
	// remain valid after prepending to the tail.
	archJumpTable := map[linuxAuditArch]uint32{}

	// Generate our own -ENOSYS rules for each architecture. They have to be
	// generated in reverse (prepended to the tail of the program) because the
	// JumpIf jumps need to be computed from the end of the program.
	programTail := []bpf.Instruction{
		// Fall-through rules jump into the filter.
		bpf.Jump{Skip: 1},
		// Rules which jump to here get -ENOSYS.
		bpf.RetConstant{Val: retErrnoEnosys},
	}

	// Generate the syscall -ENOSYS rules.
	for auditArch, maxSyscalls := range lastSyscalls {
		// The number of instructions from the tail of this section which need
		// to be jumped in order to reach the -ENOSYS return. If the section
		// does not jump, it will fall through to the actual filter.
		baseJumpEnosys := uint32(len(programTail) - 1)
		baseJumpFilter := baseJumpEnosys + 1

		// Add the load instruction for the syscall number -- we jump here
		// directly from the arch code so we need to do it here. Sadly we can't
		// share this code between architecture branches.
		section := []bpf.Instruction{
			// load [0] (syscall number)
			bpf.LoadAbsolute{Off: 0, Size: 4}, // NOTE: We assume sizeof(int) == 4.
		}

		switch len(maxSyscalls) {
		case 0:
			// No syscalls found for this arch -- skip it and move on.
			continue
		case 1:
			// Get the only syscall and scmpArch in the map.
			var (
				scmpArch libseccomp.ScmpArch
				sysno    libseccomp.ScmpSyscall
			)
			for arch, no := range maxSyscalls {
				sysno = no
				scmpArch = arch
			}

			switch scmpArch {
			// Return -ENOSYS for setup(2) on s390(x). This syscall is used for
			// multiplexing "large syscall number" syscalls, but if the syscall
			// number is not known to the kernel then the syscall number is
			// left unchanged (and because it is sysno=0, you'll end up with
			// EPERM for syscalls the kernel doesn't know about).
			//
			// The actual setup(2) syscall is never used by userspace anymore
			// (and hasn't existed for decades) outside of this multiplexing
			// scheme so returning -ENOSYS is fine.
			case libseccomp.ArchS390, libseccomp.ArchS390X:
				section = append(section, []bpf.Instruction{
					// jne [setup=0],1
					bpf.JumpIf{
						Cond:     bpf.JumpNotEqual,
						Val:      uint32(s390xMultiplexSyscall),
						SkipTrue: 1,
					},
					// ret [ENOSYS]
					bpf.RetConstant{Val: retErrnoEnosys},
				}...)
			}

			// The simplest case just boils down to a single jgt instruction,
			// with special handling if baseJumpEnosys is larger than 255 (and
			// thus a long jump is required).
			var sectionTail []bpf.Instruction
			if baseJumpEnosys+1 <= 255 {
				sectionTail = []bpf.Instruction{
					// jgt [syscall],[baseJumpEnosys+1]
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(sysno),
						SkipTrue: uint8(baseJumpEnosys + 1),
					},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}
			} else {
				sectionTail = []bpf.Instruction{
					// jle [syscall],1
					bpf.JumpIf{Cond: bpf.JumpLessOrEqual, Val: uint32(sysno), SkipTrue: 1},
					// ja [baseJumpEnosys+1]
					bpf.Jump{Skip: baseJumpEnosys + 1},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}
			}

			// If we're on x86 we need to add a check for x32 and if we're in
			// the wrong mode we jump over the section.
			if uint32(auditArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
				// Generate a prefix to check the mode.
				switch scmpArch {
				case libseccomp.ArchAMD64:
					sectionTail = append([]bpf.Instruction{
						// jset (1<<30),[len(tail)-1]
						bpf.JumpIf{
							Cond:     bpf.JumpBitsSet,
							Val:      1 << 30,
							SkipTrue: uint8(len(sectionTail) - 1),
						},
					}, sectionTail...)
				case libseccomp.ArchX32:
					sectionTail = append([]bpf.Instruction{
						// jset (1<<30),0,[len(tail)-1]
						bpf.JumpIf{
							Cond:     bpf.JumpBitsNotSet,
							Val:      1 << 30,
							SkipTrue: uint8(len(sectionTail) - 1),
						},
					}, sectionTail...)
				default:
					return nil, fmt.Errorf("unknown amd64 native architecture %#x", scmpArch)
				}
			}

			section = append(section, sectionTail...)
		case 2:
			// x32 and x86_64 are a unique case, we can't handle any others.
			if uint32(auditArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
				return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", auditArch)
			}

			x32sysno, ok := maxSyscalls[libseccomp.ArchX32]
			if !ok {
				return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchX32, maxSyscalls)
			}
			x86sysno, ok := maxSyscalls[libseccomp.ArchAMD64]
			if !ok {
				return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchAMD64, maxSyscalls)
			}

			// The x32 ABI indicates that a syscall is being made by an x32
			// process by setting the 30th bit of the syscall number, but we
			// need to do some special-casing depending on whether we need to
			// do long jumps.
			if baseJumpEnosys+2 <= 255 {
				// For the simple case we want to have something like:
				//   jset (1<<30),1
				//   jgt [x86 syscall],[baseJumpEnosys+2],1
				//   jgt [x32 syscall],[baseJumpEnosys+1]
				//   ja [baseJumpFilter]
				section = append(section, []bpf.Instruction{
					// jset (1<<30),1
					bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
					// jgt [x86 syscall],[baseJumpEnosys+2],1
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x86sysno),
						SkipTrue: uint8(baseJumpEnosys + 2), SkipFalse: 1,
					},
					// jgt [x32 syscall],[baseJumpEnosys+1]
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x32sysno),
						SkipTrue: uint8(baseJumpEnosys + 1),
					},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}...)
			} else {
				// But if the [baseJumpEnosys+2] jump is larger than 255 we
				// need to do a long jump like so:
				//   jset (1<<30),1
				//   jgt [x86 syscall],1,2
				//   jle [x32 syscall],1
				//   ja [baseJumpEnosys+1]
				//   ja [baseJumpFilter]
				section = append(section, []bpf.Instruction{
					// jset (1<<30),1
					bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
					// jgt [x86 syscall],1,2
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x86sysno),
						SkipTrue: 1, SkipFalse: 2,
					},
					// jle [x32 syscall],1
					bpf.JumpIf{
						Cond:     bpf.JumpLessOrEqual,
						Val:      uint32(x32sysno),
						SkipTrue: 1,
					},
					// ja [baseJumpEnosys+1]
					bpf.Jump{Skip: baseJumpEnosys + 1},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}...)
			}
		default:
			return nil, fmt.Errorf("invalid number of architecture overlaps: %v", len(maxSyscalls))
		}

		// Prepend this section to the tail.
		programTail = append(section, programTail...)

		// Update jump table.
		archJumpTable[auditArch] = uint32(len(programTail))
	}

	// Add a dummy "jump to filter" for any architecture we might miss below.
	// Such architectures will probably get the BadArch action of the filter
	// regardless.
	programTail = append([]bpf.Instruction{
		// ja [end of stub and start of filter]
		bpf.Jump{Skip: uint32(len(programTail))},
	}, programTail...)

	// Generate the jump rules for each architecture. This has to be done in
	// reverse as well for the same reason as above. We add to programTail
	// directly because the jumps are impacted by each architecture rule we add
	// as well.
	//
	// TODO: Maybe we want to optimise to avoid long jumps here? So sort the
	//       architectures based on how large the jumps are going to be, or
	//       re-sort the candidate architectures each time to make sure that we
	//       pick the largest jump which is going to be smaller than 255.
	for auditArch := range lastSyscalls {
		// We jump forwards but the jump table is calculated from the *END*.
		jump := uint32(len(programTail)) - archJumpTable[auditArch]

		// Same routine as above -- this is a basic jeq check, complicated
		// slightly if it turns out that we need to do a long jump.
		if jump <= 255 {
			programTail = append([]bpf.Instruction{
				// jeq [arch],[jump]
				bpf.JumpIf{
					Cond:     bpf.JumpEqual,
					Val:      uint32(auditArch),
					SkipTrue: uint8(jump),
				},
			}, programTail...)
		} else {
			programTail = append([]bpf.Instruction{
				// jne [arch],1
				bpf.JumpIf{
					Cond:     bpf.JumpNotEqual,
					Val:      uint32(auditArch),
					SkipTrue: 1,
				},
				// ja [jump]
				bpf.Jump{Skip: jump},
			}, programTail...)
		}
	}

	// Prepend the load instruction for the architecture.
	programTail = append([]bpf.Instruction{
		// load [4] (architecture)
		bpf.LoadAbsolute{Off: 4, Size: 4}, // NOTE: We assume sizeof(int) == 4.
	}, programTail...)

	// And that's all folks!
	return programTail, nil
}
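// Illustrative sketch, not part of the original file: a minimal single-arch
// -ENOSYS stub built with the same golang.org/x/net/bpf instructions used
// above, ignoring the x32 special case. The helper name is hypothetical and
// the libseccomp-generated filter is assumed to be appended after this stub.
func exampleMinimalStub(auditArch linuxAuditArch, lastSyscall uint32) []bpf.Instruction {
	return []bpf.Instruction{
		// load [4] (architecture)
		bpf.LoadAbsolute{Off: 4, Size: 4},
		// jne [arch],3 -- different architecture, skip the rest of the stub.
		bpf.JumpIf{Cond: bpf.JumpNotEqual, Val: uint32(auditArch), SkipTrue: 3},
		// load [0] (syscall number)
		bpf.LoadAbsolute{Off: 0, Size: 4},
		// jle [lastSyscall],1 -- syscall known to the profile, fall through.
		bpf.JumpIf{Cond: bpf.JumpLessOrEqual, Val: lastSyscall, SkipTrue: 1},
		// Anything newer than the newest syscall in the profile gets -ENOSYS.
		bpf.RetConstant{Val: retErrnoEnosys},
	}
}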
func assemble(program []bpf.Instruction) ([]unix.SockFilter, error) {
	rawProgram, err := bpf.Assemble(program)
	if err != nil {
		return nil, fmt.Errorf("error assembling program: %w", err)
	}

	// Convert to []unix.SockFilter for unix.SockFprog.
	var filter []unix.SockFilter
	for _, insn := range rawProgram {
		filter = append(filter, unix.SockFilter{
			Code: insn.Op,
			Jt:   insn.Jt,
			Jf:   insn.Jf,
			K:    insn.K,
		})
	}
	return filter, nil
}

func generatePatch(config *configs.Seccomp) ([]bpf.Instruction, error) {
	// Skip patching the generated cBPF when an explicit defaultErrnoRet is
	// set and it is already ENOSYS; otherwise the stub is generated below.
	if config.DefaultErrnoRet != nil && *config.DefaultErrnoRet == uint(retErrnoEnosys) {
		return nil, nil
	}
	// We only add the stub if the default action is not permissive.
	if isAllowAction(config.DefaultAction) {
		logrus.Debugf("seccomp: skipping -ENOSYS stub filter generation")
		return nil, nil
	}

	lastSyscalls, err := findLastSyscalls(config)
	if err != nil {
		return nil, fmt.Errorf("error finding last syscalls for -ENOSYS stub: %w", err)
	}
	stubProgram, err := generateEnosysStub(lastSyscalls)
	if err != nil {
		return nil, fmt.Errorf("error generating -ENOSYS stub: %w", err)
	}
	return stubProgram, nil
}

func enosysPatchFilter(config *configs.Seccomp, filter *libseccomp.ScmpFilter) ([]unix.SockFilter, error) {
	program, err := disassembleFilter(filter)
	if err != nil {
		return nil, fmt.Errorf("error disassembling original filter: %w", err)
	}

	patch, err := generatePatch(config)
	if err != nil {
		return nil, fmt.Errorf("error generating patch for filter: %w", err)
	}
	fullProgram := append(patch, program...)

	logrus.Debugf("seccomp: prepending -ENOSYS stub filter to user filter...")
	for idx, insn := range patch {
		logrus.Debugf(" [%4.1d] %s", idx, insn)
	}
	logrus.Debugf(" [....] --- original filter ---")

	fprog, err := assemble(fullProgram)
	if err != nil {
		return nil, fmt.Errorf("error assembling modified filter: %w", err)
	}
	return fprog, nil
}
func filterFlags(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (flags uint, noNewPrivs bool, err error) {
	// Ignore the error since pre-2.4 libseccomp is treated as API level 0.
	apiLevel, _ := libseccomp.GetAPI()

	noNewPrivs, err = filter.GetNoNewPrivsBit()
	if err != nil {
		return 0, false, fmt.Errorf("unable to fetch no_new_privs filter bit: %w", err)
	}

	if apiLevel >= 3 {
		if logBit, err := filter.GetLogBit(); err != nil {
			return 0, false, fmt.Errorf("unable to fetch SECCOMP_FILTER_FLAG_LOG bit: %w", err)
		} else if logBit {
			flags |= uint(C.C_FILTER_FLAG_LOG)
		}
	}

	// TODO: Support seccomp flags not yet added to libseccomp-golang...

	for _, call := range config.Syscalls {
		if call.Action == configs.Notify {
			flags |= uint(C.C_FILTER_FLAG_NEW_LISTENER)
			break
		}
	}

	return
}
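// Illustrative sketch, not part of the original file: how the flags computed
// above steer the load path in sysSeccompSetFilter below. With no flags the
// filter is loaded via prctl(2); with any flag the seccomp(2) syscall is used,
// and FILTER_FLAG_NEW_LISTENER additionally yields a notification fd.
func exampleWantsNotifyFd(flags uint) bool {
	return flags&uint(C.C_FILTER_FLAG_NEW_LISTENER) != 0
}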
func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (fd int, err error) {
	fprog := unix.SockFprog{
		Len:    uint16(len(filter)),
		Filter: &filter[0],
	}
	fd = -1 // only return a valid fd when C_FILTER_FLAG_NEW_LISTENER is set
	// If no seccomp flags were requested we can use the old-school prctl(2).
	if flags == 0 {
		err = unix.Prctl(unix.PR_SET_SECCOMP,
			unix.SECCOMP_MODE_FILTER,
			uintptr(unsafe.Pointer(&fprog)), 0, 0)
	} else {
		fdptr, _, errno := unix.RawSyscall(unix.SYS_SECCOMP,
			uintptr(C.C_SET_MODE_FILTER),
			uintptr(flags), uintptr(unsafe.Pointer(&fprog)))
		if errno != 0 {
			err = errno
		}
		if flags&uint(C.C_FILTER_FLAG_NEW_LISTENER) != 0 {
			fd = int(fdptr)
		}
	}
	runtime.KeepAlive(filter)
	runtime.KeepAlive(fprog)
	return
}
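// Illustrative sketch, not part of the original file: loading an assembled
// filter with no extra flags, which takes the prctl(2) path above. The caller
// is assumed to have set no_new_privs (or to hold CAP_SYS_ADMIN) beforehand.
func exampleLoadWithoutFlags(filter []unix.SockFilter) error {
	_, err := sysSeccompSetFilter(0, filter)
	return err
}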
// PatchAndLoad takes a seccomp configuration and a libseccomp filter which has
// been pre-configured with the set of rules in the seccomp config. It then
// patches said filter to handle -ENOSYS in a much nicer manner than the
// default libseccomp default action behaviour, and loads the patched filter
// into the kernel for the current process.
func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (int, error) {
	// Generate a patched filter.
	fprog, err := enosysPatchFilter(config, filter)
	if err != nil {
		return -1, fmt.Errorf("error patching filter: %w", err)
	}

	// Get the set of libseccomp flags set.
	seccompFlags, noNewPrivs, err := filterFlags(config, filter)
	if err != nil {
		return -1, fmt.Errorf("unable to fetch seccomp filter flags: %w", err)
	}

	// Set no_new_privs if it was requested, though in runc we handle
	// no_new_privs separately so warn if we hit this path.
	if noNewPrivs {
		logrus.Warnf("potentially misconfigured filter -- setting no_new_privs in seccomp path")
		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return -1, fmt.Errorf("error enabling no_new_privs bit: %w", err)
		}
	}

	// Finally, load the filter.
	fd, err := sysSeccompSetFilter(seccompFlags, fprog)
	if err != nil {
		return -1, fmt.Errorf("error loading seccomp filter: %w", err)
	}

	return fd, nil
}
@@ -1,4 +0,0 @@
//go:build !linux || !cgo || !seccomp
// +build !linux !cgo !seccomp

package patchbpf
268 vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go generated vendored
@@ -1,268 +0,0 @@
//go:build cgo && seccomp
// +build cgo,seccomp

package seccomp

import (
	"errors"
	"fmt"

	libseccomp "github.com/seccomp/libseccomp-golang"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"

	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/seccomp/patchbpf"
)

var (
	actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM))
	actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM))
)

const (
	// Linux system calls can have at most 6 arguments
	syscallMaxArguments int = 6
)

// InitSeccomp installs the seccomp filters to be used in the container as
// specified in config.
// Returns the seccomp file descriptor if any of the filters include a
// SCMP_ACT_NOTIFY action, otherwise returns -1.
func InitSeccomp(config *configs.Seccomp) (int, error) {
	if config == nil {
		return -1, errors.New("cannot initialize Seccomp - nil config passed")
	}

	defaultAction, err := getAction(config.DefaultAction, config.DefaultErrnoRet)
	if err != nil {
		return -1, errors.New("error initializing seccomp - invalid default action")
	}

	// Ignore the error since pre-2.4 libseccomp is treated as API level 0.
	apiLevel, _ := libseccomp.GetAPI()
	for _, call := range config.Syscalls {
		if call.Action == configs.Notify {
			if apiLevel < 6 {
				return -1, fmt.Errorf("seccomp notify unsupported: API level: got %d, want at least 6. Please try with libseccomp >= 2.5.0 and Linux >= 5.7", apiLevel)
			}

			// We can't allow the write syscall to notify to the seccomp agent.
			// After InitSeccomp() is called, syncParentSeccomp() needs to write the plain seccomp fd
			// number so the parent can send it to the seccomp agent. If we use SCMP_ACT_NOTIFY on write,
			// we can never write the seccomp fd to the parent, so the seccomp agent never receives
			// the seccomp fd and runc hangs during initialization.
			//
			// Note that read()/close(), that are also used in syncParentSeccomp(), _can_ use SCMP_ACT_NOTIFY.
			// Because we write the seccomp fd on the pipe to the parent, the parent is able to proceed and
			// send the seccomp fd to the agent (it is another process and not subject to the seccomp
			// filter). We will be blocked on read()/close() inside syncParentSeccomp() but if the seccomp
			// agent allows those syscalls to proceed, initialization works just fine and the agent can
			// handle future read()/close() syscalls as it wants.
			if call.Name == "write" {
				return -1, errors.New("SCMP_ACT_NOTIFY cannot be used for the write syscall")
			}
		}
	}

	// See comment on why write is not allowed. The same reason applies, as this can mean handling write too.
	if defaultAction == libseccomp.ActNotify {
		return -1, errors.New("SCMP_ACT_NOTIFY cannot be used as default action")
	}

	filter, err := libseccomp.NewFilter(defaultAction)
	if err != nil {
		return -1, fmt.Errorf("error creating filter: %w", err)
	}

	// Add extra architectures
	for _, arch := range config.Architectures {
		scmpArch, err := libseccomp.GetArchFromString(arch)
		if err != nil {
			return -1, fmt.Errorf("error validating Seccomp architecture: %w", err)
		}
		if err := filter.AddArch(scmpArch); err != nil {
			return -1, fmt.Errorf("error adding architecture to seccomp filter: %w", err)
		}
	}

	// Unset no new privs bit
	if err := filter.SetNoNewPrivsBit(false); err != nil {
		return -1, fmt.Errorf("error setting no new privileges: %w", err)
	}

	// Add a rule for each syscall
	for _, call := range config.Syscalls {
		if call == nil {
			return -1, errors.New("encountered nil syscall while initializing Seccomp")
		}

		if err := matchCall(filter, call, defaultAction); err != nil {
			return -1, err
		}
	}

	seccompFd, err := patchbpf.PatchAndLoad(config, filter)
	if err != nil {
		return -1, fmt.Errorf("error loading seccomp filter into kernel: %w", err)
	}

	return seccompFd, nil
}
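// Illustrative sketch, not part of the original file: building a minimal
// profile and loading it via InitSeccomp. Field names follow the configs
// package as used above; the architecture string, syscall, and errno chosen
// here are arbitrary examples, not a recommended profile.
func exampleInitSeccomp() (int, error) {
	eperm := uint(unix.EPERM)
	profile := &configs.Seccomp{
		DefaultAction: configs.Allow,
		Architectures: []string{"amd64"},
		Syscalls: []*configs.Syscall{
			{Name: "keyctl", Action: configs.Errno, ErrnoRet: &eperm},
		},
	}
	// Returns -1 unless a SCMP_ACT_NOTIFY rule was used.
	return InitSeccomp(profile)
}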
// Convert Libcontainer Action to Libseccomp ScmpAction
func getAction(act configs.Action, errnoRet *uint) (libseccomp.ScmpAction, error) {
	switch act {
	case configs.Kill, configs.KillThread:
		return libseccomp.ActKillThread, nil
	case configs.Errno:
		if errnoRet != nil {
			return libseccomp.ActErrno.SetReturnCode(int16(*errnoRet)), nil
		}
		return actErrno, nil
	case configs.Trap:
		return libseccomp.ActTrap, nil
	case configs.Allow:
		return libseccomp.ActAllow, nil
	case configs.Trace:
		if errnoRet != nil {
			return libseccomp.ActTrace.SetReturnCode(int16(*errnoRet)), nil
		}
		return actTrace, nil
	case configs.Log:
		return libseccomp.ActLog, nil
	case configs.Notify:
		return libseccomp.ActNotify, nil
	case configs.KillProcess:
		return libseccomp.ActKillProcess, nil
	default:
		return libseccomp.ActInvalid, errors.New("invalid action, cannot use in rule")
	}
}

// Convert Libcontainer Operator to Libseccomp ScmpCompareOp
func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) {
	switch op {
	case configs.EqualTo:
		return libseccomp.CompareEqual, nil
	case configs.NotEqualTo:
		return libseccomp.CompareNotEqual, nil
	case configs.GreaterThan:
		return libseccomp.CompareGreater, nil
	case configs.GreaterThanOrEqualTo:
		return libseccomp.CompareGreaterEqual, nil
	case configs.LessThan:
		return libseccomp.CompareLess, nil
	case configs.LessThanOrEqualTo:
		return libseccomp.CompareLessOrEqual, nil
	case configs.MaskEqualTo:
		return libseccomp.CompareMaskedEqual, nil
	default:
		return libseccomp.CompareInvalid, errors.New("invalid operator, cannot use in rule")
	}
}

// Convert Libcontainer Arg to Libseccomp ScmpCondition
func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) {
	cond := libseccomp.ScmpCondition{}

	if arg == nil {
		return cond, errors.New("cannot convert nil to syscall condition")
	}

	op, err := getOperator(arg.Op)
	if err != nil {
		return cond, err
	}

	return libseccomp.MakeCondition(arg.Index, op, arg.Value, arg.ValueTwo)
}

// Add a rule to match a single syscall
func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall, defAct libseccomp.ScmpAction) error {
	if call == nil || filter == nil {
		return errors.New("cannot use nil as syscall to block")
	}

	if len(call.Name) == 0 {
		return errors.New("empty string is not a valid syscall")
	}

	// Convert the call's action to the libseccomp equivalent
	callAct, err := getAction(call.Action, call.ErrnoRet)
	if err != nil {
		return fmt.Errorf("action in seccomp profile is invalid: %w", err)
	}
	if callAct == defAct {
		// This rule is redundant, silently skip it
		// to avoid error from AddRule.
		return nil
	}

	// If we can't resolve the syscall, assume it is not supported
	// by this kernel. Warn about it, don't error out.
	callNum, err := libseccomp.GetSyscallFromName(call.Name)
	if err != nil {
		logrus.Debugf("unknown seccomp syscall %q ignored", call.Name)
		return nil
	}

	// Unconditional match - just add the rule
	if len(call.Args) == 0 {
		if err := filter.AddRule(callNum, callAct); err != nil {
			return fmt.Errorf("error adding seccomp filter rule for syscall %s: %w", call.Name, err)
		}
	} else {
		// If two or more arguments have conditions on the same index, revert
		// to the old behavior and add each condition as a separate rule.
		argCounts := make([]uint, syscallMaxArguments)
		conditions := []libseccomp.ScmpCondition{}

		for _, cond := range call.Args {
			newCond, err := getCondition(cond)
			if err != nil {
				return fmt.Errorf("error creating seccomp syscall condition for syscall %s: %w", call.Name, err)
			}

			argCounts[cond.Index] += 1

			conditions = append(conditions, newCond)
		}

		hasMultipleArgs := false
		for _, count := range argCounts {
			if count > 1 {
				hasMultipleArgs = true
				break
			}
		}

		if hasMultipleArgs {
			// Revert to the old behavior:
			// add each condition attached to a separate rule.
			for _, cond := range conditions {
				condArr := []libseccomp.ScmpCondition{cond}

				if err := filter.AddRuleConditional(callNum, callAct, condArr); err != nil {
					return fmt.Errorf("error adding seccomp rule for syscall %s: %w", call.Name, err)
				}
			}
		} else {
			// No conditions share the same argument:
			// use the new, proper behavior.
			if err := filter.AddRuleConditional(callNum, callAct, conditions); err != nil {
				return fmt.Errorf("error adding seccomp rule for syscall %s: %w", call.Name, err)
			}
		}
	}

	return nil
}
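// Illustrative sketch, not part of the original file: a rule with two
// conditions on the same argument index. Because both conditions target
// arg 0, matchCall above falls back to adding one rule per condition
// (effectively ORing them) instead of a single rule that would AND them.
// The syscall and values are example data only.
var exampleConditionalSyscall = &configs.Syscall{
	Name:   "personality",
	Action: configs.Allow,
	Args: []*configs.Arg{
		{Index: 0, Op: configs.EqualTo, Value: 0x0},
		{Index: 0, Op: configs.EqualTo, Value: 0x0008},
	},
}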
// Version returns major, minor, and micro.
func Version() (uint, uint, uint) {
	return libseccomp.GetLibraryVersion()
}

// Enabled is true if seccomp support is compiled in.
const Enabled = true
28 vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go generated vendored
@@ -1,28 +0,0 @@
//go:build !linux || !cgo || !seccomp
// +build !linux !cgo !seccomp

package seccomp

import (
	"errors"

	"github.com/opencontainers/runc/libcontainer/configs"
)

var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")

// InitSeccomp does nothing because seccomp is not supported.
func InitSeccomp(config *configs.Seccomp) (int, error) {
	if config != nil {
		return -1, ErrSeccompNotEnabled
	}
	return -1, nil
}

// Version returns major, minor, and micro.
func Version() (uint, uint, uint) {
	return 0, 0, 0
}

// Enabled is true if seccomp support is compiled in.
const Enabled = false
149 vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go generated vendored
@@ -1,149 +0,0 @@
package libcontainer

import (
	"errors"
	"fmt"
	"os"
	"os/exec"
	"strconv"

	"github.com/opencontainers/selinux/go-selinux"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"

	"github.com/opencontainers/runc/libcontainer/apparmor"
	"github.com/opencontainers/runc/libcontainer/keys"
	"github.com/opencontainers/runc/libcontainer/seccomp"
	"github.com/opencontainers/runc/libcontainer/system"
	"github.com/opencontainers/runc/libcontainer/utils"
)

// linuxSetnsInit performs the container's initialization for running a new process
// inside an existing container.
type linuxSetnsInit struct {
	pipe          *os.File
	consoleSocket *os.File
	config        *initConfig
	logFd         int
}

func (l *linuxSetnsInit) getSessionRingName() string {
	return "_ses." + l.config.ContainerId
}

func (l *linuxSetnsInit) Init() error {
	if !l.config.Config.NoNewKeyring {
		if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
			return err
		}
		defer selinux.SetKeyLabel("") //nolint: errcheck
		// Do not inherit the parent's session keyring.
		if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil {
			// Same justification as in standard_init_linux.go as to why we
			// don't bail on ENOSYS.
			//
			// TODO(cyphar): And we should have logging here too.
			if !errors.Is(err, unix.ENOSYS) {
				return fmt.Errorf("unable to join session keyring: %w", err)
			}
		}
	}

	if l.config.CreateConsole {
		if err := setupConsole(l.consoleSocket, l.config, false); err != nil {
			return err
		}
		if err := system.Setctty(); err != nil {
			return err
		}
	}
	if l.config.NoNewPrivileges {
		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return err
		}
	}

	// Tell our parent that we're ready to exec. This must be done before the
	// Seccomp rules have been applied, because we need to be able to read and
	// write to a socket.
	if err := syncParentReady(l.pipe); err != nil {
		return fmt.Errorf("sync ready: %w", err)
	}

	if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
		return err
	}
	defer selinux.SetExecLabel("") //nolint: errcheck
	// Without NoNewPrivileges seccomp is a privileged operation, so we need to
	// do this before dropping capabilities; otherwise do it as late as possible
	// just before execve so as few syscalls take place after it as possible.
	if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
		seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
		if err != nil {
			return err
		}

		if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
			return err
		}
	}
	if err := finalizeNamespace(l.config); err != nil {
		return err
	}
	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
		return err
	}

	// Check for the arg before waiting to make sure it exists and it is
	// returned as a create time error.
	name, err := exec.LookPath(l.config.Args[0])
	if err != nil {
		return err
	}
	// exec.LookPath in Go < 1.20 might return no error for an executable
	// residing on a file system mounted with the noexec flag, so perform this
	// extra check now while we can still return a proper error.
	// TODO: remove this once go < 1.20 is not supported.
	if err := eaccess(name); err != nil {
		return &os.PathError{Op: "eaccess", Path: name, Err: err}
	}

	// Set seccomp as close to execve as possible, so as few syscalls take
	// place afterward (reducing the amount of syscalls that users need to
	// enable in their seccomp profiles).
	if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
		seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
		if err != nil {
			return fmt.Errorf("unable to init seccomp: %w", err)
		}

		if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
			return err
		}
	}
	logrus.Debugf("setns_init: about to exec")
	// Close the log pipe fd so the parent's ForwardLogs can exit.
	if err := unix.Close(l.logFd); err != nil {
		return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
	}

	// Close all file descriptors we are not passing to the container. This is
	// necessary because the execve target could use internal runc fds as the
	// execve path, potentially giving access to binary files from the host
	// (which can then be opened by container processes, leading to container
	// escapes). Note that because this operation will close any open file
	// descriptors that are referenced by (*os.File) handles from underneath
	// the Go runtime, we must not do any file operations after this point
	// (otherwise the (*os.File) finaliser could close the wrong file). See
	// CVE-2024-21626 for more information as to why this protection is
	// necessary.
	//
	// This is not needed for runc-dmz, because the extra execve(2) step means
	// that all O_CLOEXEC file descriptors have already been closed and thus
	// the second execve(2) from runc-dmz cannot access internal file
	// descriptors from runc.
	if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
		return err
	}
	return system.Exec(name, l.config.Args[0:], os.Environ())
}
282 vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go generated vendored
@@ -1,282 +0,0 @@
package libcontainer

import (
	"errors"
	"fmt"
	"os"
	"os/exec"
	"strconv"

	"github.com/opencontainers/runtime-spec/specs-go"
	"github.com/opencontainers/selinux/go-selinux"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"

	"github.com/opencontainers/runc/libcontainer/apparmor"
	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/keys"
	"github.com/opencontainers/runc/libcontainer/seccomp"
	"github.com/opencontainers/runc/libcontainer/system"
	"github.com/opencontainers/runc/libcontainer/utils"
)

type linuxStandardInit struct {
	pipe          *os.File
	consoleSocket *os.File
	parentPid     int
	fifoFd        int
	logFd         int
	mountFds      []int
	config        *initConfig
}

func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
	var newperms uint32

	if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
		// With user ns we need 'other' search permissions.
		newperms = 0x8
	} else {
		// Without user ns we need 'UID' search permissions.
		newperms = 0x80000
	}

	// Create a unique per session container name that we can join in setns;
	// however, other containers can also join it.
	return "_ses." + l.config.ContainerId, 0xffffffff, newperms
}

func (l *linuxStandardInit) Init() error {
	if !l.config.Config.NoNewKeyring {
		if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
			return err
		}
		defer selinux.SetKeyLabel("") //nolint: errcheck
		ringname, keepperms, newperms := l.getSessionRingParams()

		// Do not inherit the parent's session keyring.
		if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {
			// If keyrings aren't supported then it is likely we are on an
			// older kernel (or inside an LXC container). While we could bail,
			// the security feature we are using here is best-effort (it only
			// really provides marginal protection since VFS credentials are
			// the only significant protection of keyrings).
			//
			// TODO(cyphar): Log this so people know what's going on, once we
			//               have proper logging in 'runc init'.
			if !errors.Is(err, unix.ENOSYS) {
				return fmt.Errorf("unable to join session keyring: %w", err)
			}
		} else {
			// Make session keyring searchable. If we've gotten this far we
			// bail on any error -- we don't want to have a keyring with bad
			// permissions.
			if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
				return fmt.Errorf("unable to mod keyring permissions: %w", err)
			}
		}
	}

	if err := setupNetwork(l.config); err != nil {
		return err
	}
	if err := setupRoute(l.config.Config); err != nil {
		return err
	}

	// initialises the labeling system
	selinux.GetEnabled()

	// We don't need the mountFds after prepareRootfs() nor if it fails.
	err := prepareRootfs(l.pipe, l.config, l.mountFds)
	for _, m := range l.mountFds {
		if m == -1 {
			continue
		}

		if err := unix.Close(m); err != nil {
			return fmt.Errorf("Unable to close mountFds fds: %w", err)
		}
	}

	if err != nil {
		return err
	}

	// Set up the console. This has to be done *before* we finalize the rootfs,
	// but *after* we've given the user the chance to set up all of the mounts
	// they wanted.
	if l.config.CreateConsole {
		if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
			return err
		}
		if err := system.Setctty(); err != nil {
			return &os.SyscallError{Syscall: "ioctl(setctty)", Err: err}
		}
	}

	// Finish the rootfs setup.
	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
		if err := finalizeRootfs(l.config.Config); err != nil {
			return err
		}
	}

	if hostname := l.config.Config.Hostname; hostname != "" {
		if err := unix.Sethostname([]byte(hostname)); err != nil {
			return &os.SyscallError{Syscall: "sethostname", Err: err}
		}
	}
	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
		return fmt.Errorf("unable to apply apparmor profile: %w", err)
	}

	for key, value := range l.config.Config.Sysctl {
		if err := writeSystemProperty(key, value); err != nil {
			return err
		}
	}
	for _, path := range l.config.Config.ReadonlyPaths {
		if err := readonlyPath(path); err != nil {
			return fmt.Errorf("can't make %q read-only: %w", path, err)
		}
	}
	for _, path := range l.config.Config.MaskPaths {
		if err := maskPath(path, l.config.Config.MountLabel); err != nil {
			return fmt.Errorf("can't mask path %s: %w", path, err)
		}
	}
	pdeath, err := system.GetParentDeathSignal()
	if err != nil {
		return fmt.Errorf("can't get pdeath signal: %w", err)
	}
	if l.config.NoNewPrivileges {
		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return &os.SyscallError{Syscall: "prctl(SET_NO_NEW_PRIVS)", Err: err}
		}
	}

	// Tell our parent that we're ready to exec. This must be done before the
	// Seccomp rules have been applied, because we need to be able to read and
	// write to a socket.
	if err := syncParentReady(l.pipe); err != nil {
		return fmt.Errorf("sync ready: %w", err)
	}
	if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
		return fmt.Errorf("can't set process label: %w", err)
	}
	defer selinux.SetExecLabel("") //nolint: errcheck
	// Without NoNewPrivileges seccomp is a privileged operation, so we need to
	// do this before dropping capabilities; otherwise do it as late as possible
	// just before execve so as few syscalls take place after it as possible.
	if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
		seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
		if err != nil {
			return err
		}

		if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
			return err
		}
	}
	if err := finalizeNamespace(l.config); err != nil {
		return err
	}
	// finalizeNamespace can change user/group which clears the parent death
	// signal, so we restore it here.
	if err := pdeath.Restore(); err != nil {
		return fmt.Errorf("can't restore pdeath signal: %w", err)
	}
	// Compare the parent from the initial start of the init process and make
	// sure that it did not change. If the parent changes, that means it died
	// and we were reparented to something else, so we should just kill ourselves
	// and not cause problems for someone else.
	if unix.Getppid() != l.parentPid {
		return unix.Kill(unix.Getpid(), unix.SIGKILL)
	}
	// Check for the arg before waiting to make sure it exists and it is
	// returned as a create time error.
	name, err := exec.LookPath(l.config.Args[0])
	if err != nil {
		return err
	}
	// exec.LookPath in Go < 1.20 might return no error for an executable
	// residing on a file system mounted with the noexec flag, so perform this
	// extra check now while we can still return a proper error.
	// TODO: remove this once go < 1.20 is not supported.
	if err := eaccess(name); err != nil {
		return &os.PathError{Op: "eaccess", Path: name, Err: err}
	}

	// Set seccomp as close to execve as possible, so as few syscalls take
	// place afterward (reducing the amount of syscalls that users need to
	// enable in their seccomp profiles). However, this needs to be done
	// before closing the pipe since we need it to pass the seccompFd to
	// the parent.
	if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
		seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
		if err != nil {
			return fmt.Errorf("unable to init seccomp: %w", err)
		}

		if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
			return err
		}
	}
	// Close the pipe to signal that we have completed our init.
	logrus.Debugf("init: closing the pipe to signal completion")
	_ = l.pipe.Close()

	// Close the log pipe fd so the parent's ForwardLogs can exit.
	if err := unix.Close(l.logFd); err != nil {
		return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
	}

	// Wait for the FIFO to be opened on the other side before exec-ing the
	// user process. We open it through /proc/self/fd/$fd, because the fd that
	// was given to us was an O_PATH fd to the fifo itself. Linux allows us to
	// re-open an O_PATH fd through /proc.
	fifoPath := "/proc/self/fd/" + strconv.Itoa(l.fifoFd)
	fd, err := unix.Open(fifoPath, unix.O_WRONLY|unix.O_CLOEXEC, 0)
	if err != nil {
		return &os.PathError{Op: "open exec fifo", Path: fifoPath, Err: err}
	}
	if _, err := unix.Write(fd, []byte("0")); err != nil {
		return &os.PathError{Op: "write exec fifo", Path: fifoPath, Err: err}
	}

	// Close the O_PATH fifofd fd before exec because the kernel resets
	// dumpable in the wrong order. This has been fixed in newer kernels, but
	// we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
	// N.B. the core issue itself (passing dirfds to the host filesystem) has
	// since been resolved.
	// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
	_ = unix.Close(l.fifoFd)

	s := l.config.SpecState
	s.Pid = unix.Getpid()
	s.Status = specs.StateCreated
	if err := l.config.Config.Hooks[configs.StartContainer].RunHooks(s); err != nil {
		return err
	}

	// Close all file descriptors we are not passing to the container. This is
	// necessary because the execve target could use internal runc fds as the
	// execve path, potentially giving access to binary files from the host
	// (which can then be opened by container processes, leading to container
	// escapes). Note that because this operation will close any open file
	// descriptors that are referenced by (*os.File) handles from underneath
	// the Go runtime, we must not do any file operations after this point
	// (otherwise the (*os.File) finaliser could close the wrong file). See
	// CVE-2024-21626 for more information as to why this protection is
	// necessary.
	//
	// This is not needed for runc-dmz, because the extra execve(2) step means
	// that all O_CLOEXEC file descriptors have already been closed and thus
	// the second execve(2) from runc-dmz cannot access internal file
	// descriptors from runc.
	if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
		return err
	}
	return system.Exec(name, l.config.Args[0:], os.Environ())
}
243 vendor/github.com/opencontainers/runc/libcontainer/state_linux.go generated vendored
@@ -1,243 +0,0 @@
package libcontainer

import (
	"fmt"
	"os"
	"path/filepath"

	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runtime-spec/specs-go"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"
)

func newStateTransitionError(from, to containerState) error {
	return &stateTransitionError{
		From: from.status().String(),
		To:   to.status().String(),
	}
}

// stateTransitionError is returned when an invalid state transition happens from one
// state to another.
type stateTransitionError struct {
	From string
	To   string
}

func (s *stateTransitionError) Error() string {
	return fmt.Sprintf("invalid state transition from %s to %s", s.From, s.To)
}

type containerState interface {
	transition(containerState) error
	destroy() error
	status() Status
}

func destroy(c *linuxContainer) error {
	if !c.config.Namespaces.Contains(configs.NEWPID) ||
		c.config.Namespaces.PathOf(configs.NEWPID) != "" {
		if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil {
			logrus.Warn(err)
		}
	}
	err := c.cgroupManager.Destroy()
	if c.intelRdtManager != nil {
		if ierr := c.intelRdtManager.Destroy(); err == nil {
			err = ierr
		}
	}
	if rerr := os.RemoveAll(c.root); err == nil {
		err = rerr
	}
	c.initProcess = nil
	if herr := runPoststopHooks(c); err == nil {
		err = herr
	}
	c.state = &stoppedState{c: c}
	return err
}
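// Illustrative sketch, not part of the original file: destroy() above keeps
// only the first error it encounters while still attempting every cleanup
// step. The same pattern in isolation, with hypothetical cleanup funcs:
func exampleFirstErrorWins(steps ...func() error) error {
	var err error
	for _, step := range steps {
		if serr := step(); err == nil {
			err = serr
		}
	}
	return err
}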
func runPoststopHooks(c *linuxContainer) error {
	hooks := c.config.Hooks
	if hooks == nil {
		return nil
	}

	s, err := c.currentOCIState()
	if err != nil {
		return err
	}
	s.Status = specs.StateStopped

	if err := hooks[configs.Poststop].RunHooks(s); err != nil {
		return err
	}

	return nil
}

// stoppedState represents a container in a stopped/destroyed state.
type stoppedState struct {
	c *linuxContainer
}

func (b *stoppedState) status() Status {
	return Stopped
}

func (b *stoppedState) transition(s containerState) error {
	switch s.(type) {
	case *runningState, *restoredState:
		b.c.state = s
		return nil
	case *stoppedState:
		return nil
	}
	return newStateTransitionError(b, s)
}

func (b *stoppedState) destroy() error {
	return destroy(b.c)
}

// runningState represents a container that is currently running.
type runningState struct {
	c *linuxContainer
}

func (r *runningState) status() Status {
	return Running
}

func (r *runningState) transition(s containerState) error {
	switch s.(type) {
	case *stoppedState:
		if r.c.runType() == Running {
			return ErrRunning
		}
		r.c.state = s
		return nil
	case *pausedState:
		r.c.state = s
		return nil
	case *runningState:
		return nil
	}
	return newStateTransitionError(r, s)
}

func (r *runningState) destroy() error {
	if r.c.runType() == Running {
		return ErrRunning
	}
	return destroy(r.c)
}

type createdState struct {
	c *linuxContainer
}

func (i *createdState) status() Status {
	return Created
}

func (i *createdState) transition(s containerState) error {
	switch s.(type) {
	case *runningState, *pausedState, *stoppedState:
		i.c.state = s
		return nil
	case *createdState:
		return nil
	}
	return newStateTransitionError(i, s)
}

func (i *createdState) destroy() error {
	_ = i.c.initProcess.signal(unix.SIGKILL)
	return destroy(i.c)
}

// pausedState represents a container that is currently paused. It cannot be destroyed in a
// paused state and must transition back to running first.
type pausedState struct {
	c *linuxContainer
}

func (p *pausedState) status() Status {
	return Paused
}

func (p *pausedState) transition(s containerState) error {
	switch s.(type) {
	case *runningState, *stoppedState:
		p.c.state = s
		return nil
	case *pausedState:
		return nil
	}
	return newStateTransitionError(p, s)
}

func (p *pausedState) destroy() error {
	t := p.c.runType()
	if t != Running && t != Created {
		if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
			return err
		}
		return destroy(p.c)
	}
	return ErrPaused
}

// restoredState is the same as the running state but also has associated checkpoint
// information that may need to be destroyed when the container is stopped and destroy is called.
type restoredState struct {
	imageDir string
	c        *linuxContainer
}

func (r *restoredState) status() Status {
	return Running
}

func (r *restoredState) transition(s containerState) error {
	switch s.(type) {
	case *stoppedState, *runningState:
		return nil
	}
	return newStateTransitionError(r, s)
}

func (r *restoredState) destroy() error {
	if _, err := os.Stat(filepath.Join(r.c.root, "checkpoint")); err != nil {
		if !os.IsNotExist(err) {
			return err
		}
	}
	return destroy(r.c)
}

// loadedState is used whenever a container is restored, loaded, or when additional
// processes are being set up inside it; it should not be destroyed when it is exiting.
type loadedState struct {
	c *linuxContainer
	s Status
}

func (n *loadedState) status() Status {
	return n.s
}

func (n *loadedState) transition(s containerState) error {
	n.c.state = s
	return nil
}

func (n *loadedState) destroy() error {
	if err := n.c.refreshState(); err != nil {
		return err
	}
	return n.c.state.destroy()
}
13 vendor/github.com/opencontainers/runc/libcontainer/stats_linux.go generated vendored
@@ -1,13 +0,0 @@
package libcontainer

import (
	"github.com/opencontainers/runc/libcontainer/cgroups"
	"github.com/opencontainers/runc/libcontainer/intelrdt"
	"github.com/opencontainers/runc/types"
)

type Stats struct {
	Interfaces    []*types.NetworkInterface
	CgroupStats   *cgroups.Stats
	IntelRdtStats *intelrdt.Stats
}
126 vendor/github.com/opencontainers/runc/libcontainer/sync.go generated vendored
@@ -1,126 +0,0 @@
package libcontainer

import (
	"encoding/json"
	"errors"
	"fmt"
	"io"

	"github.com/opencontainers/runc/libcontainer/utils"
)

type syncType string

// Constants that are used for synchronisation between the parent and child
// during container setup. They come in pairs (with procError being a generic
// response which is followed by an &initError).
//
// [  child  ] <-> [   parent   ]
//
// procHooks   --> [run hooks]
//             <-- procResume
//
// procReady   --> [final setup]
//             <-- procRun
//
// procSeccomp --> [pick up seccomp fd with pidfd_getfd()]
//             <-- procSeccompDone
const (
	procError       syncType = "procError"
	procReady       syncType = "procReady"
	procRun         syncType = "procRun"
	procHooks       syncType = "procHooks"
	procResume      syncType = "procResume"
	procSeccomp     syncType = "procSeccomp"
	procSeccompDone syncType = "procSeccompDone"
)
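// Illustrative sketch, not part of the original file: one round of the
// ready/run handshake from the child's point of view, using the helpers
// defined below. The pipe argument is assumed to be the sync pipe shared
// with the parent; the function name is hypothetical.
func exampleChildReadyHandshake(pipe io.ReadWriter) error {
	// child: procReady --> parent, then wait for procRun <-- parent.
	if err := writeSync(pipe, procReady); err != nil {
		return err
	}
	return readSync(pipe, procRun)
}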
type syncT struct {
|
||||
Type syncType `json:"type"`
|
||||
Fd int `json:"fd"`
|
||||
}
|
||||
|
||||
// initError is used to wrap errors for passing them via JSON,
|
||||
// as encoding/json can't unmarshal into error type.
|
||||
type initError struct {
|
||||
Message string `json:"message,omitempty"`
|
||||
}
|
||||
|
||||
func (i initError) Error() string {
|
||||
return i.Message
|
||||
}
|
||||
|
||||
// writeSync is used to write to a synchronisation pipe. An error is returned
|
||||
// if there was a problem writing the payload.
|
||||
func writeSync(pipe io.Writer, sync syncType) error {
|
||||
return writeSyncWithFd(pipe, sync, -1)
|
||||
}
|
||||
|
||||
// writeSyncWithFd is used to write to a synchronisation pipe. An error is
|
||||
// returned if there was a problem writing the payload.
|
||||
func writeSyncWithFd(pipe io.Writer, sync syncType, fd int) error {
|
||||
if err := utils.WriteJSON(pipe, syncT{sync, fd}); err != nil {
|
||||
return fmt.Errorf("writing syncT %q: %w", string(sync), err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// readSync is used to read from a synchronisation pipe. An error is returned
|
||||
// if we got an initError, the pipe was closed, or we got an unexpected flag.
|
||||
func readSync(pipe io.Reader, expected syncType) error {
|
||||
var procSync syncT
|
||||
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
|
||||
if errors.Is(err, io.EOF) {
|
||||
return errors.New("parent closed synchronisation channel")
|
||||
}
|
||||
return fmt.Errorf("failed reading error from parent: %w", err)
|
||||
}
|
||||
|
||||
if procSync.Type == procError {
|
||||
var ierr initError
|
||||
|
||||
if err := json.NewDecoder(pipe).Decode(&ierr); err != nil {
|
||||
return fmt.Errorf("failed reading error from parent: %w", err)
|
||||
}
|
||||
|
||||
return &ierr
|
||||
}
|
||||
|
||||
if procSync.Type != expected {
|
||||
return errors.New("invalid synchronisation flag from parent")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// parseSync runs the given callback function on each syncT received from the
|
||||
// child. It will return once io.EOF is returned from the given pipe.
|
||||
func parseSync(pipe io.Reader, fn func(*syncT) error) error {
|
||||
dec := json.NewDecoder(pipe)
|
||||
for {
|
||||
var sync syncT
|
||||
if err := dec.Decode(&sync); err != nil {
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
// We handle this case outside fn for cleanliness reasons.
|
||||
var ierr *initError
|
||||
if sync.Type == procError {
|
||||
if err := dec.Decode(&ierr); err != nil && !errors.Is(err, io.EOF) {
|
||||
return fmt.Errorf("error decoding proc error from init: %w", err)
|
||||
}
|
||||
if ierr != nil {
|
||||
return ierr
|
||||
}
|
||||
// Programmer error.
|
||||
panic("No error following JSON procError payload.")
|
||||
}
|
||||
|
||||
if err := fn(&sync); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
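The proc* constants together with writeSync/readSync implement a lock-step handshake: each side writes one JSON-encoded syncT and then waits for the expected reply. Below is a minimal sketch of that request/acknowledge pattern over an in-memory io.Pipe; the message type and the msgReady/msgRun constants are local stand-ins, since libcontainer's syncT and proc* values are unexported.

```go
package main

import (
	"encoding/json"
	"fmt"
	"io"
)

// message is a local stand-in for libcontainer's unexported syncT.
type message struct {
	Type string `json:"type"`
	Fd   int    `json:"fd"`
}

// Stand-ins for the procReady/procRun pair from the handshake diagram above.
const (
	msgReady = "procReady"
	msgRun   = "procRun"
)

// send writes one JSON-encoded message, in the spirit of writeSyncWithFd.
func send(w io.Writer, typ string) error {
	return json.NewEncoder(w).Encode(message{Type: typ, Fd: -1})
}

// expect reads one message and checks its type, in the spirit of readSync.
func expect(dec *json.Decoder, want string) error {
	var m message
	if err := dec.Decode(&m); err != nil {
		return err
	}
	if m.Type != want {
		return fmt.Errorf("invalid synchronisation flag: got %q, want %q", m.Type, want)
	}
	return nil
}

func main() {
	childR, parentW := io.Pipe() // parent -> child
	parentR, childW := io.Pipe() // child -> parent

	// "Child": announce readiness, then wait for permission to run.
	go func() {
		defer childW.Close()
		dec := json.NewDecoder(childR)
		if err := send(childW, msgReady); err != nil {
			return
		}
		if err := expect(dec, msgRun); err != nil {
			fmt.Println("child:", err)
			return
		}
		fmt.Println("child: running")
	}()

	// "Parent": wait for procReady, then reply with procRun.
	dec := json.NewDecoder(parentR)
	if err := expect(dec, msgReady); err != nil {
		fmt.Println("parent:", err)
		return
	}
	if err := send(parentW, msgRun); err != nil {
		fmt.Println("parent:", err)
		return
	}

	// Drain until the child closes its end, like parseSync's io.EOF loop.
	var m message
	for dec.Decode(&m) == nil {
	}
}
```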
155
vendor/github.com/opencontainers/runc/types/events.go
generated
vendored
@@ -1,155 +0,0 @@
package types

import "github.com/opencontainers/runc/libcontainer/intelrdt"

// Event struct for encoding the event data to json.
type Event struct {
	Type string      `json:"type"`
	ID   string      `json:"id"`
	Data interface{} `json:"data,omitempty"`
}

// stats is the runc specific stats structure for stability when encoding and decoding stats.
type Stats struct {
	CPU               Cpu                 `json:"cpu"`
	CPUSet            CPUSet              `json:"cpuset"`
	Memory            Memory              `json:"memory"`
	Pids              Pids                `json:"pids"`
	Blkio             Blkio               `json:"blkio"`
	Hugetlb           map[string]Hugetlb  `json:"hugetlb"`
	IntelRdt          IntelRdt            `json:"intel_rdt"`
	NetworkInterfaces []*NetworkInterface `json:"network_interfaces"`
}

type Hugetlb struct {
	Usage   uint64 `json:"usage,omitempty"`
	Max     uint64 `json:"max,omitempty"`
	Failcnt uint64 `json:"failcnt"`
}

type BlkioEntry struct {
	Major uint64 `json:"major,omitempty"`
	Minor uint64 `json:"minor,omitempty"`
	Op    string `json:"op,omitempty"`
	Value uint64 `json:"value,omitempty"`
}

type Blkio struct {
	IoServiceBytesRecursive []BlkioEntry `json:"ioServiceBytesRecursive,omitempty"`
	IoServicedRecursive     []BlkioEntry `json:"ioServicedRecursive,omitempty"`
	IoQueuedRecursive       []BlkioEntry `json:"ioQueueRecursive,omitempty"`
	IoServiceTimeRecursive  []BlkioEntry `json:"ioServiceTimeRecursive,omitempty"`
	IoWaitTimeRecursive     []BlkioEntry `json:"ioWaitTimeRecursive,omitempty"`
	IoMergedRecursive       []BlkioEntry `json:"ioMergedRecursive,omitempty"`
	IoTimeRecursive         []BlkioEntry `json:"ioTimeRecursive,omitempty"`
	SectorsRecursive        []BlkioEntry `json:"sectorsRecursive,omitempty"`
}

type Pids struct {
	Current uint64 `json:"current,omitempty"`
	Limit   uint64 `json:"limit,omitempty"`
}

type Throttling struct {
	Periods          uint64 `json:"periods,omitempty"`
	ThrottledPeriods uint64 `json:"throttledPeriods,omitempty"`
	ThrottledTime    uint64 `json:"throttledTime,omitempty"`
}

type CpuUsage struct {
	// Units: nanoseconds.
	Total        uint64   `json:"total,omitempty"`
	Percpu       []uint64 `json:"percpu,omitempty"`
	PercpuKernel []uint64 `json:"percpu_kernel,omitempty"`
	PercpuUser   []uint64 `json:"percpu_user,omitempty"`
	Kernel       uint64   `json:"kernel"`
	User         uint64   `json:"user"`
}

type Cpu struct {
	Usage      CpuUsage   `json:"usage,omitempty"`
	Throttling Throttling `json:"throttling,omitempty"`
}

type CPUSet struct {
	CPUs                  []uint16 `json:"cpus,omitempty"`
	CPUExclusive          uint64   `json:"cpu_exclusive"`
	Mems                  []uint16 `json:"mems,omitempty"`
	MemHardwall           uint64   `json:"mem_hardwall"`
	MemExclusive          uint64   `json:"mem_exclusive"`
	MemoryMigrate         uint64   `json:"memory_migrate"`
	MemorySpreadPage      uint64   `json:"memory_spread_page"`
	MemorySpreadSlab      uint64   `json:"memory_spread_slab"`
	MemoryPressure        uint64   `json:"memory_pressure"`
	SchedLoadBalance      uint64   `json:"sched_load_balance"`
	SchedRelaxDomainLevel int64    `json:"sched_relax_domain_level"`
}

type MemoryEntry struct {
	Limit   uint64 `json:"limit"`
	Usage   uint64 `json:"usage,omitempty"`
	Max     uint64 `json:"max,omitempty"`
	Failcnt uint64 `json:"failcnt"`
}

type Memory struct {
	Cache     uint64            `json:"cache,omitempty"`
	Usage     MemoryEntry       `json:"usage,omitempty"`
	Swap      MemoryEntry       `json:"swap,omitempty"`
	Kernel    MemoryEntry       `json:"kernel,omitempty"`
	KernelTCP MemoryEntry       `json:"kernelTCP,omitempty"`
	Raw       map[string]uint64 `json:"raw,omitempty"`
}

type L3CacheInfo struct {
	CbmMask    string `json:"cbm_mask,omitempty"`
	MinCbmBits uint64 `json:"min_cbm_bits,omitempty"`
	NumClosids uint64 `json:"num_closids,omitempty"`
}

type MemBwInfo struct {
	BandwidthGran uint64 `json:"bandwidth_gran,omitempty"`
	DelayLinear   uint64 `json:"delay_linear,omitempty"`
	MinBandwidth  uint64 `json:"min_bandwidth,omitempty"`
	NumClosids    uint64 `json:"num_closids,omitempty"`
}

type IntelRdt struct {
	// The read-only L3 cache information
	L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"`

	// The read-only L3 cache schema in root
	L3CacheSchemaRoot string `json:"l3_cache_schema_root,omitempty"`

	// The L3 cache schema in 'container_id' group
	L3CacheSchema string `json:"l3_cache_schema,omitempty"`

	// The read-only memory bandwidth information
	MemBwInfo *MemBwInfo `json:"mem_bw_info,omitempty"`

	// The read-only memory bandwidth schema in root
	MemBwSchemaRoot string `json:"mem_bw_schema_root,omitempty"`

	// The memory bandwidth schema in 'container_id' group
	MemBwSchema string `json:"mem_bw_schema,omitempty"`

	// The memory bandwidth monitoring statistics from NUMA nodes in 'container_id' group
	MBMStats *[]intelrdt.MBMNumaNodeStats `json:"mbm_stats,omitempty"`

	// The cache monitoring technology statistics from NUMA nodes in 'container_id' group
	CMTStats *[]intelrdt.CMTNumaNodeStats `json:"cmt_stats,omitempty"`
}

type NetworkInterface struct {
	// Name is the name of the network interface.
	Name string

	RxBytes   uint64
	RxPackets uint64
	RxErrors  uint64
	RxDropped uint64
	TxBytes   uint64
	TxPackets uint64
	TxErrors  uint64
	TxDropped uint64
}
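Event is the JSON envelope with a typed payload in Data; for stats events that payload is the Stats struct above. Below is a small sketch of producing such an envelope, assuming the github.com/opencontainers/runc/types package is available on the module path; the container ID and counter values are made up for illustration.

```go
package main

import (
	"encoding/json"
	"os"

	"github.com/opencontainers/runc/types"
)

func main() {
	// A stats event for a hypothetical container ID; only a couple of
	// fields are filled in, the rest marshal to their zero values or are
	// dropped by their omitempty tags.
	ev := types.Event{
		Type: "stats",
		ID:   "mycontainer",
		Data: types.Stats{
			Pids:   types.Pids{Current: 12, Limit: 1024},
			Memory: types.Memory{Cache: 4096},
		},
	}

	// Emits: {"type":"stats","id":"mycontainer","data":{...}}
	enc := json.NewEncoder(os.Stdout)
	enc.SetIndent("", "  ")
	_ = enc.Encode(ev)
}
```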