From fa95ebcaae3997ce50454b23a88d963902217c46 Mon Sep 17 00:00:00 2001
From: Samuel Angebault
Date: Thu, 11 May 2023 02:46:36 -0700
Subject: [PATCH] Add optional zram compression for docker_inram

Some devices running SONiC have a small storage device (2G and 4G mainly).
The SONiC image growth over time has made it impossible to install 2 images
on a single device.
Some mitigations have been implemented in the past for some devices but
there is a need to do more.

One such mitigation is `docker_inram` which creates a `tmpfs` and extracts
`dockerfs.tar.gz` in it.
This all happens in the SONiC initramfs, and relies on the installation
process not extracting `dockerfs.tar.gz` on the flash but keeping the file
as is.

This mitigation makes a tradeoff by using more RAM to reduce the disk
footprint.
It however creates new issues for devices with 4G of system memory since
the extracted `dockerfs.tar.gz` nears 1.6G.
Considering debian upgrades (with dual base images) and the continuous
stream of features this is only going to get bigger.

This change introduces an alternative to the `tmpfs` by allowing a system
to extract the `dockerfs.tar.gz` inside a `zram` device, thus bringing
compression into play to the detriment of performance.

Introduce 2 new optional kernel parameters to be consumed by SONiC
initramfs.
 - `docker_inram_size` which represents the max physical size of the
   `zram` or `tmpfs` volume (defaults to DOCKER_RAMFS_SIZE)
 - `docker_inram_algo` which is the method to use to extract the
   `dockerfs.tar.gz` (defaults to `tmpfs`); other values are considered
   to be compression algorithms for `zram` (e.g. `zstd`, `lzo-rle`, `lz4`)

Refactored the logic to mount the docker fs in the SONiC initramfs under
the `union-mount` script.
Moved the code into a function to make it cleaner and separated the inram
volume creation and docker extraction.

On Arista platform with a flash smaller than or equal to 4GB, set
`docker_inram_algo` to `zstd` which produces the best compression ratio
to the detriment of a slower write performance and a similar read
performance to other `zram` compression algorithms.

If the `zram` device cannot be created, sized, formatted or mounted, the
half-configured device is removed and the initramfs falls back to the
`tmpfs` method instead of extracting `dockerfs.tar.gz` without an in-RAM
volume mounted on /var/lib/docker.
---
Note: the union-mount.j2 hunks were reworked by hand; hunk headers and the
diffstat were recomputed, the post-image hash on the `index` line was not.

 files/Aboot/boot0.j2                 |   1 +
 files/initramfs-tools/modules        |   1 +
 files/initramfs-tools/union-mount.j2 | 116 ++++++++++++++++++++++-----
 3 files changed, 98 insertions(+), 20 deletions(-)

diff --git a/files/Aboot/boot0.j2 b/files/Aboot/boot0.j2
index b2ac34f9d..8548d50d5 100644
--- a/files/Aboot/boot0.j2
+++ b/files/Aboot/boot0.j2
@@ -656,6 +656,7 @@ write_platform_specific_cmdline() {
         varlog_size=256
         cmdline_add logs_inram=on
         cmdline_add docker_inram=on
+        cmdline_add docker_inram_algo=zstd
         if [ $flash_size -le 2000 ]; then
             # enable docker_inram for switches with less than 2G of flash
             varlog_size=128
diff --git a/files/initramfs-tools/modules b/files/initramfs-tools/modules
index 349bf3761..546660dae 100644
--- a/files/initramfs-tools/modules
+++ b/files/initramfs-tools/modules
@@ -5,3 +5,4 @@ nls_ascii
 nls_cp437
 nls_utf8
 nvme
+zstd
diff --git a/files/initramfs-tools/union-mount.j2 b/files/initramfs-tools/union-mount.j2
index 291806f95..fccd21f41 100644
--- a/files/initramfs-tools/union-mount.j2
+++ b/files/initramfs-tools/union-mount.j2
@@ -12,6 +12,8 @@ case $1 in
 esac
 
 docker_inram=false
+docker_inram_algo=tmpfs
+docker_inram_size={{ DOCKER_RAMFS_SIZE }}
 logs_inram=false
 secureboot=false
 bootloader=generic
@@ -27,6 +29,12 @@ for x in $(cat /proc/cmdline); do
         docker_inram=on)
             docker_inram=true
             ;;
+        docker_inram_algo=*)
+            docker_inram_algo="${x#docker_inram_algo=}"
+            ;;
+        docker_inram_size=*)
+            docker_inram_size="${x#docker_inram_size=}"
+            ;;
         logs_inram=on)
             logs_inram=true
             ;;
@@ -95,6 +103,90 @@ remove_not_in_allowlist_files()
     rm -f $allowlist_pattern_file
 }
 
+mount_docker_tmpfs()
+{
+    echo "Creating tmpfs to extract {{ FILESYSTEM_DOCKERFS }}"
+    mount -t tmpfs -o "rw,nodev,size=$docker_inram_size" tmpfs "${rootmnt}/var/lib/docker"
+}
+
+mount_docker_zram()
+{
+    # Every step is checked explicitly: when called from a condition,
+    # `set -e` (if enabled) is ignored inside the function.
+    local zid zname
+
+    echo "Creating zram to extract {{ FILESYSTEM_DOCKERFS }}"
+    modprobe zram num_devices=0 || return 1
+    # create new zram device; `local` is kept separate from the assignment
+    # so that a failed read of hot_add is not masked
+    zid="$(cat /sys/class/zram-control/hot_add)" || return 1
+    zname="zram$zid"
+    if [ -z "$zid" ] || [ ! -d "/sys/block/$zname" ]; then
+        return 1
+    fi
+    # attempt to use desired algorithm
+    if ! echo "$docker_inram_algo" > "/sys/block/$zname/comp_algorithm" 2>/dev/null; then
+        echo "zram algorithm $docker_inram_algo is not supported"
+        echo "using default instead: $(cat "/sys/block/$zname/comp_algorithm")"
+    fi
+    # size the device, create a filesystem on it and mount it
+    if echo "$docker_inram_size" > "/sys/block/$zname/disksize" &&
+       mkfs.ext4 -m 0 -L dockerfs -O '^has_journal' -q "/dev/$zname" &&
+       mount -o rw,nodev "/dev/$zname" "${rootmnt}/var/lib/docker"; then
+        return 0
+    fi
+    # release the unusable device so that it does not linger
+    echo "$zid" > /sys/class/zram-control/hot_remove 2>/dev/null || true
+    return 1
+}
+
+mount_docker_inram()
+{
+    if [ "$docker_inram_algo" != "tmpfs" ]; then
+        if mount_docker_zram; then
+            return 0
+        fi
+        # Without a mounted volume the extraction would land on the rootfs
+        # overlay, so fall back to the historical tmpfs behavior
+        echo "zram setup failed, falling back to tmpfs" 1>&2
+    fi
+    mount_docker_tmpfs
+}
+
+extract_dockerfs()
+{
+    echo "Extracting {{ FILESYSTEM_DOCKERFS }}"
+    if [ -f "${rootmnt}/host/$image_dir/{{ FILESYSTEM_DOCKERFS }}" ] && [ "$secureboot" = false ]; then
+        # Extract dockerfs.tar.gz into /var/lib/docker unless the system booted with secureboot
+        # In secureboot dockerfs.tar.gz cannot be trusted as it does not have a signature
+        tar xz --numeric-owner -f "${rootmnt}/host/$image_dir/{{ FILESYSTEM_DOCKERFS }}" -C "${rootmnt}/var/lib/docker"
+    elif [ "$bootloader" = "aboot" ] && unzip -l "$swi_path" | grep -qF "{{ FILESYSTEM_DOCKERFS }}"; then
+        # Aboot swi images also support extracting dockerfs.tar.gz directly from them
+        unzip -qp "$swi_path" {{ FILESYSTEM_DOCKERFS }} | tar xz --numeric-owner -C "${rootmnt}/var/lib/docker"
+    else
+        # Warn but allow the system to boot to at least have ssh access
+        echo "No {{ FILESYSTEM_DOCKERFS }} to extract, SONiC will be broken"
+    fi
+}
+
+mount_docker()
+{
+    if [ "$in_kdump" = true ]; then
+        # There is no point in mounting the docker filesystem in kdump environment
+        # Especially when there is some space mitigation in place
+        return
+    fi
+
+    if [ "$docker_inram" = true ]; then
+        # Create an in memory filesystem (tmpfs, zram) and extract dockerfs.tar.gz
+        mount_docker_inram
+        extract_dockerfs
+    else
+        # Mount the working directory of docker engine in the raw partition, bypass the overlay
+        mount --bind "${rootmnt}/host/$image_dir/{{ DOCKERFS_DIR }}" "${rootmnt}/var/lib/docker"
+    fi
+}
+
 ## Mount the overlay file system: rw layer over squashfs
 image_dir=$(cat /proc/cmdline | sed -e 's/.*loop=\(\S*\)\/.*/\1/')
 rw_dir=${rootmnt}/host/$image_dir/rw
@@ -137,30 +229,14 @@ case "${ROOT}" in
         ;;
 esac
 
+## Mount the docker storage path
 mkdir -p ${rootmnt}/var/lib/docker
-if [ "$in_kdump" = false ]; then
-    if [ "$secureboot" = true ]; then
-        mount -t tmpfs -o rw,nodev,size={{ DOCKER_RAMFS_SIZE }} tmpfs ${rootmnt}/var/lib/docker
-        if [ "$bootloader" = "aboot" ]; then
-            unzip -qp "$swi_path" dockerfs.tar.gz | tar xz --numeric-owner -C ${rootmnt}/var/lib/docker
-            ## Boot folder is not extracted during secureboot since content would inherently become unsafe
-            mkdir -p ${rootmnt}/host/$image_dir/boot
-        else
-            echo "secureboot unsupported for bootloader $bootloader" 1>&2
-            exit 1
-        fi
-    elif [ -f ${rootmnt}/host/$image_dir/{{ FILESYSTEM_DOCKERFS }} ]; then
-        ## mount tmpfs and extract docker into it
-        mount -t tmpfs -o rw,nodev,size={{ DOCKER_RAMFS_SIZE }} tmpfs ${rootmnt}/var/lib/docker
-        tar xz --numeric-owner -f ${rootmnt}/host/$image_dir/{{ FILESYSTEM_DOCKERFS }} -C ${rootmnt}/var/lib/docker
-    else
-        ## Mount the working directory of docker engine in the raw partition, bypass the overlay
-        mount --bind ${rootmnt}/host/$image_dir/{{ DOCKERFS_DIR }} ${rootmnt}/var/lib/docker
-    fi
-fi
+mount_docker
 
 ## Mount the boot directory in the raw partition, bypass the overlay
 mkdir -p ${rootmnt}/boot
+# make sure that the boot folder exists before attempting a mount
+mkdir -p ${rootmnt}/host/$image_dir/boot
 mount --bind ${rootmnt}/host/$image_dir/boot ${rootmnt}/boot
 
 ## Mount loop device or tmpfs for /var/log