diff --git a/cluster/saltbase/salt/docker/docker-healthcheck b/cluster/saltbase/salt/docker/docker-healthcheck
new file mode 100755
index 00000000000..9167567bb04
--- /dev/null
+++ b/cluster/saltbase/salt/docker/docker-healthcheck
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Run periodically to check the health of docker (`docker version`, 10s timeout).
+# If docker still fails after a 30s grace period, it is restarted using systemctl.
+
+if timeout 10 docker version > /dev/null; then
+  exit 0
+fi
+
+echo "docker failed"
+echo "Giving docker 30 seconds grace before restarting"
+sleep 30
+
+if timeout 10 docker version > /dev/null; then
+  echo "docker recovered"
+  exit 0
+fi
+
+echo "docker still down; triggering docker restart"
+systemctl restart docker
+
+echo "Waiting 60 seconds to give docker time to start"
+sleep 60
+
+if timeout 10 docker version > /dev/null; then
+  echo "docker recovered"
+  exit 0
+fi
+
+echo "docker still failing"
diff --git a/cluster/saltbase/salt/docker/docker-healthcheck.service b/cluster/saltbase/salt/docker/docker-healthcheck.service
new file mode 100644
index 00000000000..b4ecfaf8f72
--- /dev/null
+++ b/cluster/saltbase/salt/docker/docker-healthcheck.service
@@ -0,0 +1,9 @@
+[Unit]
+Description=Run docker-healthcheck once
+
+[Service]
+Type=oneshot
+ExecStart=/opt/kubernetes/helpers/docker-healthcheck
+
+[Install]
+WantedBy=multi-user.target
diff --git a/cluster/saltbase/salt/docker/docker-healthcheck.timer b/cluster/saltbase/salt/docker/docker-healthcheck.timer
new file mode 100644
index 00000000000..3d252e4464e
--- /dev/null
+++ b/cluster/saltbase/salt/docker/docker-healthcheck.timer
@@ -0,0 +1,9 @@
+[Unit]
+Description=Trigger docker-healthcheck periodically
+
+[Timer]
+OnUnitInactiveSec=10s
+Unit=docker-healthcheck.service
+
+[Install]
+WantedBy=multi-user.target
diff --git a/cluster/saltbase/salt/docker/init.sls b/cluster/saltbase/salt/docker/init.sls
index 2de683a42a0..9abc97e8693 100644
--- a/cluster/saltbase/salt/docker/init.sls
+++ b/cluster/saltbase/salt/docker/init.sls
@@ -338,6 +338,45 @@ fix-service-docker:
       - cmd: docker-upgrade
 {% endif %}
 
+/opt/kubernetes/helpers/docker-healthcheck:
+  file.managed:
+    - source: salt://docker/docker-healthcheck
+    - user: root
+    - group: root
+    - mode: 755
+
+{{ pillar.get('systemd_system_path') }}/docker-healthcheck.service:
+  file.managed:
+    - source: salt://docker/docker-healthcheck.service
+    - template: jinja
+    - user: root
+    - group: root
+    - mode: 644
+
+{{ pillar.get('systemd_system_path') }}/docker-healthcheck.timer:
+  file.managed:
+    - source: salt://docker/docker-healthcheck.timer
+    - template: jinja
+    - user: root
+    - group: root
+    - mode: 644
+
+# Bounce the timer via the services helper whenever its unit file changes, so systemd loads it
+fix-systemd-docker-healthcheck-timer:
+  cmd.wait:
+    - name: /opt/kubernetes/helpers/services bounce docker-healthcheck.timer
+    - watch:
+      - file: {{ pillar.get('systemd_system_path') }}/docker-healthcheck.timer
+
+# Trigger a first run of docker-healthcheck: the timer uses OnUnitInactiveSec=10s, so it only fires 10s after a previous run ends.
+fix-systemd-docker-healthcheck-service:
+  cmd.wait:
+    - name: /opt/kubernetes/helpers/services bounce docker-healthcheck.service
+    - watch:
+      - file: {{ pillar.get('systemd_system_path') }}/docker-healthcheck.service
+    - require:
+      - cmd: fix-service-docker
+
 {% endif %}
 
 docker: