From dbff0ef67b5b7df715b4e30256ef20e643407e44 Mon Sep 17 00:00:00 2001
From: Justin Santa Barbara
Date: Thu, 3 Mar 2016 09:24:21 -0500
Subject: [PATCH] Systemd/non-Redhat: Add docker healthcheck script

We do the equivalent of #21727 for systemd systems.

Issue #21731
---
 .../saltbase/salt/docker/docker-healthcheck   | 51 +++++++++++++++++++
 .../salt/docker/docker-healthcheck.service    |  9 ++++
 .../salt/docker/docker-healthcheck.timer      |  9 ++++
 cluster/saltbase/salt/docker/init.sls         | 39 ++++++++++++++++
 4 files changed, 108 insertions(+)
 create mode 100755 cluster/saltbase/salt/docker/docker-healthcheck
 create mode 100644 cluster/saltbase/salt/docker/docker-healthcheck.service
 create mode 100644 cluster/saltbase/salt/docker/docker-healthcheck.timer

diff --git a/cluster/saltbase/salt/docker/docker-healthcheck b/cluster/saltbase/salt/docker/docker-healthcheck
new file mode 100755
index 00000000000..9167567bb04
--- /dev/null
+++ b/cluster/saltbase/salt/docker/docker-healthcheck
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script is intended to be run periodically, to check the health
+# of docker. If it detects a failure, it will restart docker using systemctl.
+#
+# Exit status: 0 if docker responds (immediately, after the grace period, or
+# after the restart); 1 if it is still unresponsive after the restart.
+
+# Each probe runs under "timeout 10" so a hung docker cannot stall the check.
+if timeout 10 docker version > /dev/null; then
+  exit 0
+fi
+
+echo "docker failed"
+echo "Giving docker 30 seconds grace before restarting"
+sleep 30
+
+if timeout 10 docker version > /dev/null; then
+  echo "docker recovered"
+  exit 0
+fi
+
+echo "docker still down; triggering docker restart"
+systemctl restart docker
+
+echo "Waiting 60 seconds to give docker time to start"
+sleep 60
+
+if timeout 10 docker version > /dev/null; then
+  echo "docker recovered"
+  exit 0
+fi
+
+# Report the failure to the caller; without an explicit status the script
+# would end with the exit status of echo (0) and look like a healthy run.
+echo "docker still failing"
+exit 1
diff --git a/cluster/saltbase/salt/docker/docker-healthcheck.service b/cluster/saltbase/salt/docker/docker-healthcheck.service
new file mode 100644
index 00000000000..b4ecfaf8f72
--- /dev/null
+++ b/cluster/saltbase/salt/docker/docker-healthcheck.service
@@ -0,0 +1,9 @@
+[Unit]
+Description=Run docker-healthcheck once
+
+[Service]
+Type=oneshot
+ExecStart=/opt/kubernetes/helpers/docker-healthcheck
+
+[Install]
+WantedBy=multi-user.target
diff --git a/cluster/saltbase/salt/docker/docker-healthcheck.timer b/cluster/saltbase/salt/docker/docker-healthcheck.timer
new file mode 100644
index 00000000000..3d252e4464e
--- /dev/null
+++ b/cluster/saltbase/salt/docker/docker-healthcheck.timer
@@ -0,0 +1,9 @@
+[Unit]
+Description=Trigger docker-healthcheck periodically
+
+[Timer]
+OnUnitInactiveSec=10s
+Unit=docker-healthcheck.service
+
+[Install]
+WantedBy=multi-user.target
diff --git a/cluster/saltbase/salt/docker/init.sls b/cluster/saltbase/salt/docker/init.sls
index 2de683a42a0..9abc97e8693 100644
--- a/cluster/saltbase/salt/docker/init.sls
+++ b/cluster/saltbase/salt/docker/init.sls
@@ -338,6 +338,45 @@ fix-service-docker:
       - cmd: docker-upgrade
 {% endif %}
 
+/opt/kubernetes/helpers/docker-healthcheck:
+  file.managed:
+    - source: salt://docker/docker-healthcheck
+    - user: root
+    - group: root
+    - mode: 755
+
+{{ pillar.get('systemd_system_path') }}/docker-healthcheck.service:
+  file.managed:
+    - source: salt://docker/docker-healthcheck.service
+    - template: jinja
+    - user: root
+    - group: root
+    - mode: 644
+
+{{ pillar.get('systemd_system_path') }}/docker-healthcheck.timer:
+  file.managed:
+    - source: salt://docker/docker-healthcheck.timer
+    - template: jinja
+    - user: root
+    - group: root
+    - mode: 644
+
+# Tell systemd to load the timer
+fix-systemd-docker-healthcheck-timer:
+  cmd.wait:
+    - name: /opt/kubernetes/helpers/services bounce docker-healthcheck.timer
+    - watch:
+      - file: {{ pillar.get('systemd_system_path') }}/docker-healthcheck.timer
+
+# Trigger a first run of docker-healthcheck; needed because the timer fires 10s after the previous run.
+fix-systemd-docker-healthcheck-service:
+  cmd.wait:
+    - name: /opt/kubernetes/helpers/services bounce docker-healthcheck.service
+    - watch:
+      - file: {{ pillar.get('systemd_system_path') }}/docker-healthcheck.service
+    - require:
+      - cmd: fix-service-docker
+
 {% endif %}
 
 docker: