From 414b10fab073ba3a8a800b1635a5f812ed81d110 Mon Sep 17 00:00:00 2001 From: Sangeet Gupta Date: Fri, 7 Feb 2020 17:22:06 +0000 Subject: [PATCH] Fix health-probe concurrency and timings Changed Nova and Neutron health-probe scripts to exit if the previous probe process is still running. The health-probe has an RPC call timeout of 60 seconds and has 2 retries. In the worst-case scenario the probe process can run a little over 180 seconds. Changing the periodSeconds so that a probe starts after the previous one is complete. Also changing the timeoutSeconds value a little to give a little more extra time for the probe to finish. Increasing the liveness probe periods, as they are not so critical, which will reduce the resource usage for the probes. Co-authored-by: Randeep Jalli Change-Id: Ife1c381d663c1e271a5099bdc6d0dfefb00d8d73 --- neutron/templates/bin/_health-probe.py.tpl | 30 +++++++++++++++++++ neutron/templates/daemonset-l2gw-agent.yaml | 8 +++--- neutron/values.yaml | 32 ++++++++++----------- nova/templates/bin/_health-probe.py.tpl | 31 ++++++++++++++++++++ nova/templates/daemonset-compute.yaml | 8 +++--- nova/templates/deployment-conductor.yaml | 8 +++--- nova/templates/deployment-consoleauth.yaml | 8 +++--- nova/templates/deployment-scheduler.yaml | 8 +++--- 8 files changed, 97 insertions(+), 36 deletions(-) diff --git a/neutron/templates/bin/_health-probe.py.tpl b/neutron/templates/bin/_health-probe.py.tpl index 284163af..0aa4a564 100644 --- a/neutron/templates/bin/_health-probe.py.tpl +++ b/neutron/templates/bin/_health-probe.py.tpl @@ -39,8 +39,10 @@ Usage example for Neutron metadata agent: import httplib2 from six.moves import http_client as httplib +import json import os import psutil +import signal import socket import sys @@ -292,8 +294,36 @@ def test_rpc_liveness(): check_agent_status(transport) +def check_pid_running(pid): + if psutil.pid_exists(int(pid)): + return True + else: + return False if __name__ == "__main__": + + if "liveness-probe" in ','.join(sys.argv): + pidfile 
= "/tmp/liveness.pid" #nosec + else: + pidfile = "/tmp/readiness.pid" #nosec + data = {} + if os.path.isfile(pidfile): + with open(pidfile,'r') as f: + data = json.load(f) + if check_pid_running(data['pid']): + if data['exit_count'] > 1: + # Third time in, kill the previous process + os.kill(int(data['pid']), signal.SIGTERM) + else: + data['exit_count'] = data['exit_count'] + 1 + with open(pidfile, 'w') as f: + json.dump(data, f) + sys.exit(0) + data['pid'] = os.getpid() + data['exit_count'] = 0 + with open(pidfile, 'w') as f: + json.dump(data, f) + if "sriov_agent.ini" in ','.join(sys.argv): sriov_readiness_check() elif "metadata_agent.ini" not in ','.join(sys.argv): diff --git a/neutron/templates/daemonset-l2gw-agent.yaml b/neutron/templates/daemonset-l2gw-agent.yaml index 50daf29a..3f673990 100644 --- a/neutron/templates/daemonset-l2gw-agent.yaml +++ b/neutron/templates/daemonset-l2gw-agent.yaml @@ -81,8 +81,8 @@ spec: - --use-fqdn {{- end }} initialDelaySeconds: 30 - periodSeconds: 15 - timeoutSeconds: 65 + periodSeconds: 190 + timeoutSeconds: 185 livenessProbe: exec: command: @@ -99,8 +99,8 @@ spec: - --use-fqdn {{- end }} initialDelaySeconds: 120 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 600 + timeoutSeconds: 580 command: - /tmp/neutron-l2gw-agent.sh volumeMounts: diff --git a/neutron/values.yaml b/neutron/values.yaml index 916b4e88..3694099b 100644 --- a/neutron/values.yaml +++ b/neutron/values.yaml @@ -341,28 +341,28 @@ pod: enabled: true params: initialDelaySeconds: 30 - periodSeconds: 15 - timeoutSeconds: 65 + periodSeconds: 190 + timeoutSeconds: 185 liveness: enabled: true params: initialDelaySeconds: 120 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 600 + timeoutSeconds: 580 l3_agent: l3_agent: readiness: enabled: true params: initialDelaySeconds: 30 - periodSeconds: 15 - timeoutSeconds: 65 + periodSeconds: 190 + timeoutSeconds: 185 liveness: enabled: true params: initialDelaySeconds: 120 - periodSeconds: 90 - timeoutSeconds: 
70 + periodSeconds: 600 + timeoutSeconds: 580 lb_agent: lb_agent: readiness: @@ -373,14 +373,14 @@ pod: enabled: true params: initialDelaySeconds: 30 - periodSeconds: 15 - timeoutSeconds: 65 + periodSeconds: 190 + timeoutSeconds: 185 liveness: enabled: true params: initialDelaySeconds: 120 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 600 + timeoutSeconds: 580 ovs_agent: ovs_agent: readiness: @@ -390,16 +390,16 @@ pod: enabled: true params: initialDelaySeconds: 120 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 600 + timeoutSeconds: 580 sriov_agent: sriov_agent: readiness: enabled: true params: initialDelaySeconds: 30 - periodSeconds: 15 - timeoutSeconds: 65 + periodSeconds: 190 + timeoutSeconds: 185 server: server: readiness: diff --git a/nova/templates/bin/_health-probe.py.tpl b/nova/templates/bin/_health-probe.py.tpl index d78e7013..d1127fb9 100644 --- a/nova/templates/bin/_health-probe.py.tpl +++ b/nova/templates/bin/_health-probe.py.tpl @@ -33,7 +33,10 @@ Usage example for Nova Compute: """ +import json +import os import psutil +import signal import socket import sys @@ -218,8 +221,36 @@ def test_rpc_liveness(): check_service_status(transport) +def check_pid_running(pid): + if psutil.pid_exists(int(pid)): + return True + else: + return False if __name__ == "__main__": + + if "liveness-probe" in ','.join(sys.argv): + pidfile = "/tmp/liveness.pid" #nosec + else: + pidfile = "/tmp/readiness.pid" #nosec + data = {} + if os.path.isfile(pidfile): + with open(pidfile,'r') as f: + data = json.load(f) + if check_pid_running(data['pid']): + if data['exit_count'] > 1: + # Third time in, kill the previous process + os.kill(int(data['pid']), signal.SIGTERM) + else: + data['exit_count'] = data['exit_count'] + 1 + with open(pidfile, 'w') as f: + json.dump(data, f) + sys.exit(0) + data['pid'] = os.getpid() + data['exit_count'] = 0 + with open(pidfile, 'w') as f: + json.dump(data, f) + test_rpc_liveness() sys.exit(0) # return success diff --git 
a/nova/templates/daemonset-compute.yaml b/nova/templates/daemonset-compute.yaml index 43e53d72..fd1f37f4 100644 --- a/nova/templates/daemonset-compute.yaml +++ b/nova/templates/daemonset-compute.yaml @@ -203,8 +203,8 @@ spec: - --use-fqdn {{- end }} initialDelaySeconds: 80 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 190 + timeoutSeconds: 185 livenessProbe: exec: command: @@ -219,8 +219,8 @@ spec: - --use-fqdn {{- end }} initialDelaySeconds: 120 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 600 + timeoutSeconds: 580 command: - /tmp/nova-compute.sh volumeMounts: diff --git a/nova/templates/deployment-conductor.yaml b/nova/templates/deployment-conductor.yaml index f927afa6..ad511646 100644 --- a/nova/templates/deployment-conductor.yaml +++ b/nova/templates/deployment-conductor.yaml @@ -69,8 +69,8 @@ spec: - --service-queue-name - conductor initialDelaySeconds: 80 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 190 + timeoutSeconds: 185 livenessProbe: exec: command: @@ -82,8 +82,8 @@ spec: - conductor - --liveness-probe initialDelaySeconds: 120 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 600 + timeoutSeconds: 580 command: - /tmp/nova-conductor.sh volumeMounts: diff --git a/nova/templates/deployment-consoleauth.yaml b/nova/templates/deployment-consoleauth.yaml index b9cb7173..ddeea338 100644 --- a/nova/templates/deployment-consoleauth.yaml +++ b/nova/templates/deployment-consoleauth.yaml @@ -69,8 +69,8 @@ spec: - --service-queue-name - consoleauth initialDelaySeconds: 80 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 190 + timeoutSeconds: 185 livenessProbe: exec: command: @@ -82,8 +82,8 @@ spec: - consoleauth - --liveness-probe initialDelaySeconds: 120 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 600 + timeoutSeconds: 580 command: - /tmp/nova-consoleauth.sh volumeMounts: diff --git a/nova/templates/deployment-scheduler.yaml b/nova/templates/deployment-scheduler.yaml index cb9e9df3..f2d5055e 100644 
--- a/nova/templates/deployment-scheduler.yaml +++ b/nova/templates/deployment-scheduler.yaml @@ -69,8 +69,8 @@ spec: - --service-queue-name - scheduler initialDelaySeconds: 80 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 190 + timeoutSeconds: 185 livenessProbe: exec: command: @@ -82,8 +82,8 @@ spec: - scheduler - --liveness-probe initialDelaySeconds: 120 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 600 + timeoutSeconds: 580 command: - /tmp/nova-scheduler.sh volumeMounts: