	[system-health] No longer check critical process/service status via monit (#9068)
HLD updated here: https://github.com/Azure/SONiC/pull/887

#### Why I did it
The command `monit summary -B` can no longer display the status of each critical process, so system-health should not depend on it and needs another way to monitor critical process status. This PR addresses that. monit is still used by system-health for the file system check as well as user-defined checks.

#### How I did it
1. Get container names from the FEATURE table.
2. For each container, collect critical process names from its critical_processes file.
3. Run `docker exec -it <container_name> bash -c 'supervisorctl status'` to get the process status inside the container, parse the output, and check whether any critical process has exited (see the sketch after this list).

#### How to verify it
1. Add unit test cases to cover it.
2. Adjust sonic-mgmt test cases to cover it.
3. Manual test.
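As a rough illustration of step 3, here is a minimal sketch (not part of the PR; the helper and the example container/process names are hypothetical) of how the `supervisorctl status` output inside a container could be parsed into a process-to-state map, which is essentially what `ServiceChecker._parse_supervisorctl_status` does in the diff below.

```python
# Minimal sketch, assuming Docker and supervisord are available on the host.
# Helper name and example names are illustrative only.
import subprocess

def get_container_process_states(container_name):
    """Run 'supervisorctl status' inside a container and return {process: state}."""
    cmd = ['docker', 'exec', container_name, 'bash', '-c', 'supervisorctl status']
    result = subprocess.run(cmd, capture_output=True, text=True)
    states = {}
    for line in result.stdout.splitlines():
        items = line.split()
        # Expected line format: "snmpd    RUNNING   pid 67, uptime 1:03:56"
        if len(items) >= 2:
            states[items[0]] = items[1]
    return states

# Any critical process that is not RUNNING is reported as unhealthy.
states = get_container_process_states('snmp')
for process in ['snmpd', 'snmp-subagent']:
    if states.get(process) != 'RUNNING':
        print("'{}' is not running".format(process))
```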
		| @@ -4,6 +4,7 @@ SYSTEM_HEALTH = system_health-1.0-py3-none-any.whl | ||||
| $(SYSTEM_HEALTH)_SRC_PATH = $(SRC_PATH)/system-health | ||||
| $(SYSTEM_HEALTH)_PYTHON_VERSION = 3 | ||||
| $(SYSTEM_HEALTH)_DEPENDS = $(SONIC_PY_COMMON_PY3) $(SONIC_CONFIG_ENGINE_PY3) | ||||
| $(SYSTEM_HEALTH)_DEBS_DEPENDS = $(LIBSWSSCOMMON) $(PYTHON3_SWSSCOMMON) | ||||
| SONIC_PYTHON_WHEELS += $(SYSTEM_HEALTH) | ||||
|  | ||||
| export system_health_py3_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))" | ||||
|   | ||||
| @@ -1,3 +1,11 @@ | ||||
| from . import utils | ||||
| from .config import Config | ||||
| from .health_checker import HealthChecker | ||||
| from .service_checker import ServiceChecker | ||||
| from .hardware_checker import HardwareChecker | ||||
| from .user_defined_checker import UserDefinedChecker | ||||
|  | ||||
|  | ||||
| class HealthCheckerManager(object): | ||||
|     """ | ||||
|     Manage all system health checkers and system health configuration. | ||||
| @@ -10,7 +18,6 @@ class HealthCheckerManager(object): | ||||
|         self._checkers = [] | ||||
|         self._state = self.STATE_BOOTING | ||||
|  | ||||
|         from .config import Config | ||||
|         self.config = Config() | ||||
|         self.initialize() | ||||
|  | ||||
| @@ -19,8 +26,6 @@ class HealthCheckerManager(object): | ||||
|         Initialize the manager. Create service checker and hardware checker by default. | ||||
|         :return: | ||||
|         """ | ||||
|         from .service_checker import ServiceChecker | ||||
|         from .hardware_checker import HardwareChecker | ||||
|         self._checkers.append(ServiceChecker()) | ||||
|         self._checkers.append(HardwareChecker()) | ||||
|  | ||||
| @@ -31,7 +36,6 @@ class HealthCheckerManager(object): | ||||
|         :return: A tuple. The first element indicates the status of the checker; the second element is a dictionary that | ||||
|         contains the status for all objects that were checked. | ||||
|         """ | ||||
|         from .health_checker import HealthChecker | ||||
|         HealthChecker.summary = HealthChecker.STATUS_OK | ||||
|         stats = {} | ||||
|         self.config.load_config() | ||||
| @@ -45,7 +49,6 @@ class HealthCheckerManager(object): | ||||
|             self._do_check(checker, stats) | ||||
|  | ||||
|         if self.config.user_defined_checkers: | ||||
|             from .user_defined_checker import UserDefinedChecker | ||||
|             for udc in self.config.user_defined_checkers: | ||||
|                 checker = UserDefinedChecker(udc) | ||||
|                 self._do_check(checker, stats) | ||||
| @@ -71,11 +74,12 @@ class HealthCheckerManager(object): | ||||
|             else: | ||||
|                 stats[category].update(info) | ||||
|         except Exception as e: | ||||
|             from .health_checker import HealthChecker | ||||
|             HealthChecker.summary = HealthChecker.STATUS_NOT_OK | ||||
|             error_msg = 'Failed to perform health check for {} due to exception - {}'.format(checker, repr(e)) | ||||
|             entry = {str(checker): { | ||||
|                 HealthChecker.INFO_FIELD_OBJECT_STATUS: HealthChecker.STATUS_NOT_OK, | ||||
|                 HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg | ||||
|                 HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg, | ||||
|                 HealthChecker.INFO_FIELD_OBJECT_TYPE: "Internal" | ||||
|             }} | ||||
|             if 'Internal' not in stats: | ||||
|                 stats['Internal'] = entry | ||||
| @@ -83,8 +87,7 @@ class HealthCheckerManager(object): | ||||
|                 stats['Internal'].update(entry) | ||||
|  | ||||
|     def _is_system_booting(self): | ||||
|         from .utils import get_uptime | ||||
|         uptime = get_uptime() | ||||
|         uptime = utils.get_uptime() | ||||
|         if not self.boot_timeout: | ||||
|             self.boot_timeout = self.config.get_bootup_timeout() | ||||
|         booting = uptime < self.boot_timeout | ||||
|   | ||||
| @@ -1,12 +1,31 @@ | ||||
| import docker | ||||
| import os | ||||
| import pickle | ||||
| import re | ||||
|  | ||||
| from swsscommon import swsscommon | ||||
| from sonic_py_common import multi_asic | ||||
| from sonic_py_common.logger import Logger | ||||
| from .health_checker import HealthChecker | ||||
| from . import utils | ||||
|  | ||||
| SYSLOG_IDENTIFIER = 'service_checker' | ||||
| logger = Logger(log_identifier=SYSLOG_IDENTIFIER) | ||||
|  | ||||
|  | ||||
| class ServiceChecker(HealthChecker): | ||||
|     """ | ||||
|     Checker that checks critical system service status via monit service. | ||||
|     """ | ||||
|  | ||||
|     # Cache file to save container_critical_processes | ||||
|     CRITICAL_PROCESS_CACHE = '/tmp/critical_process_cache' | ||||
|  | ||||
|     CRITICAL_PROCESSES_PATH = 'etc/supervisor/critical_processes' | ||||
|  | ||||
|     # Command to get merged directory of a container | ||||
|     GET_CONTAINER_FOLDER_CMD = 'docker inspect {} --format "{{{{.GraphDriver.Data.MergedDir}}}}"' | ||||
|  | ||||
|     # Command to query the status of monit service. | ||||
|     CHECK_MONIT_SERVICE_CMD = 'systemctl is-active monit.service' | ||||
|  | ||||
| @@ -24,6 +43,160 @@ class ServiceChecker(HealthChecker): | ||||
|  | ||||
|     def __init__(self): | ||||
|         HealthChecker.__init__(self) | ||||
|         self.container_critical_processes = {} | ||||
|         # Containers that have an invalid critical_processes file | ||||
|         self.bad_containers = set() | ||||
|  | ||||
|         self.container_feature_dict = {} | ||||
|  | ||||
|         self.need_save_cache = False | ||||
|  | ||||
|         self.load_critical_process_cache() | ||||
|  | ||||
|     def get_expected_running_containers(self, feature_table): | ||||
|         """Get a set of containers that are expected to be running on SONiC | ||||
|  | ||||
|         Args: | ||||
|             feature_table (object): FEATURE table in CONFIG_DB | ||||
|  | ||||
|         Returns: | ||||
|             expected_running_containers: A set of container names that are expected to be running | ||||
|             container_feature_dict: A dictionary {<container_name>:<feature_name>} | ||||
|         """ | ||||
|         expected_running_containers = set() | ||||
|         container_feature_dict = {} | ||||
|         for feature_name, feature_entry in feature_table.items(): | ||||
|             if feature_entry["state"] not in ["disabled", "always_disabled"]: | ||||
|                 if multi_asic.is_multi_asic(): | ||||
|                     if feature_entry["has_global_scope"] == "True": | ||||
|                         expected_running_containers.add(feature_name) | ||||
|                         container_feature_dict[feature_name] = feature_name | ||||
|                     if feature_entry["has_per_asic_scope"] == "True": | ||||
|                         num_asics = multi_asic.get_num_asics() | ||||
|                         for asic_id in range(num_asics): | ||||
|                             expected_running_containers.add(feature_name + str(asic_id)) | ||||
|                             container_feature_dict[feature_name + str(asic_id)] = feature_name | ||||
|                 else: | ||||
|                     expected_running_containers.add(feature_name) | ||||
|                     container_feature_dict[feature_name] = feature_name | ||||
|  | ||||
|         return expected_running_containers, container_feature_dict | ||||
|  | ||||
|     def get_current_running_containers(self): | ||||
|         """Get currently running containers; if a running container is not in self.container_critical_processes, | ||||
|            try to get its critical process list | ||||
|  | ||||
|         Returns: | ||||
|             running_containers: A set of running container names | ||||
|         """ | ||||
|         DOCKER_CLIENT = docker.DockerClient(base_url='unix://var/run/docker.sock') | ||||
|         running_containers = set() | ||||
|         ctrs = DOCKER_CLIENT.containers | ||||
|         try: | ||||
|             lst = ctrs.list(filters={"status": "running"}) | ||||
|  | ||||
|             for ctr in lst: | ||||
|                 running_containers.add(ctr.name) | ||||
|                 if ctr.name not in self.container_critical_processes: | ||||
|                     self.fill_critical_process_by_container(ctr.name) | ||||
|         except docker.errors.APIError as err: | ||||
|             logger.log_error("Failed to retrieve the running container list. Error: '{}'".format(err)) | ||||
|  | ||||
|         return running_containers | ||||
|  | ||||
|     def get_critical_process_list_from_file(self, container, critical_processes_file): | ||||
|         """Read critical process name list from critical processes file | ||||
|  | ||||
|         Args: | ||||
|             container (str): container name | ||||
|             critical_processes_file (str): critical processes file path | ||||
|  | ||||
|         Returns: | ||||
|             critical_process_list: A list of critical process names | ||||
|         """ | ||||
|         critical_process_list = [] | ||||
|  | ||||
|         with open(critical_processes_file, 'r') as file: | ||||
|             for line in file: | ||||
|                 # Try to match a line like "program:<process_name>" | ||||
|                 match = re.match(r"^\s*((.+):(.*))*\s*$", line) | ||||
|                 if match is None: | ||||
|                     if container not in self.bad_containers: | ||||
|                         self.bad_containers.add(container) | ||||
|                         logger.log_error('Invalid syntax in critical_processes file of {}'.format(container)) | ||||
|                     continue | ||||
|  | ||||
|                 identifier_key = match.group(2).strip() | ||||
|                 identifier_value = match.group(3).strip() | ||||
|                 if identifier_key == "program" and identifier_value: | ||||
|                     critical_process_list.append(identifier_value) | ||||
|  | ||||
|         return critical_process_list | ||||
|  | ||||
|     def fill_critical_process_by_container(self, container): | ||||
|         """Get critical process for a given container | ||||
|  | ||||
|         Args: | ||||
|             container (str): container name | ||||
|         """ | ||||
|         # Get container volume folder | ||||
|         container_folder = self._get_container_folder(container) | ||||
|         if not container_folder: | ||||
|             logger.log_error('Failed to get container folder for {}'.format(container)) | ||||
|             return | ||||
|  | ||||
|         if not os.path.exists(container_folder): | ||||
|             logger.log_error('Container folder does not exist: {}'.format(container_folder)) | ||||
|             return | ||||
|  | ||||
|         # Get critical_processes file path | ||||
|         critical_processes_file = os.path.join(container_folder, ServiceChecker.CRITICAL_PROCESSES_PATH) | ||||
|         if not os.path.isfile(critical_processes_file): | ||||
|             # Critical process file does not exist, the container has no critical processes. | ||||
|             logger.log_debug('Failed to get critical process file for {}, {} does not exist'.format(container, critical_processes_file)) | ||||
|             self._update_container_critical_processes(container, []) | ||||
|             return | ||||
|  | ||||
|         # Get critical process list from critical_processes | ||||
|         critical_process_list = self.get_critical_process_list_from_file(container, critical_processes_file) | ||||
|         self._update_container_critical_processes(container, critical_process_list) | ||||
|  | ||||
|     def _update_container_critical_processes(self, container, critical_process_list): | ||||
|         self.container_critical_processes[container] = critical_process_list | ||||
|         self.need_save_cache = True | ||||
|  | ||||
|     def _get_container_folder(self, container): | ||||
|         container_folder = utils.run_command(ServiceChecker.GET_CONTAINER_FOLDER_CMD.format(container)) | ||||
|         if container_folder is None: | ||||
|             return container_folder | ||||
|  | ||||
|         return container_folder.strip() | ||||
|  | ||||
|     def save_critical_process_cache(self): | ||||
|         """Save self.container_critical_processes to a cache file | ||||
|         """ | ||||
|         if not self.need_save_cache: | ||||
|             return | ||||
|  | ||||
|         self.need_save_cache = False | ||||
|         if not self.container_critical_processes: | ||||
|             # if container_critical_processes is empty, don't save it | ||||
|             return | ||||
|  | ||||
|         if os.path.exists(ServiceChecker.CRITICAL_PROCESS_CACHE): | ||||
|             # if cache file exists, remove it | ||||
|             os.remove(ServiceChecker.CRITICAL_PROCESS_CACHE) | ||||
|  | ||||
|         with open(ServiceChecker.CRITICAL_PROCESS_CACHE, 'wb+') as f: | ||||
|             pickle.dump(self.container_critical_processes, f) | ||||
|  | ||||
|     def load_critical_process_cache(self): | ||||
|         if not os.path.isfile(ServiceChecker.CRITICAL_PROCESS_CACHE): | ||||
|             # cache file does not exist | ||||
|             return | ||||
|  | ||||
|         with open(ServiceChecker.CRITICAL_PROCESS_CACHE, 'rb') as f: | ||||
|             self.container_critical_processes = pickle.load(f) | ||||
|  | ||||
|     def reset(self): | ||||
|         self._info = {} | ||||
| @@ -31,16 +204,14 @@ class ServiceChecker(HealthChecker): | ||||
|     def get_category(self): | ||||
|         return 'Services' | ||||
|  | ||||
|     def check(self, config): | ||||
|     def check_by_monit(self, config): | ||||
|         """ | ||||
|         Check critical system service status. Get and analyze the output of $CHECK_CMD, collect status for system,  | ||||
|         process and file system. | ||||
|         Get and analyze the output of $CHECK_CMD, collect status for file system or customized checkers if any. | ||||
|         :param config: Health checker configuration. | ||||
|         :return: | ||||
|         """ | ||||
|         self.reset() | ||||
|         output = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD).strip() | ||||
|         if output != 'active': | ||||
|         output = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD) | ||||
|         if not output or output.strip() != 'active': | ||||
|             self.set_object_not_ok('Service', 'monit', 'monit service is not running') | ||||
|             return | ||||
|  | ||||
| @@ -58,7 +229,7 @@ class ServiceChecker(HealthChecker): | ||||
|  | ||||
|         for line in lines[2:]: | ||||
|             name = line[0:status_begin].strip() | ||||
|             if config.ignore_services and name in config.ignore_services: | ||||
|             if config and config.ignore_services and name in config.ignore_services: | ||||
|                 continue | ||||
|             status = line[status_begin:type_begin].strip() | ||||
|             service_type = line[type_begin:].strip() | ||||
| @@ -70,3 +241,105 @@ class ServiceChecker(HealthChecker): | ||||
|             else: | ||||
|                 self.set_object_ok(service_type, name) | ||||
|         return | ||||
|  | ||||
|     def check_services(self, config): | ||||
|         """Check status of critical services and critical processes | ||||
|  | ||||
|         Args: | ||||
|             config (config.Config): Health checker configuration. | ||||
|         """ | ||||
|         config_db = swsscommon.ConfigDBConnector() | ||||
|         config_db.connect() | ||||
|         feature_table = config_db.get_table("FEATURE") | ||||
|         expected_running_containers, self.container_feature_dict = self.get_expected_running_containers(feature_table) | ||||
|         current_running_containers = self.get_current_running_containers() | ||||
|  | ||||
|         newly_disabled_containers = set(self.container_critical_processes.keys()).difference(expected_running_containers) | ||||
|         for newly_disabled_container in newly_disabled_containers: | ||||
|             self.container_critical_processes.pop(newly_disabled_container) | ||||
|  | ||||
|         self.save_critical_process_cache() | ||||
|  | ||||
|         not_running_containers = expected_running_containers.difference(current_running_containers) | ||||
|         for container in not_running_containers: | ||||
|             self.set_object_not_ok('Service', container, "Container '{}' is not running".format(container)) | ||||
|  | ||||
|         if not self.container_critical_processes: | ||||
|             # Critical process list is empty, which is not expected | ||||
|             self.set_object_not_ok('Service', 'system', 'no critical process found') | ||||
|             return | ||||
|  | ||||
|         for container, critical_process_list in self.container_critical_processes.items(): | ||||
|             self.check_process_existence(container, critical_process_list, config, feature_table) | ||||
|  | ||||
|         for bad_container in self.bad_containers: | ||||
|             self.set_object_not_ok('Service', bad_container, 'Syntax of critical_processes file is incorrect') | ||||
|  | ||||
|     def check(self, config): | ||||
|         """Check critical system service status. | ||||
|  | ||||
|         Args: | ||||
|             config (object): Health checker configuration. | ||||
|         """ | ||||
|         self.reset() | ||||
|         self.check_by_monit(config) | ||||
|         self.check_services(config) | ||||
|  | ||||
|  | ||||
|     def _parse_supervisorctl_status(self, process_status): | ||||
|         """Expected input: | ||||
|             arp_update                       RUNNING   pid 67, uptime 1:03:56 | ||||
|             buffermgrd                       RUNNING   pid 81, uptime 1:03:56 | ||||
|  | ||||
|         Args: | ||||
|             process_status (list): List of process status | ||||
|         """ | ||||
|         data = {} | ||||
|         for line in process_status: | ||||
|             line = line.strip() | ||||
|             if not line: | ||||
|                 continue | ||||
|             items = line.split() | ||||
|             if len(items) < 2: | ||||
|                 continue | ||||
|             data[items[0].strip()] = items[1].strip() | ||||
|         return data | ||||
|  | ||||
|     def check_process_existence(self, container_name, critical_process_list, config, feature_table): | ||||
|         """Check whether the process in the specified container is running or not. | ||||
|  | ||||
|         Args: | ||||
|             container_name (str): Container name | ||||
|             critical_process_list (list): Critical processes | ||||
|             config (object): Health checker configuration. | ||||
|             feature_table (object): Feature table | ||||
|         """ | ||||
|         feature_name = self.container_feature_dict[container_name] | ||||
|         if feature_name in feature_table: | ||||
|             # We look into the 'FEATURE' table to verify whether the container is disabled or not. | ||||
|             # If the container is disabled, we exit. | ||||
|             if ("state" in feature_table[feature_name] | ||||
|                     and feature_table[feature_name]["state"] not in ["disabled", "always_disabled"]): | ||||
|  | ||||
|                 # We are using supervisorctl status to check the critical process status. We cannot leverage psutil here because | ||||
|                 # it is not always possible to get the process cmdline from supervisor.conf. E.g., the cmdline of orchagent is "/usr/bin/orchagent", | ||||
|                 # however, in supervisor.conf it is "/usr/bin/orchagent.sh" | ||||
|                 cmd = 'docker exec {} bash -c "supervisorctl status"'.format(container_name) | ||||
|                 process_status = utils.run_command(cmd) | ||||
|                 if process_status is None: | ||||
|                     for process_name in critical_process_list: | ||||
|                         self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name)) | ||||
|                     return | ||||
|  | ||||
|                 process_status = self._parse_supervisorctl_status(process_status.strip().splitlines()) | ||||
|                 for process_name in critical_process_list: | ||||
|                     if config and config.ignore_services and process_name in config.ignore_services: | ||||
|                         continue | ||||
|  | ||||
|                     # Sometimes process_name is in the critical_processes file but not in supervisor.conf; such a process will not run in the container | ||||
|                     # and it is safe to ignore it, e.g. radv. So here we only check those processes that are in process_status. | ||||
|                     if process_name in process_status: | ||||
|                         if process_status[process_name] != 'RUNNING': | ||||
|                             self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name)) | ||||
|                         else: | ||||
|                             self.set_object_ok('Process', '{}:{}'.format(container_name, process_name)) | ||||
|   | ||||
| @@ -3,6 +3,7 @@ from setuptools import setup | ||||
| dependencies = [ | ||||
|     'natsort', | ||||
|     'sonic_py_common', | ||||
|     'docker' | ||||
| ] | ||||
|  | ||||
| setup( | ||||
|   | ||||
| @@ -0,0 +1,2 @@ | ||||
| program:snmpd | ||||
| program:snmp-subagent | ||||
							
								
								
									
src/system-health/tests/system_health_monitoring_config.json (new file, 11 lines)
							| @@ -0,0 +1,11 @@ | ||||
| { | ||||
|     "services_to_ignore": ["dummy_service"], | ||||
|     "devices_to_ignore": ["psu.voltage"], | ||||
|     "user_defined_checkers": [], | ||||
|     "polling_interval": 60, | ||||
|     "led_color": { | ||||
|         "fault": "orange", | ||||
|         "normal": "green", | ||||
|         "booting": "orange_blink" | ||||
|     } | ||||
| } | ||||
| @@ -8,6 +8,7 @@ | ||||
|         2. HealthCheckerManager | ||||
|         3. Config | ||||
| """ | ||||
| import copy | ||||
| import os | ||||
| import sys | ||||
| from swsscommon import swsscommon | ||||
| @@ -30,11 +31,21 @@ from health_checker.manager import HealthCheckerManager | ||||
| from health_checker.service_checker import ServiceChecker | ||||
| from health_checker.user_defined_checker import UserDefinedChecker | ||||
|  | ||||
| mock_supervisorctl_output = """ | ||||
| snmpd                       RUNNING   pid 67, uptime 1:03:56 | ||||
| snmp-subagent               EXITED    Oct 19 01:53 AM | ||||
| """ | ||||
| device_info.get_platform = MagicMock(return_value='unittest') | ||||
|  | ||||
|  | ||||
| def test_user_defined_checker(): | ||||
|     utils.run_command = MagicMock(return_value='') | ||||
| def setup(): | ||||
|     if os.path.exists(ServiceChecker.CRITICAL_PROCESS_CACHE): | ||||
|         os.remove(ServiceChecker.CRITICAL_PROCESS_CACHE) | ||||
|  | ||||
|  | ||||
| @patch('health_checker.utils.run_command') | ||||
| def test_user_defined_checker(mock_run): | ||||
|     mock_run.return_value = '' | ||||
|  | ||||
|     checker = UserDefinedChecker('') | ||||
|     checker.check(None) | ||||
| @@ -43,29 +54,195 @@ def test_user_defined_checker(): | ||||
|     checker.reset() | ||||
|     assert len(checker._info) == 0 | ||||
|  | ||||
|     utils.run_command = MagicMock(return_value='\n\n\n') | ||||
|     mock_run.return_value = '\n\n\n' | ||||
|     checker.check(None) | ||||
|     assert checker._info[str(checker)][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK | ||||
|  | ||||
|     valid_output = 'MyCategory\nDevice1:OK\nDevice2:Device2 is broken\n' | ||||
|     utils.run_command = MagicMock(return_value=valid_output) | ||||
|     mock_run.return_value = valid_output | ||||
|     checker.check(None) | ||||
|     assert checker.get_category() == 'MyCategory' | ||||
|     assert 'Device1' in checker._info | ||||
|     assert 'Device2' in checker._info | ||||
|     assert checker._info['Device1'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK | ||||
|     assert checker._info['Device2'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK | ||||
|  | ||||
|  | ||||
| def test_service_checker(): | ||||
|     return_value = '' | ||||
| @patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock()) | ||||
| @patch('health_checker.service_checker.ServiceChecker._get_container_folder', MagicMock(return_value=test_path)) | ||||
| @patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=False)) | ||||
| @patch('docker.DockerClient') | ||||
| @patch('health_checker.utils.run_command') | ||||
| @patch('swsscommon.swsscommon.ConfigDBConnector') | ||||
| def test_service_checker_single_asic(mock_config_db, mock_run, mock_docker_client): | ||||
|     mock_db_data = MagicMock() | ||||
|     mock_get_table = MagicMock() | ||||
|     mock_db_data.get_table = mock_get_table | ||||
|     mock_config_db.return_value = mock_db_data | ||||
|     mock_get_table.return_value = { | ||||
|         'snmp': { | ||||
|             'state': 'enabled', | ||||
|             'has_global_scope': 'True', | ||||
|             'has_per_asic_scope': 'False', | ||||
|  | ||||
|     def mock_run_command(cmd): | ||||
|         if cmd == ServiceChecker.CHECK_MONIT_SERVICE_CMD: | ||||
|             return 'active' | ||||
|         else: | ||||
|             return return_value | ||||
|         } | ||||
|     } | ||||
|     mock_containers = MagicMock() | ||||
|     mock_snmp_container = MagicMock() | ||||
|     mock_snmp_container.name = 'snmp' | ||||
|     mock_containers.list = MagicMock(return_value=[mock_snmp_container]) | ||||
|     mock_docker_client_object = MagicMock() | ||||
|     mock_docker_client.return_value = mock_docker_client_object | ||||
|     mock_docker_client_object.containers = mock_containers | ||||
|  | ||||
|     utils.run_command = mock_run_command | ||||
|     mock_run.return_value = mock_supervisorctl_output | ||||
|  | ||||
|     checker = ServiceChecker() | ||||
|     assert checker.get_category() == 'Services' | ||||
|     config = Config() | ||||
|     checker.check(config) | ||||
|     assert 'snmp:snmpd' in checker._info | ||||
|     assert checker._info['snmp:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK | ||||
|  | ||||
|     assert 'snmp:snmp-subagent' in checker._info | ||||
|     assert checker._info['snmp:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK | ||||
|  | ||||
|     mock_get_table.return_value = { | ||||
|         'new_service': { | ||||
|             'state': 'enabled', | ||||
|             'has_global_scope': 'True', | ||||
|             'has_per_asic_scope': 'False', | ||||
|         }, | ||||
|         'snmp': { | ||||
|             'state': 'enabled', | ||||
|             'has_global_scope': 'True', | ||||
|             'has_per_asic_scope': 'False', | ||||
|  | ||||
|         } | ||||
|     } | ||||
|     mock_ns_container = MagicMock() | ||||
|     mock_ns_container.name = 'new_service' | ||||
|     mock_containers.list = MagicMock(return_value=[mock_snmp_container, mock_ns_container]) | ||||
|     checker.check(config) | ||||
|     assert 'new_service' in checker.container_critical_processes | ||||
|  | ||||
|     assert 'new_service:snmpd' in checker._info | ||||
|     assert checker._info['new_service:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK | ||||
|  | ||||
|     assert 'new_service:snmp-subagent' in checker._info | ||||
|     assert checker._info['new_service:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK | ||||
|  | ||||
|     mock_containers.list = MagicMock(return_value=[mock_snmp_container]) | ||||
|     checker.check(config) | ||||
|     assert 'new_service' in checker._info | ||||
|     assert checker._info['new_service'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK | ||||
|  | ||||
|     mock_containers.list = MagicMock(return_value=[mock_snmp_container, mock_ns_container]) | ||||
|     mock_run.return_value = None | ||||
|     checker.check(config) | ||||
|     assert 'new_service:snmpd' in checker._info | ||||
|     assert checker._info['new_service:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK | ||||
|  | ||||
|     assert 'new_service:snmp-subagent' in checker._info | ||||
|     assert checker._info['new_service:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK | ||||
|  | ||||
|     origin_container_critical_processes = copy.deepcopy(checker.container_critical_processes) | ||||
|     checker.save_critical_process_cache() | ||||
|     checker.load_critical_process_cache() | ||||
|     assert origin_container_critical_processes == checker.container_critical_processes | ||||
|  | ||||
|  | ||||
|  | ||||
| @patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock()) | ||||
| @patch('health_checker.service_checker.ServiceChecker._get_container_folder', MagicMock(return_value=test_path)) | ||||
| @patch('health_checker.utils.run_command', MagicMock(return_value=mock_supervisorctl_output)) | ||||
| @patch('sonic_py_common.multi_asic.get_num_asics', MagicMock(return_value=3)) | ||||
| @patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=True)) | ||||
| @patch('sonic_py_common.multi_asic.get_namespace_list', MagicMock(return_value=[str(x) for x in range(3)])) | ||||
| @patch('sonic_py_common.multi_asic.get_current_namespace', MagicMock(return_value='')) | ||||
| @patch('docker.DockerClient') | ||||
| @patch('swsscommon.swsscommon.ConfigDBConnector') | ||||
| def test_service_checker_multi_asic(mock_config_db, mock_docker_client): | ||||
|     mock_db_data = MagicMock() | ||||
|     mock_db_data.get_table = MagicMock() | ||||
|     mock_config_db.return_value = mock_db_data | ||||
|  | ||||
|     mock_db_data.get_table.return_value = { | ||||
|         'snmp': { | ||||
|             'state': 'enabled', | ||||
|             'has_global_scope': 'True', | ||||
|             'has_per_asic_scope': 'True', | ||||
|  | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     mock_containers = MagicMock() | ||||
|     mock_snmp_container = MagicMock() | ||||
|     mock_snmp_container.name = 'snmp' | ||||
|     list_return_value = [mock_snmp_container] | ||||
|     for i in range(3): | ||||
|         mock_container = MagicMock() | ||||
|         mock_container.name = 'snmp' + str(i) | ||||
|         list_return_value.append(mock_container) | ||||
|  | ||||
|     mock_containers.list = MagicMock(return_value=list_return_value) | ||||
|     mock_docker_client_object = MagicMock() | ||||
|     mock_docker_client.return_value = mock_docker_client_object | ||||
|     mock_docker_client_object.containers = mock_containers | ||||
|  | ||||
|     checker = ServiceChecker() | ||||
|  | ||||
|     config = Config() | ||||
|     checker.check(config) | ||||
|     assert 'snmp' in checker.container_critical_processes | ||||
|     assert 'snmp:snmpd' in checker._info | ||||
|     assert checker._info['snmp:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK | ||||
|     assert 'snmp0:snmpd' in checker._info | ||||
|     assert checker._info['snmp0:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK | ||||
|     assert 'snmp1:snmpd' in checker._info | ||||
|     assert checker._info['snmp1:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK | ||||
|     assert 'snmp2:snmpd' in checker._info | ||||
|     assert checker._info['snmp2:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK | ||||
|  | ||||
|     assert 'snmp:snmp-subagent' in checker._info | ||||
|     assert checker._info['snmp:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK | ||||
|     assert 'snmp0:snmp-subagent' in checker._info | ||||
|     assert checker._info['snmp0:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK | ||||
|     assert 'snmp1:snmp-subagent' in checker._info | ||||
|     assert checker._info['snmp1:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK | ||||
|     assert 'snmp2:snmp-subagent' in checker._info | ||||
|     assert checker._info['snmp2:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK | ||||
|  | ||||
|  | ||||
| @patch('swsscommon.swsscommon.ConfigDBConnector', MagicMock()) | ||||
| @patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock()) | ||||
| @patch('health_checker.service_checker.ServiceChecker.check_by_monit', MagicMock()) | ||||
| @patch('docker.DockerClient') | ||||
| @patch('swsscommon.swsscommon.ConfigDBConnector.get_table') | ||||
| def test_service_checker_no_critical_process(mock_get_table, mock_docker_client): | ||||
|     mock_get_table.return_value = { | ||||
|         'snmp': { | ||||
|             'state': 'enabled', | ||||
|             'has_global_scope': 'True', | ||||
|             'has_per_asic_scope': 'True', | ||||
|  | ||||
|         } | ||||
|     } | ||||
|     mock_containers = MagicMock() | ||||
|     mock_containers.list = MagicMock(return_value=[]) | ||||
|     mock_docker_client_object = MagicMock() | ||||
|     mock_docker_client.return_value = mock_docker_client_object | ||||
|     mock_docker_client_object.containers = mock_containers | ||||
|  | ||||
|     checker = ServiceChecker() | ||||
|     config = Config() | ||||
|     checker.check(config) | ||||
|     assert 'system' in checker._info | ||||
|     assert checker._info['system'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK | ||||
|  | ||||
| @patch('health_checker.service_checker.ServiceChecker.check_services', MagicMock()) | ||||
| @patch('health_checker.utils.run_command') | ||||
| def test_service_checker_check_by_monit(mock_run): | ||||
|     return_value = 'Monit 5.20.0 uptime: 3h 54m\n' \ | ||||
|                    'Service Name                     Status                      Type\n' \ | ||||
|                    'sonic                            Running                     System\n' \ | ||||
| @@ -74,7 +251,7 @@ def test_service_checker(): | ||||
|                    'orchagent                        Running                     Process\n' \ | ||||
|                    'root-overlay                     Accessible                  Filesystem\n' \ | ||||
|                    'var-log                          Is not accessible           Filesystem\n' | ||||
|  | ||||
|     mock_run.side_effect = ['active', return_value] | ||||
|     checker = ServiceChecker() | ||||
|     config = Config() | ||||
|     checker.check(config) | ||||
| @@ -185,6 +362,7 @@ def test_hardware_checker(): | ||||
|     }) | ||||
|  | ||||
|     checker = HardwareChecker() | ||||
|     assert checker.get_category() == 'Hardware' | ||||
|     config = Config() | ||||
|     checker.check(config) | ||||
|  | ||||
| @@ -217,3 +395,129 @@ def test_hardware_checker(): | ||||
|  | ||||
|     assert 'PSU 5' in checker._info | ||||
|     assert checker._info['PSU 5'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK | ||||
|  | ||||
|  | ||||
| def test_config(): | ||||
|     config = Config() | ||||
|     config._config_file = os.path.join(test_path, Config.CONFIG_FILE) | ||||
|  | ||||
|     assert config.config_file_exists() | ||||
|     config.load_config() | ||||
|     assert config.interval == 60 | ||||
|     assert 'dummy_service' in config.ignore_services | ||||
|     assert 'psu.voltage' in config.ignore_devices | ||||
|     assert len(config.user_defined_checkers) == 0 | ||||
|  | ||||
|     assert config.get_led_color('fault') == 'orange' | ||||
|     assert config.get_led_color('normal') == 'green' | ||||
|     assert config.get_led_color('booting') == 'orange_blink' | ||||
|     assert config.get_bootup_timeout() == 300 | ||||
|  | ||||
|     config._reset() | ||||
|     assert not config.ignore_services | ||||
|     assert not config.ignore_devices | ||||
|     assert not config.user_defined_checkers | ||||
|     assert not config.config_data | ||||
|  | ||||
|     assert config.get_led_color('fault') == 'red' | ||||
|     assert config.get_led_color('normal') == 'green' | ||||
|     assert config.get_led_color('booting') == 'orange_blink' | ||||
|  | ||||
|     config._last_mtime  = 1 | ||||
|     config._config_file = 'notExistFile' | ||||
|     config.load_config() | ||||
|     assert not config._last_mtime | ||||
|  | ||||
|  | ||||
| @patch('swsscommon.swsscommon.ConfigDBConnector', MagicMock()) | ||||
| @patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock()) | ||||
| @patch('health_checker.service_checker.ServiceChecker.check', MagicMock()) | ||||
| @patch('health_checker.hardware_checker.HardwareChecker.check', MagicMock()) | ||||
| @patch('health_checker.user_defined_checker.UserDefinedChecker.check', MagicMock()) | ||||
| @patch('swsscommon.swsscommon.ConfigDBConnector.get_table', MagicMock()) | ||||
| @patch('health_checker.user_defined_checker.UserDefinedChecker.get_category', MagicMock(return_value='UserDefine')) | ||||
| @patch('health_checker.user_defined_checker.UserDefinedChecker.get_info') | ||||
| @patch('health_checker.service_checker.ServiceChecker.get_info') | ||||
| @patch('health_checker.hardware_checker.HardwareChecker.get_info') | ||||
| @patch('health_checker.utils.get_uptime') | ||||
| def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info): | ||||
|     chassis = MagicMock() | ||||
|     chassis.set_status_led = MagicMock() | ||||
|  | ||||
|     manager = HealthCheckerManager() | ||||
|     manager.config.user_defined_checkers = ['some check'] | ||||
|     assert manager._state == HealthCheckerManager.STATE_BOOTING | ||||
|     assert len(manager._checkers) == 2 | ||||
|  | ||||
|     mock_uptime.return_value = 200 | ||||
|     assert manager._is_system_booting() | ||||
|     state, stat = manager.check(chassis) | ||||
|     assert state == HealthCheckerManager.STATE_BOOTING | ||||
|     assert len(stat) == 0 | ||||
|     chassis.set_status_led.assert_called_with('orange_blink') | ||||
|  | ||||
|     mock_uptime.return_value = 500 | ||||
|     assert not manager._is_system_booting() | ||||
|     assert manager._state == HealthCheckerManager.STATE_RUNNING | ||||
|     mock_hw_info.return_value = { | ||||
|         'ASIC': { | ||||
|             'type': 'ASIC', | ||||
|             'message': '', | ||||
|             'status': 'OK' | ||||
|         }, | ||||
|         'fan1': { | ||||
|             'type': 'Fan', | ||||
|             'message': '', | ||||
|             'status': 'OK' | ||||
|         }, | ||||
|     } | ||||
|     mock_service_info.return_value = { | ||||
|         'snmp:snmpd': { | ||||
|             'type': 'Process', | ||||
|             'message': '', | ||||
|             'status': 'OK' | ||||
|         } | ||||
|     } | ||||
|     mock_udc_info.return_value = { | ||||
|         'udc': { | ||||
|             'type': 'Database', | ||||
|             'message': '', | ||||
|             'status': 'OK' | ||||
|         } | ||||
|     } | ||||
|     state, stat = manager.check(chassis) | ||||
|     assert state == HealthCheckerManager.STATE_RUNNING | ||||
|     assert 'Services' in stat | ||||
|     assert stat['Services']['snmp:snmpd']['status'] == 'OK' | ||||
|  | ||||
|     assert 'Hardware' in stat | ||||
|     assert stat['Hardware']['ASIC']['status'] == 'OK' | ||||
|     assert stat['Hardware']['fan1']['status'] == 'OK' | ||||
|  | ||||
|     assert 'UserDefine' in stat | ||||
|     assert stat['UserDefine']['udc']['status'] == 'OK' | ||||
|  | ||||
|     mock_hw_info.side_effect = RuntimeError() | ||||
|     mock_service_info.side_effect = RuntimeError() | ||||
|     mock_udc_info.side_effect = RuntimeError() | ||||
|     state, stat = manager.check(chassis) | ||||
|     assert 'Internal' in stat | ||||
|     assert stat['Internal']['ServiceChecker']['status'] == 'Not OK' | ||||
|     assert stat['Internal']['HardwareChecker']['status'] == 'Not OK' | ||||
|     assert stat['Internal']['UserDefinedChecker - some check']['status'] == 'Not OK' | ||||
|  | ||||
|     chassis.set_status_led.side_effect = NotImplementedError() | ||||
|     manager._set_system_led(chassis, manager.config, 'normal') | ||||
|  | ||||
|     chassis.set_status_led.side_effect = RuntimeError() | ||||
|     manager._set_system_led(chassis, manager.config, 'normal') | ||||
|  | ||||
| def test_utils(): | ||||
|     output = utils.run_command('some invalid command') | ||||
|     assert not output | ||||
|  | ||||
|     output = utils.run_command('ls') | ||||
|     assert output | ||||
|  | ||||
|     uptime = utils.get_uptime() | ||||
|     assert uptime > 0 | ||||
|   | ||||