From c1cb292310dca70562ba87f710575955e718864c Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Thu, 14 Dec 2023 18:04:24 +0800 Subject: [PATCH] [Mellanox] implement platform wait in python code (#17398) - Why I did it New implementation of Nvidia platform_wait due to: 1. sysfs deprecated by hw-mgmt 2. new dependencies to SDK 3. For CMIS host management mode - How I did it wait hw-management ready wait SDK sysfs nodes ready - How to verify it manual test unit test sonic-mgmt regression --- .../x86_64-mlnx_msn2700-r0/platform_wait | 92 ++++++------------- .../sonic_platform/device_data.py | 28 +++++- .../mlnx-platform-api/sonic_platform/utils.py | 24 +++++ .../tests/test_device_data.py | 24 ++++- .../mlnx-platform-api/tests/test_utils.py | 7 ++ 5 files changed, 107 insertions(+), 68 deletions(-) diff --git a/device/mellanox/x86_64-mlnx_msn2700-r0/platform_wait b/device/mellanox/x86_64-mlnx_msn2700-r0/platform_wait index 080974868..ea76db07a 100755 --- a/device/mellanox/x86_64-mlnx_msn2700-r0/platform_wait +++ b/device/mellanox/x86_64-mlnx_msn2700-r0/platform_wait @@ -1,68 +1,32 @@ -#!/bin/bash +#!/usr/bin/python3 -declare -r SYSLOG_LOGGER="/usr/bin/logger" -declare -r SYSLOG_IDENTIFIER="platform_wait" -declare -r SYSLOG_ERROR="error" -declare -r SYSLOG_NOTICE="notice" -declare -r SYSLOG_INFO="info" +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# -declare -r HW_MGMT_CONFIG="/var/run/hw-management/config" +import sys +from sonic_platform.device_data import DeviceDataManager +from sonic_py_common.logger import Logger -declare -r ASIC_INIT_DONE="${HW_MGMT_CONFIG}/asics_init_done" -declare -r NUM_ASICS="${HW_MGMT_CONFIG}/asic_num" -declare -r ASIC_CHIPUP_COMPLETED="${HW_MGMT_CONFIG}/asic_chipup_completed" -declare -r EXIT_SUCCESS="0" -declare -r EXIT_TIMEOUT="1" - -function log_error() { - eval "${SYSLOG_LOGGER} -t ${SYSLOG_IDENTIFIER} -p ${SYSLOG_ERROR} $@" -} - -function log_notice() { - eval "${SYSLOG_LOGGER} -t ${SYSLOG_IDENTIFIER} -p ${SYSLOG_NOTICE} $@" -} - -function log_info() { - eval "${SYSLOG_LOGGER} -t ${SYSLOG_IDENTIFIER} -p ${SYSLOG_INFO} $@" -} - -function wait_for_asic_chipup() { - - local _ASIC_INIT="0" - local _ASIC_COUNT="0" - local _ASICS_CHIPUP="0" - - local -i _WDOG_CNT="1" - local -ir _WDOG_MAX="300" - - local -r _TIMEOUT="1s" - - while [[ "${_WDOG_CNT}" -le "${_WDOG_MAX}" ]]; do - _ASIC_INIT="$(cat ${ASIC_INIT_DONE} 2>&1)" - _ASIC_COUNT="$(cat ${NUM_ASICS} 2>&1)" - _ASICS_CHIPUP="$(cat ${ASIC_CHIPUP_COMPLETED} 2>&1)" - - if [[ "${_ASIC_INIT}" -eq 1 && "${_ASIC_COUNT}" -eq "${_ASICS_CHIPUP}" ]]; then - return "${EXIT_SUCCESS}" - fi - - let "_WDOG_CNT++" - sleep "${_TIMEOUT}" - done - - log_error "Mellanox ASIC is not ready: INIT: ${_ASIC_INIT}, NUM_ASIC: ${_ASIC_COUNT}, CHIPUP: ${_ASICS_CHIPUP} timeout...." - return "${EXIT_TIMEOUT}" -} - -log_info "Wait for Mellanox ASIC to be ready" - -wait_for_asic_chipup -EXIT_CODE="$?" 
@classmethod
def wait_platform_ready(cls):
    """
    Wait for Nvidia platform related services (SDK, hw-management) to be ready.

    The platform is considered ready when every expected sysfs node exists
    under /sys/module/sx_core/asic0/module<index>/ for each SFP index in
    range(get_sfp_count()). In independent module mode an extended set of
    nodes is required; otherwise hw-management must additionally report
    asics_init_done == 1.

    The conditions are polled through utils.wait_until_conditions with a
    timeout of 300 and an interval of 1.

    Returns:
        bool: True if wait success else timeout
    """
    conditions = []
    sysfs_nodes = ['power_mode', 'power_mode_policy', 'present', 'reset', 'status', 'statuserror']
    if cls.is_independent_mode():
        sysfs_nodes.extend(['control', 'frequency', 'frequency_support', 'hw_present', 'hw_reset',
                            'power_good', 'power_limit', 'power_on', 'temperature/input'])
    else:
        conditions.append(lambda: utils.read_int_from_file('/var/run/hw-management/config/asics_init_done') == 1)
    sfp_count = cls.get_sfp_count()
    for sfp_index in range(sfp_count):
        for sysfs_node in sysfs_nodes:
            node_path = f'/sys/module/sx_core/asic0/module{sfp_index}/{sysfs_node}'
            # Bind the path as a default argument: a plain closure would look
            # up the loop variables only when called, so every condition
            # would end up checking the last node of the last module.
            conditions.append(lambda path=node_path: os.path.exists(path))
    return utils.wait_until_conditions(conditions, 300, 1)
def wait_until_conditions(conditions, timeout, interval=1):
    """
    Wait until all the conditions become true

    The conditions are evaluated once up front and once more after every
    sleep, including the last one, so they are always checked at least once
    (even with a timeout of 0) and a condition that turns true during the
    final sleep is still honoured. A condition that has already returned a
    true value is not evaluated again. The caller's list is not modified.

    Only the time spent sleeping is counted against the timeout; time spent
    evaluating the conditions is not.

    Args:
        conditions (list): a list of callable which generate True|False
        timeout (int): wait time in seconds
        interval (int, optional): interval in seconds between two checks. Defaults to 1.

    Returns:
        bool: True if wait success else False
    """
    pending_conditions = list(conditions)
    while True:
        # Keep only the conditions that are still failing
        pending_conditions = [condition for condition in pending_conditions if not condition()]
        if not pending_conditions:
            return True
        if timeout <= 0:
            return False
        time.sleep(interval)
        timeout -= interval
@mock.patch('sonic_platform.utils.time.sleep', mock.MagicMock())
def test_wait_until_conditions(self):
    """Check utils.wait_until_conditions with time.sleep mocked out."""
    # A single satisfied condition succeeds
    conditions = [lambda: True]
    assert utils.wait_until_conditions(conditions, 1)

    # A condition that never becomes true times out
    conditions = [lambda: False]
    assert not utils.wait_until_conditions(conditions, 1)

    # One failing condition is enough to time out
    conditions = [lambda: True, lambda: False]
    assert not utils.wait_until_conditions(conditions, 1)

    # A condition that becomes true on a later poll succeeds, and a condition
    # that was already satisfied is not evaluated again
    ready = mock.MagicMock(return_value=True)
    late = mock.MagicMock(side_effect=[False, True])
    assert utils.wait_until_conditions([ready, late], 3)
    ready.assert_called_once()
    assert late.call_count == 2