Skip to content

Commit 6fa2b06

Browse files
authored
[OOM test] Assert DUT reboot before starting critical process check (sonic-net#17654)
Summary: On some platforms we noticed that the DUT does not actually reboot after OOM, yet the reported test failure reason is that some critical processes are unhealthy. To catch this issue, assert that the DUT has rebooted before starting the critical process check. What is the motivation for this PR? On some platforms we noticed that the DUT does not actually reboot after OOM, yet the reported test failure reason is that some critical processes are unhealthy. How did you do it? Assert that the DUT has rebooted before starting the critical process check. How did you verify/test it? Verified on an Arista-720DT M0 testbed.
1 parent 588b4e7 commit 6fa2b06

File tree

1 file changed

+19
-41
lines changed

1 file changed

+19
-41
lines changed

tests/platform_tests/test_memory_exhaustion.py

+19-41
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,16 @@
22
import time
33
import pytest
44

5-
from tests.common.helpers.assertions import pytest_assert
6-
from tests.common.platform.processes_utils import wait_critical_processes
7-
from tests.common.reboot import SONIC_SSH_PORT, SONIC_SSH_REGEX, wait_for_startup
5+
from tests.common.platform.processes_utils import wait_critical_processes, get_critical_processes_status
6+
from tests.common.reboot import wait_for_startup
7+
from tests.common.utilities import wait_until
8+
from tests.common.errors import RunAnsibleModuleFail
89

910
pytestmark = [
1011
pytest.mark.disable_loganalyzer,
1112
pytest.mark.topology('any')
1213
]
1314

14-
SSH_SHUTDOWN_TIMEOUT = 480
15-
SSH_STARTUP_TIMEOUT = 600
16-
1715
SSH_STATE_ABSENT = "absent"
1816
SSH_STATE_STARTED = "started"
1917

@@ -37,25 +35,21 @@ def tearDown(self, duthosts, enum_rand_one_per_hwsku_hostname,
3735
# If the SSH connection is not established, or any critical process is exited,
3836
# try to recover the DUT by PDU reboot.
3937
duthost = duthosts[enum_rand_one_per_hwsku_hostname]
40-
dut_ip = duthost.mgmt_ip
4138
hostname = duthost.hostname
42-
if not self.check_ssh_state(localhost, dut_ip, SSH_STATE_STARTED):
39+
status, _ = get_critical_processes_status(duthost)
40+
if not status:
4341
if pdu_controller is None:
4442
logging.error("No PDU controller for {}, failed to recover DUT!".format(hostname))
4543
return
4644
self.pdu_reboot(pdu_controller)
47-
# Waiting for SSH connection startup
48-
pytest_assert(self.check_ssh_state(localhost, dut_ip, SSH_STATE_STARTED, SSH_STARTUP_TIMEOUT),
49-
'Recover {} by PDU reboot failed'.format(hostname))
5045
# Wait until all critical processes are healthy.
5146
wait_critical_processes(duthost)
5247
self.wait_lc_healthy_if_sup(duthost, duthosts, localhost)
5348

5449
def test_memory_exhaustion(self, duthosts, enum_rand_one_per_hwsku_hostname, localhost):
5550
duthost = duthosts[enum_rand_one_per_hwsku_hostname]
56-
dut_ip = duthost.mgmt_ip
5751
hostname = duthost.hostname
58-
dut_datetime = duthost.get_now_time()
52+
datetime_before_reboot = duthost.get_now_time()
5953

6054
# Our shell command is designed as 'nohup bash -c "sleep 5 && tail /dev/zero" &' because of:
6155
# * `tail /dev/zero` is used to run out of memory completely.
@@ -75,38 +69,22 @@ def test_memory_exhaustion(self, duthosts, enum_rand_one_per_hwsku_hostname, loc
7569
if not res.is_successful:
7670
pytest.fail('DUT {} run command {} failed'.format(hostname, cmd))
7771

78-
# Waiting for SSH connection shutdown
79-
pytest_assert(self.check_ssh_state(localhost, dut_ip, SSH_STATE_ABSENT, SSH_SHUTDOWN_TIMEOUT),
80-
'DUT {} did not shutdown'.format(hostname))
81-
# Waiting for SSH connection startup
82-
pytest_assert(self.check_ssh_state(localhost, dut_ip, SSH_STATE_STARTED, SSH_STARTUP_TIMEOUT),
83-
'DUT {} did not startup'.format(hostname))
72+
# Verify DUT triggered OOM reboot.
73+
self.wait_until_reboot(duthost, datetime_before_reboot)
8474
# Wait until all critical processes are healthy.
8575
wait_critical_processes(duthost)
8676
self.wait_lc_healthy_if_sup(duthost, duthosts, localhost)
87-
# Verify DUT uptime is later than the time when the test case started running.
88-
dut_uptime = duthost.get_up_time()
89-
pytest_assert(dut_uptime > dut_datetime, "Device {} did not reboot".format(hostname))
90-
91-
def check_ssh_state(self, localhost, dut_ip, expected_state, timeout=60):
92-
"""
93-
Check the SSH state of DUT.
9477

95-
:param localhost: A `tests.common.devices.local.Localhost` Object.
96-
:param dut_ip: A string, the IP address of DUT.
97-
:param expected_state: A string, the expected SSH state.
98-
:param timeout: An integer, the maximum number of seconds to wait for.
99-
:return: A boolean, True if SSH state is the same as expected
100-
, False otherwise.
101-
"""
102-
res = localhost.wait_for(host=dut_ip,
103-
port=SONIC_SSH_PORT,
104-
state=expected_state,
105-
search_regex=SONIC_SSH_REGEX,
106-
delay=10,
107-
timeout=timeout,
108-
module_ignore_errors=True)
109-
return not res.is_failed and 'Timeout' not in res.get('msg', '')
78+
def wait_until_reboot(self, duthost, datetime_before_reboot, timeout=600):
79+
def check_dut_rebooted(duthost, datetime_before_reboot):
80+
try:
81+
dut_up_datetime = duthost.get_up_time()
82+
except RunAnsibleModuleFail:
83+
# We may hit HostUnreachable issue during device reboot, so return False when
84+
# RunAnsibleModuleFail raised.
85+
return False
86+
return dut_up_datetime > datetime_before_reboot
87+
wait_until(timeout, 10, 0, check_dut_rebooted, duthost, datetime_before_reboot)
11088

11189
def pdu_reboot(self, pdu_controller):
11290
hostname = pdu_controller.dut_hostname

0 commit comments

Comments
 (0)