Skip to content

Commit f53ff21

Browse files
authored
[202305] Handling exceptions in CMIS SM to prevent xcvrd crash (sonic-net#484)
202305 cherry-pick for sonic-net#483
1 parent 0694d33 commit f53ff21

File tree

2 files changed

+68
-16
lines changed

2 files changed

+68
-16
lines changed

sonic-xcvrd/tests/test_xcvrd.py

+52
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,58 @@ def test_CmisManagerTask_task_run_with_exception(self):
7373
assert("sonic-xcvrd/xcvrd/xcvrd.py" in str(trace))
7474
assert("wait_for_port_config_done" in str(trace))
7575

76+
@patch('xcvrd.xcvrd_utilities.port_mapping.subscribe_port_update_event', MagicMock(return_value=(None, None)))
77+
@patch('xcvrd.xcvrd_utilities.port_mapping.handle_port_update_event', MagicMock())
78+
@patch('xcvrd.xcvrd.CmisManagerTask.wait_for_port_config_done', MagicMock())
79+
@patch('xcvrd.xcvrd.log_exception_traceback')
80+
@patch('xcvrd.xcvrd.XcvrTableHelper.get_status_tbl')
81+
@patch('xcvrd.xcvrd.platform_chassis')
82+
def test_CmisManagerTask_get_xcvr_api_exception(self, mock_platform_chassis, mock_get_status_tbl, mock_log_exception_traceback):
83+
mock_get_status_tbl = Table("STATE_DB", TRANSCEIVER_STATUS_TABLE)
84+
mock_sfp = MagicMock()
85+
mock_sfp.get_presence.return_value = True
86+
mock_platform_chassis.get_sfp = MagicMock(return_value=mock_sfp)
87+
port_mapping = PortMapping()
88+
port_mapping
89+
stop_event = threading.Event()
90+
task = CmisManagerTask(DEFAULT_NAMESPACE, port_mapping, stop_event)
91+
task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True])
92+
task.get_host_tx_status = MagicMock(return_value='true')
93+
task.get_port_admin_status = MagicMock(return_value='up')
94+
task.get_cfg_port_tbl = MagicMock()
95+
task.xcvr_table_helper.get_status_tbl.return_value = mock_get_status_tbl
96+
task.get_cmis_application_desired = MagicMock(side_effect=KeyError)
97+
port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET,
98+
{'speed':'400000', 'lanes':'1,2,3,4,5,6,7,8'})
99+
100+
# Case 1: get_xcvr_api() raises an exception
101+
task.on_port_update_event(port_change_event)
102+
mock_sfp.get_xcvr_api = MagicMock(side_effect=NotImplementedError)
103+
task.task_worker()
104+
assert mock_log_exception_traceback.call_count == 1
105+
assert get_cmis_state_from_state_db('Ethernet0', task.xcvr_table_helper.get_status_tbl(task.port_mapping.get_asic_id_for_logical_port('Ethernet0'))) == CMIS_STATE_FAILED
106+
107+
# Case 2: is_flat_memory() raises AttributeError. In this case, CMIS SM should transition to READY state
108+
mock_xcvr_api = MagicMock()
109+
mock_sfp.get_xcvr_api = MagicMock(return_value=mock_xcvr_api)
110+
mock_xcvr_api.is_flat_memory = MagicMock(side_effect=AttributeError)
111+
task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True])
112+
task.on_port_update_event(port_change_event)
113+
task.task_worker()
114+
assert get_cmis_state_from_state_db('Ethernet0', task.xcvr_table_helper.get_status_tbl(task.port_mapping.get_asic_id_for_logical_port('Ethernet0'))) == CMIS_STATE_READY
115+
116+
# Case 3: get_cmis_application_desired() raises an exception
117+
mock_xcvr_api.is_flat_memory = MagicMock(return_value=False)
118+
mock_xcvr_api.is_coherent_module = MagicMock(return_value=False)
119+
mock_xcvr_api.get_module_type_abbreviation = MagicMock(return_value='QSFP-DD')
120+
task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True])
121+
task.on_port_update_event(port_change_event)
122+
task.get_cmis_host_lanes_mask = MagicMock()
123+
task.task_worker()
124+
assert mock_log_exception_traceback.call_count == 2
125+
assert get_cmis_state_from_state_db('Ethernet0', task.xcvr_table_helper.get_status_tbl(task.port_mapping.get_asic_id_for_logical_port('Ethernet0'))) == CMIS_STATE_FAILED
126+
assert task.get_cmis_host_lanes_mask.call_count == 0
127+
76128
@patch('xcvrd.xcvrd_utilities.port_mapping.subscribe_port_config_change', MagicMock(side_effect = NotImplementedError))
77129
def test_DomInfoUpdateTask_task_run_with_exception(self):
78130
port_mapping = PortMapping()

sonic-xcvrd/xcvrd/xcvrd.py

+16-16
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,12 @@
121121
#
122122
# Helper functions =============================================================
123123
#
124+
def log_exception_traceback():
125+
exc_type, exc_value, exc_traceback = sys.exc_info()
126+
msg = traceback.format_exception(exc_type, exc_value, exc_traceback)
127+
for tb_line in msg:
128+
for tb_line_split in tb_line.splitlines():
129+
helper_logger.log_error(tb_line_split)
124130

125131
# Get physical port name
126132

@@ -1540,6 +1546,11 @@ def task_worker(self):
15401546
# Skip if these essential routines are not available
15411547
self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_READY)
15421548
continue
1549+
except Exception as e:
1550+
self.log_error("{}: Exception in xcvr api: {}".format(lport, e))
1551+
log_exception_traceback()
1552+
self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_FAILED)
1553+
continue
15431554

15441555
# CMIS expiration and retries
15451556
#
@@ -1763,8 +1774,9 @@ def task_worker(self):
17631774
self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_READY)
17641775
self.post_port_active_apsel_to_db(api, lport, host_lanes_mask)
17651776

1766-
except (NotImplementedError, AttributeError) as e:
1777+
except Exception as e:
17671778
self.log_error("{}: internal errors due to {}".format(lport, e))
1779+
log_exception_traceback()
17681780
self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_FAILED)
17691781

17701782
self.log_notice("Stopped")
@@ -1782,11 +1794,7 @@ def run(self):
17821794
self.task_worker()
17831795
except Exception as e:
17841796
helper_logger.log_error("Exception occured at {} thread due to {}".format(threading.current_thread().getName(), repr(e)))
1785-
exc_type, exc_value, exc_traceback = sys.exc_info()
1786-
msg = traceback.format_exception(exc_type, exc_value, exc_traceback)
1787-
for tb_line in msg:
1788-
for tb_line_split in tb_line.splitlines():
1789-
helper_logger.log_error(tb_line_split)
1797+
log_exception_traceback()
17901798
self.exc = e
17911799
self.main_thread_stop_event.set()
17921800

@@ -1946,11 +1954,7 @@ def run(self):
19461954
self.task_worker()
19471955
except Exception as e:
19481956
helper_logger.log_error("Exception occured at {} thread due to {}".format(threading.current_thread().getName(), repr(e)))
1949-
exc_type, exc_value, exc_traceback = sys.exc_info()
1950-
msg = traceback.format_exception(exc_type, exc_value, exc_traceback)
1951-
for tb_line in msg:
1952-
for tb_line_split in tb_line.splitlines():
1953-
helper_logger.log_error(tb_line_split)
1957+
log_exception_traceback()
19541958
self.exc = e
19551959
self.main_thread_stop_event.set()
19561960

@@ -2375,11 +2379,7 @@ def run(self):
23752379
self.task_worker(self.task_stopping_event, self.sfp_error_event)
23762380
except Exception as e:
23772381
helper_logger.log_error("Exception occured at {} thread due to {}".format(threading.current_thread().getName(), repr(e)))
2378-
exc_type, exc_value, exc_traceback = sys.exc_info()
2379-
msg = traceback.format_exception(exc_type, exc_value, exc_traceback)
2380-
for tb_line in msg:
2381-
for tb_line_split in tb_line.splitlines():
2382-
helper_logger.log_error(tb_line_split)
2382+
log_exception_traceback()
23832383
self.exc = e
23842384
self.main_thread_stop_event.set()
23852385

0 commit comments

Comments
 (0)