Skip to content

Commit 0694d33

Browse files
mlok-nokia and arlakshm
authored and committed
[chassis][linecard] Fix Module LINECARD<> went off-line message for empty slot issue (sonic-net#462)
* [chassis][linecard] Fix Module LINECARD<> went off-line message for empty slot issue
  Signed-off-by: mlok <marty.lok@nokia.com>
* Define/use get_module_current_status()
---------
Signed-off-by: mlok <marty.lok@nokia.com>
Co-authored-by: Arvindsrinivasan Lakshmi Narasimhan <55814491+arlakshm@users.noreply.github.com>
1 parent 8ee3009 commit 0694d33

File tree

2 files changed

+102
-15
lines changed

2 files changed

+102
-15
lines changed

sonic-chassisd/scripts/chassisd

+23-11
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,13 @@ class ModuleUpdater(logger.Logger):
237237
fvs = swsscommon.FieldValuePairs([(CHASSIS_INFO_CARD_NUM_FIELD, str(num_modules))])
238238
self.chassis_table.set(CHASSIS_INFO_KEY_TEMPLATE.format(1), fvs)
239239

240+
def get_module_current_status(self, key):
    """Return the operational status currently stored for a module.

    Looks up ``key`` in ``self.module_table`` and extracts the value of
    ``CHASSIS_MODULE_INFO_OPERSTATUS_FIELD`` from the stored field/value
    pairs.

    Args:
        key: Module table key to look up.

    Returns:
        The stored operational status, or
        ``ModuleBase.MODULE_STATUS_EMPTY`` when the table has no usable
        entry for ``key`` (lookup not reported as found, unexpected result
        shape, or the status field is absent).
    """
    result = self.module_table.get(key)

    # A usable result is a (found, field-value pairs) sequence whose first
    # element is exactly True. The length check keeps an empty or
    # one-element result from raising instead of being treated as "empty".
    # NOTE(review): tuples are accepted alongside lists on the assumption
    # that some table bindings return the pair as a tuple -- confirm.
    if (isinstance(result, (list, tuple))
            and len(result) >= 2
            and result[0] is True):
        fields = dict(result[-1])
        # A stored entry without the status field is reported as empty
        # rather than raising KeyError in the caller's update loop.
        return fields.get(CHASSIS_MODULE_INFO_OPERSTATUS_FIELD,
                          ModuleBase.MODULE_STATUS_EMPTY)

    return ModuleBase.MODULE_STATUS_EMPTY
246+
240247
def module_db_update(self):
241248
notOnlineModules = []
242249

@@ -260,6 +267,7 @@ class ModuleUpdater(logger.Logger):
260267
(CHASSIS_MODULE_INFO_OPERSTATUS_FIELD, module_info_dict[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD]),
261268
(CHASSIS_MODULE_INFO_NUM_ASICS_FIELD, str(len(module_info_dict[CHASSIS_MODULE_INFO_ASICS]))),
262269
(CHASSIS_MODULE_INFO_SERIAL_FIELD, module_info_dict[CHASSIS_MODULE_INFO_SERIAL_FIELD])])
270+
prev_status = self.get_module_current_status(key)
263271
self.module_table.set(key, fvs)
264272

265273
# Construct key for down_modules dict. Example down_modules key format: LINE-CARD0|<hostname>
@@ -272,23 +280,27 @@ class ModuleUpdater(logger.Logger):
272280
down_module_key = key+'|'
273281

274282
if module_info_dict[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD] != str(ModuleBase.MODULE_STATUS_ONLINE):
275-
notOnlineModules.append(key)
276-
# Record the time when the module down was detected to track the
277-
# module down time. Used for chassis db cleanup for all asics of the module if the module is down for a
278-
# long time like 30 mins.
279-
# All down modules including supervisor are added to the down modules dictionary. This is to help
280-
# identifying module operational status change. But the clean up will not be attempted for supervisor
281-
if down_module_key not in self.down_modules:
282-
self.log_warning("Module {} went off-line!".format(key))
283-
self.down_modules[down_module_key] = {}
284-
self.down_modules[down_module_key]['down_time'] = time.time()
285-
self.down_modules[down_module_key]['cleaned'] = False
283+
if prev_status == ModuleBase.MODULE_STATUS_ONLINE:
284+
notOnlineModules.append(key)
285+
# Record the time when the module down was detected to track the
286+
# module down time. Used for chassis db cleanup for all asics of the module if the module is down for a
287+
# long time like 30 mins.
288+
# All down modules including supervisor are added to the down modules dictionary. This is to help
289+
# identifying module operational status change. But the clean up will not be attempted for supervisor
290+
291+
if down_module_key not in self.down_modules:
292+
self.log_warning("Module {} went off-line!".format(key))
293+
self.down_modules[down_module_key] = {}
294+
self.down_modules[down_module_key]['down_time'] = time.time()
295+
self.down_modules[down_module_key]['cleaned'] = False
286296
continue
287297
else:
288298
# Module is operational. Remove it from down time tracking.
289299
if down_module_key in self.down_modules:
290300
self.log_notice("Module {} recovered on-line!".format(key))
291301
del self.down_modules[down_module_key]
302+
elif prev_status != ModuleBase.MODULE_STATUS_ONLINE:
303+
self.log_notice("Module {} is on-line!".format(key))
292304

293305
for asic_id, asic in enumerate(module_info_dict[CHASSIS_MODULE_INFO_ASICS]):
294306
asic_global_id, asic_pci_addr = asic

sonic-chassisd/tests/test_chassisd.py

+79-4
Original file line numberDiff line numberDiff line change
@@ -652,8 +652,83 @@ def test_chassis_db_cleanup():
652652

653653
# Mock >= CHASSIS_DB_CLEANUP_MODULE_DOWN_PERIOD module down period for LINE-CARD1
654654
down_module_key = lc2_name+"|"
655-
module_down_time = sup_module_updater.down_modules[down_module_key]["down_time"]
656-
sup_module_updater.down_modules[down_module_key]["down_time"] = module_down_time - ((CHASSIS_DB_CLEANUP_MODULE_DOWN_PERIOD+10)*60)
655+
assert down_module_key not in sup_module_updater.down_modules.keys()
656+
657+
sup_module_updater.module_down_chassis_db_cleanup()
658+
659+
def test_chassis_db_bootup_with_empty_slot():
    """Check down-module tracking when the chassis boots with an empty slot.

    Builds a chassis with a supervisor, one online line card and one empty
    line-card slot, then verifies that:
      * after the first module_db_update() both line cards are written to
        the module table with their status and neither is tracked in
        down_modules;
      * after the online line card is set to offline and
        module_db_update() runs again, that card is tracked in
        down_modules.
    """
    chassis = MockChassis()

    # Supervisor
    index = 0
    sup_name = "SUPERVISOR0"
    desc = "Supervisor card"
    sup_slot = 16
    serial = "RP1000101"
    module_type = ModuleBase.MODULE_TYPE_SUPERVISOR
    supervisor = MockModule(index, sup_name, desc, module_type, sup_slot, serial)
    supervisor.set_midplane_ip()
    chassis.module_list.append(supervisor)

    # LINE-CARD0: present and online.
    index = 1
    lc_name = "LINE-CARD0"
    desc = "36 port 400G card"
    lc_slot = 1
    serial = "LC1000101"
    module_type = ModuleBase.MODULE_TYPE_LINE
    module = MockModule(index, lc_name, desc, module_type, lc_slot, serial)
    module.set_midplane_ip()
    status = ModuleBase.MODULE_STATUS_ONLINE
    module.set_oper_status(status)
    chassis.module_list.append(module)

    # LINE-CARD1: empty slot.
    index = 2
    lc2_name = u"LINE-CARD1"
    desc = "Unavailable'"
    lc2_slot = 2
    serial = "N/A"
    module_type = ModuleBase.MODULE_TYPE_LINE
    module2 = MockModule(index, lc2_name, desc, module_type, lc2_slot, serial)
    module2.set_midplane_ip()
    status = ModuleBase.MODULE_STATUS_EMPTY
    module2.set_oper_status(status)
    chassis.module_list.append(module2)

    # Supervisor ModuleUpdater
    sup_module_updater = ModuleUpdater(SYSLOG_IDENTIFIER, chassis, sup_slot, sup_slot)
    sup_module_updater.modules_num_update()

    sup_module_updater.module_db_update()

    # LINE-CARD0 must be stored as ONLINE in the module table.
    fvs = sup_module_updater.module_table.get(lc_name)
    if isinstance(fvs, list):
        fvs = dict(fvs[-1])
    assert ModuleBase.MODULE_STATUS_ONLINE == fvs[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD]

    # LINE-CARD1 must be stored as EMPTY in the module table.
    fvs = sup_module_updater.module_table.get(lc2_name)
    if isinstance(fvs, list):
        fvs = dict(fvs[-1])
    assert ModuleBase.MODULE_STATUS_EMPTY == fvs[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD]

    # Neither line card should be in the down_modules keys.
    down_module_lc1_key = lc_name+"|"
    assert down_module_lc1_key not in sup_module_updater.down_modules
    # Build the key from lc2_name so the empty slot itself is checked, not
    # LINE-CARD0 a second time.
    down_module_lc2_key = lc2_name+"|"
    assert down_module_lc2_key not in sup_module_updater.down_modules

    # Change LINE-CARD0 status to OFFLINE and refresh the database.
    status = ModuleBase.MODULE_STATUS_OFFLINE
    module.set_oper_status(status)
    sup_module_updater.module_db_update()

    # The new status must be stored and the card must now be tracked as down.
    fvs = sup_module_updater.module_table.get(lc_name)
    if isinstance(fvs, list):
        fvs = dict(fvs[-1])
    assert status == fvs[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD]
    assert down_module_lc1_key in sup_module_updater.down_modules
734+

0 commit comments

Comments
 (0)