From 936a7cc13506f31ccbdc7a1bfe527be4cd94984c Mon Sep 17 00:00:00 2001 From: liuh-80 Date: Mon, 24 Feb 2025 03:14:00 +0000 Subject: [PATCH 1/6] Add Orchagent heartbeat config --- dockers/docker-orchagent/orchagent.sh | 6 +++ files/scripts/supervisor-proc-exit-listener | 24 ++++++++- .../yang-models/sonic-heartbeat.yang | 53 +++++++++++++++++++ 3 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 src/sonic-yang-models/yang-models/sonic-heartbeat.yang diff --git a/dockers/docker-orchagent/orchagent.sh b/dockers/docker-orchagent/orchagent.sh index 28067db7ccf5..bcc896b05e91 100755 --- a/dockers/docker-orchagent/orchagent.sh +++ b/dockers/docker-orchagent/orchagent.sh @@ -111,4 +111,10 @@ if [[ x"${MGMT_VRF_ENABLED}" == x"true" ]]; then ORCHAGENT_ARGS+=" -v mgmt" fi +# Add heartbeat interval when enabled +HEARTBEAT_INTERVAL=`sonic-db-cli CONFIG_DB hget "HEARTBEAT|orchagent" "heartbeat_interval"` +if [ ! -z "$HEARTBEAT_INTERVAL" ] && [ $HEARTBEAT_INTERVAL != "null" ]; then + ORCHAGENT_ARGS+=" -I $HEARTBEAT_INTERVAL" +fi + exec /usr/bin/orchagent ${ORCHAGENT_ARGS} diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index 8628826e6157..9a6746d8149e 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -31,6 +31,9 @@ CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes' # The FEATURE table in config db contains auto-restart field FEATURE_TABLE_NAME = 'FEATURE' +# The HEARTBEAT table in config db contains heart beat config +HEARTBEAT_TABLE_NAME = 'HEARTBEAT' + # Value of parameter 'timeout' in select(...) method SELECT_TIMEOUT_SECS = 1.0 @@ -40,6 +43,8 @@ ALERTING_INTERVAL_SECS = 60 EVENTS_PUBLISHER_SOURCE = "sonic-events-host" EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly" +heartbeat_alert_interval_mapping = defaultdict(dict) + def get_group_and_process_list(process_file): """ @summary: Read the critical processes/group names. 
@@ -114,6 +119,21 @@ def get_autorestart_state(container_name, use_unix_socket_path): return is_auto_restart +def load_heartbeat_alert_interval(use_unix_socket_path): + config_db = swsscommon.ConfigDBConnector(use_unix_socket_path=use_unix_socket_path) + config_db.connect() + heartbeat_table = config_db.get_table(HEARTBEAT_TABLE_NAME) + if heartbeat_table: + heartbeat_table_keys = heartbeat_table.keys() + for process in heartbeat_table_keys: + heartbeat_alert_interval_mapping[process] = int(heartbeat_table[process].get('alert_interval', ALERTING_INTERVAL_SECS * 1000)) / 1000 +def get_heartbeat_alert_interval(process): + if process in heartbeat_alert_interval_mapping: + return heartbeat_alert_interval_mapping[process] + + return ALERTING_INTERVAL_SECS + def publish_events(events_handle, process_name, container_name): params = swsscommon.FieldValueMap() params["process_name"] = process_name @@ -136,6 +156,8 @@ def main(argv): critical_group_list, critical_process_list = get_group_and_process_list(CRITICAL_PROCESSES_FILE) + load_heartbeat_alert_interval(use_unix_socket_path) + # WATCH_PROCESSES_FILE is optional watch_process_list = [] if os.path.exists(WATCH_PROCESSES_FILE): @@ -211,7 +233,7 @@ def main(argv): for process in process_heart_beat_info.keys(): epoch_time = time.time() elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"] - if elapsed_secs >= ALERTING_INTERVAL_SECS: + if elapsed_secs >= get_heartbeat_alert_interval(process): elapsed_mins = elapsed_secs // 60 generate_alerting_message(process, "stuck", elapsed_mins, syslog.LOG_WARNING) diff --git a/src/sonic-yang-models/yang-models/sonic-heartbeat.yang b/src/sonic-yang-models/yang-models/sonic-heartbeat.yang new file mode 100644 index 000000000000..9e53f73a8a26 --- /dev/null +++ b/src/sonic-yang-models/yang-models/sonic-heartbeat.yang @@ -0,0 +1,53 @@ +module sonic-heartbeat { + + yang-version 1.1; + + namespace "http://github.com/sonic-net/sonic-heartbeat"; + prefix heartbeat; + + import ietf-inet-types { + prefix 
inet; + } + + organization + "SONiC"; + + contact + "SONiC"; + + description "HEARTBEAT YANG Module for SONiC OS"; + + revision 2025-01-09 { + description "First Revision"; + } + + container sonic-heartbeat { + + container HEARTBEAT { + description "HEARTBEAT config TABLE part of config_db.json"; + + list HEARTBEAT_LIST { + key "name"; + + leaf name { + description "process name in HEARTBEAT table"; + type string { + length 1..32; + } + } + + leaf heartbeat_interval { + description "Heartbeat interval in millisecond"; + type uint32; + default "100000"; + } + + leaf alert_interval { + description "Alert interval in millisecond"; + type uint32; + default "60000"; + } + } + } + } +} \ No newline at end of file From 98ab5a7e31b4dd5360aff68effce34565fef3a5a Mon Sep 17 00:00:00 2001 From: Hua Liu <58683130+liuh-80@users.noreply.github.com> Date: Thu, 6 Mar 2025 13:40:19 +0800 Subject: [PATCH 2/6] Update heartbeat and alert interval logic --- files/scripts/supervisor-proc-exit-listener | 5 +++-- src/sonic-yang-models/yang-models/sonic-heartbeat.yang | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index 9a6746d8149e..3f23ffbc69e8 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -233,8 +233,9 @@ def main(argv): for process in process_heart_beat_info.keys(): epoch_time = time.time() elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"] - if elapsed_secs >= get_heartbeat_alert_interval(process): - elapsed_mins = elapsed_secs // 60 + threshold = get_heartbeat_alert_interval(process) + if threshold > 0 and elapsed_secs >= threshold: + elapsed_mins = elapsed_secs generate_alerting_message(process, "stuck", elapsed_mins, syslog.LOG_WARNING) if __name__ == "__main__": diff --git a/src/sonic-yang-models/yang-models/sonic-heartbeat.yang 
b/src/sonic-yang-models/yang-models/sonic-heartbeat.yang index 9e53f73a8a26..dbf77902514d 100644 --- a/src/sonic-yang-models/yang-models/sonic-heartbeat.yang +++ b/src/sonic-yang-models/yang-models/sonic-heartbeat.yang @@ -39,7 +39,7 @@ module sonic-heartbeat { leaf heartbeat_interval { description "Heartbeat interval in millisecond"; type uint32; - default "100000"; + default "10000"; } leaf alert_interval { From 554e1103a6517175a5868115fe5816cd47b464f1 Mon Sep 17 00:00:00 2001 From: liuh-80 Date: Fri, 21 Mar 2025 03:07:32 +0000 Subject: [PATCH 3/6] Add yang model test --- .../tests/files/sample_config_db.json | 6 ++++ .../yang_model_tests/tests/heartbeat.json | 13 ++++++++ .../tests_config/heartbeat.json | 32 +++++++++++++++++++ 3 files changed, 51 insertions(+) create mode 100644 src/sonic-yang-models/tests/yang_model_tests/tests/heartbeat.json create mode 100644 src/sonic-yang-models/tests/yang_model_tests/tests_config/heartbeat.json diff --git a/src/sonic-yang-models/tests/files/sample_config_db.json b/src/sonic-yang-models/tests/files/sample_config_db.json index 710151ef6662..f5da48c5a46b 100644 --- a/src/sonic-yang-models/tests/files/sample_config_db.json +++ b/src/sonic-yang-models/tests/files/sample_config_db.json @@ -2874,5 +2874,11 @@ "daemon_polling_interval" : "3600", "fsstats_sync_interval" : "86400" } + }, + "HEARTBEAT": { + "orchagent": { + "heartbeat_interval" : "10000", + "alert_interval" : "60000" + } } } diff --git a/src/sonic-yang-models/tests/yang_model_tests/tests/heartbeat.json b/src/sonic-yang-models/tests/yang_model_tests/tests/heartbeat.json new file mode 100644 index 000000000000..9661e0a99768 --- /dev/null +++ b/src/sonic-yang-models/tests/yang_model_tests/tests/heartbeat.json @@ -0,0 +1,13 @@ +{ + "HEARTBEAT_TABLE_WITH_INVALID_HEARTBEAT_INTERVAL": { + "desc": "TABLE_WITH_INVALID_HEARTBEAT_INTERVAL failure.", + "eStrKey": "Pattern" + }, + "HEARTBEAT_TABLE_WITH_INVALID_ALERT_INTERVAL": { + "desc": "TABLE_WITH_INVALID_HEARTBEAT_INTERVAL 
failure.", + "eStrKey": "Pattern" + }, + "HEARTBEAT_TABLE_WITH_VALID_CONFIG": { + "desc": "TABLE WITH VALID CONFIG." + } +} diff --git a/src/sonic-yang-models/tests/yang_model_tests/tests_config/heartbeat.json b/src/sonic-yang-models/tests/yang_model_tests/tests_config/heartbeat.json new file mode 100644 index 000000000000..be185912e27f --- /dev/null +++ b/src/sonic-yang-models/tests/yang_model_tests/tests_config/heartbeat.json @@ -0,0 +1,32 @@ +{ + "HEARTBEAT_TABLE_WITH_INVALID_HEARTBEAT_INTERVAL": { + "sonic-heartbeat:sonic-heartbeat": { + "sonic-heartbeat:HEARTBEAT": { + "orchagent": { + "heartbeat_interval" : "invalid", + "alert_interval" : "60000" + } + } + } + }, + "HEARTBEAT_TABLE_WITH_INVALID_ALERT_INTERVAL": { + "sonic-heartbeat:sonic-heartbeat": { + "sonic-heartbeat:HEARTBEAT": { + "orchagent": { + "heartbeat_interval" : "10000", + "alert_interval" : "invalid" + } + } + } + }, + "HEARTBEAT_TABLE_WITH_VALID_CONFIG": { + "sonic-heartbeat:sonic-heartbeat": { + "sonic-heartbeat:HEARTBEAT": { + "orchagent": { + "heartbeat_interval" : "10000", + "alert_interval" : "60000" + } + } + } + } +} From 2b4e2e8bbfd83461c6a4f79dd8fa6b0dcdc41fa5 Mon Sep 17 00:00:00 2001 From: Hua Liu <58683130+liuh-80@users.noreply.github.com> Date: Mon, 24 Mar 2025 13:39:17 +0800 Subject: [PATCH 4/6] Update heartbeat.json --- .../yang_model_tests/tests_config/heartbeat.json | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/sonic-yang-models/tests/yang_model_tests/tests_config/heartbeat.json b/src/sonic-yang-models/tests/yang_model_tests/tests_config/heartbeat.json index be185912e27f..7a8a81025ccc 100644 --- a/src/sonic-yang-models/tests/yang_model_tests/tests_config/heartbeat.json +++ b/src/sonic-yang-models/tests/yang_model_tests/tests_config/heartbeat.json @@ -2,30 +2,33 @@ "HEARTBEAT_TABLE_WITH_INVALID_HEARTBEAT_INTERVAL": { "sonic-heartbeat:sonic-heartbeat": { "sonic-heartbeat:HEARTBEAT": { - "orchagent": { + "AAA_LIST": [{ + "name" : "orchagent", 
"heartbeat_interval" : "invalid", "alert_interval" : "60000" - } + }] } } }, "HEARTBEAT_TABLE_WITH_INVALID_ALERT_INTERVAL": { "sonic-heartbeat:sonic-heartbeat": { "sonic-heartbeat:HEARTBEAT": { - "orchagent": { + "AAA_LIST": [{ + "name" : "orchagent", "heartbeat_interval" : "10000", "alert_interval" : "invalid" - } + }] } } }, "HEARTBEAT_TABLE_WITH_VALID_CONFIG": { "sonic-heartbeat:sonic-heartbeat": { "sonic-heartbeat:HEARTBEAT": { - "orchagent": { + "AAA_LIST": [{ + "name" : "orchagent", "heartbeat_interval" : "10000", "alert_interval" : "60000" - } + }] } } } From abf3e9e32ffef93e627b44bed7cd73937b626186 Mon Sep 17 00:00:00 2001 From: Hua Liu <58683130+liuh-80@users.noreply.github.com> Date: Mon, 24 Mar 2025 14:45:07 +0800 Subject: [PATCH 5/6] Rename `AAA_LIST` to `HEARTBEAT_LIST` in JSON --- .../tests/yang_model_tests/tests_config/heartbeat.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sonic-yang-models/tests/yang_model_tests/tests_config/heartbeat.json b/src/sonic-yang-models/tests/yang_model_tests/tests_config/heartbeat.json index 7a8a81025ccc..ef749eafba0a 100644 --- a/src/sonic-yang-models/tests/yang_model_tests/tests_config/heartbeat.json +++ b/src/sonic-yang-models/tests/yang_model_tests/tests_config/heartbeat.json @@ -2,7 +2,7 @@ "HEARTBEAT_TABLE_WITH_INVALID_HEARTBEAT_INTERVAL": { "sonic-heartbeat:sonic-heartbeat": { "sonic-heartbeat:HEARTBEAT": { - "AAA_LIST": [{ + "HEARTBEAT_LIST": [{ "name" : "orchagent", "heartbeat_interval" : "invalid", "alert_interval" : "60000" @@ -13,7 +13,7 @@ "HEARTBEAT_TABLE_WITH_INVALID_ALERT_INTERVAL": { "sonic-heartbeat:sonic-heartbeat": { "sonic-heartbeat:HEARTBEAT": { - "AAA_LIST": [{ + "HEARTBEAT_LIST": [{ "name" : "orchagent", "heartbeat_interval" : "10000", "alert_interval" : "invalid" @@ -24,7 +24,7 @@ "HEARTBEAT_TABLE_WITH_VALID_CONFIG": { "sonic-heartbeat:sonic-heartbeat": { "sonic-heartbeat:HEARTBEAT": { - "AAA_LIST": [{ + "HEARTBEAT_LIST": [{ "name" : "orchagent", 
"heartbeat_interval" : "10000", "alert_interval" : "60000" From f1ce5a82b78bb08de504d7a4b0ddc9726f2784d4 Mon Sep 17 00:00:00 2001 From: Hua Liu <58683130+liuh-80@users.noreply.github.com> Date: Mon, 24 Mar 2025 16:28:24 +0800 Subject: [PATCH 6/6] Update eStrKey values in heartbeat.json --- .../tests/yang_model_tests/tests/heartbeat.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sonic-yang-models/tests/yang_model_tests/tests/heartbeat.json b/src/sonic-yang-models/tests/yang_model_tests/tests/heartbeat.json index 9661e0a99768..d4369f188f36 100644 --- a/src/sonic-yang-models/tests/yang_model_tests/tests/heartbeat.json +++ b/src/sonic-yang-models/tests/yang_model_tests/tests/heartbeat.json @@ -1,11 +1,11 @@ { "HEARTBEAT_TABLE_WITH_INVALID_HEARTBEAT_INTERVAL": { "desc": "TABLE_WITH_INVALID_HEARTBEAT_INTERVAL failure.", - "eStrKey": "Pattern" + "eStrKey": "InvalidValue" }, "HEARTBEAT_TABLE_WITH_INVALID_ALERT_INTERVAL": { "desc": "TABLE_WITH_INVALID_HEARTBEAT_INTERVAL failure.", - "eStrKey": "Pattern" + "eStrKey": "InvalidValue" }, "HEARTBEAT_TABLE_WITH_VALID_CONFIG": { "desc": "TABLE WITH VALID CONFIG."