From 936a7cc13506f31ccbdc7a1bfe527be4cd94984c Mon Sep 17 00:00:00 2001 From: liuh-80 Date: Mon, 24 Feb 2025 03:14:00 +0000 Subject: [PATCH 1/2] Add Orchagent heartbeat config --- dockers/docker-orchagent/orchagent.sh | 6 +++ files/scripts/supervisor-proc-exit-listener | 24 ++++++++- .../yang-models/sonic-heartbeat.yang | 53 +++++++++++++++++++ 3 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 src/sonic-yang-models/yang-models/sonic-heartbeat.yang diff --git a/dockers/docker-orchagent/orchagent.sh b/dockers/docker-orchagent/orchagent.sh index 28067db7ccf5..bcc896b05e91 100755 --- a/dockers/docker-orchagent/orchagent.sh +++ b/dockers/docker-orchagent/orchagent.sh @@ -111,4 +111,10 @@ if [[ x"${MGMT_VRF_ENABLED}" == x"true" ]]; then ORCHAGENT_ARGS+=" -v mgmt" fi +# Add heartbeat interval when enabled +HEARTBEAT_INTERVAL=`sonic-db-cli CONFIG_DB hget "HEARTBEAT|orchagent" "heartbeat_interval"` +if [ ! -z "$HEARTBEAT_INTERVAL" ] && [ $HEARTBEAT_INTERVAL != "null" ]; then + ORCHAGENT_ARGS+=" -I $HEARTBEAT_INTERVAL" +fi + exec /usr/bin/orchagent ${ORCHAGENT_ARGS} diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index 8628826e6157..9a6746d8149e 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -31,6 +31,9 @@ CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes' # The FEATURE table in config db contains auto-restart field FEATURE_TABLE_NAME = 'FEATURE' +# The HEARTBEAT table in config db contains heart beat config +HEARTBEAT_TABLE_NAME = 'HEARTBEAT' + # Value of parameter 'timeout' in select(...) method SELECT_TIMEOUT_SECS = 1.0 @@ -40,6 +43,8 @@ ALERTING_INTERVAL_SECS = 60 EVENTS_PUBLISHER_SOURCE = "sonic-events-host" EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly" +heartbeat_alert_interval_mapping = defaultdict(dict) + def get_group_and_process_list(process_file): """ @summary: Read the critical processes/group names. @@ -114,6 +119,21 @@ def get_autorestart_state(container_name, use_unix_socket_path): return is_auto_restart +def load_heartbeat_alert_interval(use_unix_socket_path): + config_db = swsscommon.ConfigDBConnector(use_unix_socket_path=use_unix_socket_path) + config_db.connect() + heartbeat_table = config_db.get_table(HEARTBEAT_TABLE_NAME) + if heartbeat_table: + heartbeat_table_keys = heartbeat_table.keys() + for process in heartbeat_table_keys: + heartbeat_alert_interval_mapping[process] = heartbeat_table[process].get('alert_interval') / 1000 + +def get_heartbeat_alert_interval(process): + if process in heartbeat_alert_interval_mapping: + return heartbeat_alert_interval_mapping[process] + + return ALERTING_INTERVAL_SECS + def publish_events(events_handle, process_name, container_name): params = swsscommon.FieldValueMap() params["process_name"] = process_name @@ -136,6 +156,8 @@ def main(argv): critical_group_list, critical_process_list = get_group_and_process_list(CRITICAL_PROCESSES_FILE) + load_heartbeat_alert_interval(use_unix_socket_path) + # WATCH_PROCESSES_FILE is optional watch_process_list = [] if os.path.exists(WATCH_PROCESSES_FILE): @@ -211,7 +233,7 @@ def main(argv): for process in process_heart_beat_info.keys(): epoch_time = time.time() elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"] - if elapsed_secs >= ALERTING_INTERVAL_SECS: + if elapsed_secs >= get_heartbeat_alert_interval(process): elapsed_mins = elapsed_secs // 60 generate_alerting_message(process, "stuck", elapsed_mins, syslog.LOG_WARNING) diff --git a/src/sonic-yang-models/yang-models/sonic-heartbeat.yang b/src/sonic-yang-models/yang-models/sonic-heartbeat.yang new file mode 100644 index 000000000000..9e53f73a8a26 --- /dev/null +++ b/src/sonic-yang-models/yang-models/sonic-heartbeat.yang @@ -0,0 +1,53 @@ +module sonic-heartbeat { + + yang-version 1.1; + + namespace "http://github.com/sonic-net/sonic-heartbeat"; + prefix heartbeat; + + import ietf-inet-types { + prefix inet; + } + + organization + "SONiC"; + + contact + "SONiC"; + + description "HEARTBEAT YANG Module for SONiC OS"; + + revision 2025-01-09 { + description "First Revision"; + } + + container sonic-heartbeat { + + container HEARTBEAT { + description "HEARTBEAT config TABLE part of config_db.json"; + + list HEARTBEAT_LIST { + key "name"; + + leaf name { + description "process name in HEARTBEAT table"; + type string { + length 1..32; + } + } + + leaf heartbeat_interval { + description "Heartbeat interval in millisecond"; + type uint32; + default "100000"; + } + + leaf alert_interval { + description "Alert interval in millisecond"; + type uint32; + default "60000"; + } + } + } + } +} \ No newline at end of file From 98ab5a7e31b4dd5360aff68effce34565fef3a5a Mon Sep 17 00:00:00 2001 From: Hua Liu <58683130+liuh-80@users.noreply.github.com> Date: Thu, 6 Mar 2025 13:40:19 +0800 Subject: [PATCH 2/2] Update heartbeat and alert interval logic --- files/scripts/supervisor-proc-exit-listener | 5 +++-- src/sonic-yang-models/yang-models/sonic-heartbeat.yang | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index 9a6746d8149e..3f23ffbc69e8 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -233,8 +233,9 @@ def main(argv): for process in process_heart_beat_info.keys(): epoch_time = time.time() elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"] - if elapsed_secs >= get_heartbeat_alert_interval(process): - elapsed_mins = elapsed_secs // 60 + threshold = get_heartbeat_alert_interval(process) + if threshold > 0 and elapsed_secs >= threshold: + elapsed_mins = elapsed_secs generate_alerting_message(process, "stuck", elapsed_mins, syslog.LOG_WARNING) if __name__ == "__main__": diff --git a/src/sonic-yang-models/yang-models/sonic-heartbeat.yang b/src/sonic-yang-models/yang-models/sonic-heartbeat.yang index 9e53f73a8a26..dbf77902514d 100644 --- a/src/sonic-yang-models/yang-models/sonic-heartbeat.yang +++ b/src/sonic-yang-models/yang-models/sonic-heartbeat.yang @@ -39,7 +39,7 @@ module sonic-heartbeat { leaf heartbeat_interval { description "Heartbeat interval in millisecond"; type uint32; - default "100000"; + default "10000"; } leaf alert_interval {