diff --git a/dockers/docker-orchagent/orchagent.sh b/dockers/docker-orchagent/orchagent.sh
index b0d19823f029..9f387a13837c 100755
--- a/dockers/docker-orchagent/orchagent.sh
+++ b/dockers/docker-orchagent/orchagent.sh
@@ -117,4 +117,11 @@ if [[ x"${ORCHDAEMON_RING_ENABLED}" == x"true" ]]; then
     ORCHAGENT_ARGS+=" -R"
 fi
 
+# Add heartbeat interval when enabled.
+# The value is quoted in both tests so a multi-word reply cannot break them.
+HEARTBEAT_INTERVAL=`sonic-db-cli CONFIG_DB hget "HEARTBEAT|orchagent" "heartbeat_interval"`
+if [ -n "$HEARTBEAT_INTERVAL" ] && [ "$HEARTBEAT_INTERVAL" != "null" ]; then
+    ORCHAGENT_ARGS+=" -I $HEARTBEAT_INTERVAL"
+fi
+
 exec /usr/bin/orchagent ${ORCHAGENT_ARGS}
diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener
index 8628826e6157..3f23ffbc69e8 100755
--- a/files/scripts/supervisor-proc-exit-listener
+++ b/files/scripts/supervisor-proc-exit-listener
@@ -31,6 +31,9 @@ CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes'
 # The FEATURE table in config db contains auto-restart field
 FEATURE_TABLE_NAME = 'FEATURE'
 
+# The HEARTBEAT table in config db contains heart beat config
+HEARTBEAT_TABLE_NAME = 'HEARTBEAT'
+
 # Value of parameter 'timeout' in select(...) method
 SELECT_TIMEOUT_SECS = 1.0
 
@@ -40,6 +43,8 @@ ALERTING_INTERVAL_SECS = 60
 EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
 EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"
 
+heartbeat_alert_interval_mapping = {}  # process name -> alert interval in seconds
+
 def get_group_and_process_list(process_file):
     """
     @summary: Read the critical processes/group names.
@@ -114,6 +119,36 @@ def get_autorestart_state(container_name, use_unix_socket_path):
 
     return is_auto_restart
 
+def load_heartbeat_alert_interval(use_unix_socket_path):
+    """
+    @summary: Read the HEARTBEAT table from CONFIG_DB and cache each process's
+              alert interval, converted from milliseconds to seconds.
+    """
+    config_db = swsscommon.ConfigDBConnector(use_unix_socket_path=use_unix_socket_path)
+    config_db.connect()
+    heartbeat_table = config_db.get_table(HEARTBEAT_TABLE_NAME)
+    if not heartbeat_table:
+        return
+
+    for process, heartbeat_config in heartbeat_table.items():
+        alert_interval = heartbeat_config.get('alert_interval')
+        try:
+            # The value may arrive as text (or be absent), so convert explicitly
+            # instead of dividing the raw field.
+            heartbeat_alert_interval_mapping[process] = int(alert_interval) / 1000
+        except (TypeError, ValueError):
+            # Leave the process unmapped so the default interval applies.
+            syslog.syslog(syslog.LOG_WARNING,
+                          "Invalid alert_interval '{}' for process '{}' in {} table".format(
+                              alert_interval, process, HEARTBEAT_TABLE_NAME))
+
+def get_heartbeat_alert_interval(process):
+    """
+    @summary: Return the alert interval in seconds for the process, falling back
+              to ALERTING_INTERVAL_SECS when none is cached.
+    """
+    return heartbeat_alert_interval_mapping.get(process, ALERTING_INTERVAL_SECS)
+
 def publish_events(events_handle, process_name, container_name):
     params = swsscommon.FieldValueMap()
     params["process_name"] = process_name
@@ -136,6 +171,8 @@ def main(argv):
 
     critical_group_list, critical_process_list = get_group_and_process_list(CRITICAL_PROCESSES_FILE)
 
+    load_heartbeat_alert_interval(use_unix_socket_path)
+
     # WATCH_PROCESSES_FILE is optional
     watch_process_list = []
     if os.path.exists(WATCH_PROCESSES_FILE):
@@ -211,8 +248,10 @@ def main(argv):
         for process in process_heart_beat_info.keys():
             epoch_time = time.time()
             elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"]
-            if elapsed_secs >= ALERTING_INTERVAL_SECS:
+            # A non-positive interval disables the stuck alert for this process
+            threshold = get_heartbeat_alert_interval(process)
+            if threshold > 0 and elapsed_secs >= threshold:
                 elapsed_mins = elapsed_secs // 60
                 generate_alerting_message(process, "stuck", elapsed_mins, syslog.LOG_WARNING)
 
 if __name__ == "__main__":
diff --git a/src/sonic-yang-models/yang-models/sonic-heartbeat.yang b/src/sonic-yang-models/yang-models/sonic-heartbeat.yang
new file mode 100644
index 000000000000..dbf77902514d
--- /dev/null
+++ b/src/sonic-yang-models/yang-models/sonic-heartbeat.yang
@@ -0,0 +1,53 @@
+module sonic-heartbeat {
+
+    yang-version 1.1;
+
+    namespace "http://github.com/sonic-net/sonic-heartbeat";
+    prefix heartbeat;
+
+    import ietf-inet-types {
+        prefix inet;
+    }
+
+    organization
+        "SONiC";
+
+    contact
+        "SONiC";
+
+    description "HEARTBEAT YANG Module for SONiC OS";
+
+    revision 2025-01-09 {
+        description "First Revision";
+    }
+
+    container sonic-heartbeat {
+
+        container HEARTBEAT {
+            description "HEARTBEAT config TABLE part of config_db.json";
+
+            list HEARTBEAT_LIST {
+                key "name";
+
+                leaf name {
+                    description "process name in HEARTBEAT table";
+                    type string {
+                        length 1..32;
+                    }
+                }
+
+                leaf heartbeat_interval {
+                    description "Heartbeat interval in milliseconds";
+                    type uint32;
+                    default "10000";
+                }
+
+                leaf alert_interval {
+                    description "Alert interval in milliseconds";
+                    type uint32;
+                    default "60000";
+                }
+            }
+        }
+    }
+}