Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make orchagent heartbeat configurable by config db. #21364

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions dockers/docker-orchagent/orchagent.sh
Original file line number Diff line number Diff line change
@@ -117,4 +117,10 @@ if [[ x"${ORCHDAEMON_RING_ENABLED}" == x"true" ]]; then
ORCHAGENT_ARGS+=" -R"
fi

# Pass a custom heartbeat interval to orchagent (-I) when one is configured
# in CONFIG_DB under HEARTBEAT|orchagent / heartbeat_interval.
# The value is quoted in the test so an unexpected value (e.g. containing
# whitespace) cannot break the conditional itself.
HEARTBEAT_INTERVAL=$(sonic-db-cli CONFIG_DB hget "HEARTBEAT|orchagent" "heartbeat_interval")
if [[ -n "${HEARTBEAT_INTERVAL}" && "${HEARTBEAT_INTERVAL}" != "null" ]]; then
    ORCHAGENT_ARGS+=" -I ${HEARTBEAT_INTERVAL}"
fi

exec /usr/bin/orchagent ${ORCHAGENT_ARGS}
27 changes: 25 additions & 2 deletions files/scripts/supervisor-proc-exit-listener
Original file line number Diff line number Diff line change
@@ -31,6 +31,9 @@ CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes'
# The FEATURE table in config db contains auto-restart field
FEATURE_TABLE_NAME = 'FEATURE'

# The HEARTBEAT table in config db contains the per-process heartbeat configuration
HEARTBEAT_TABLE_NAME = 'HEARTBEAT'

# Value of parameter 'timeout' in select(...) method
SELECT_TIMEOUT_SECS = 1.0

@@ -40,6 +43,8 @@ ALERTING_INTERVAL_SECS = 60
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"

# Maps a process name to its heartbeat alert interval.
# A plain dict is used on purpose: a lookup of an unconfigured process must
# not silently create an entry, so callers can fall back to the default.
heartbeat_alert_interval_mapping = {}

def get_group_and_process_list(process_file):
"""
@summary: Read the critical processes/group names.
@@ -114,6 +119,21 @@ def get_autorestart_state(container_name, use_unix_socket_path):

return is_auto_restart

def load_heartbeat_alert_interval(use_unix_socket_path):
    """
    @summary: Read the HEARTBEAT table from config db and record, for every
              process that defines 'alert_interval', that interval divided
              by 1000 in heartbeat_alert_interval_mapping.
    @param use_unix_socket_path: Forwarded to ConfigDBConnector to select
              how the connection to config db is made.
    @return: None. Processes without a usable 'alert_interval' are left out
              of the mapping.
    """
    config_db = swsscommon.ConfigDBConnector(use_unix_socket_path=use_unix_socket_path)
    config_db.connect()
    heartbeat_table = config_db.get_table(HEARTBEAT_TABLE_NAME)
    if not heartbeat_table:
        return

    for process in heartbeat_table.keys():
        alert_interval = heartbeat_table[process].get('alert_interval')
        if alert_interval is None:
            # Field not configured for this process; nothing to record.
            continue

        try:
            # Config db field values are read back as strings, so convert
            # explicitly before doing arithmetic on them.
            heartbeat_alert_interval_mapping[process] = int(alert_interval) / 1000
        except (TypeError, ValueError):
            syslog.syslog(syslog.LOG_ERR,
                          "Invalid alert_interval '{}' for process '{}' in {} table, ignoring it"
                          .format(alert_interval, process, HEARTBEAT_TABLE_NAME))

def get_heartbeat_alert_interval(process):
    """
    @summary: Look up the alert interval to use for a process.
    @param process: Name of the process to look up.
    @return: The value stored for the process in
             heartbeat_alert_interval_mapping, or ALERTING_INTERVAL_SECS
             when the process has no entry.
    """
    return heartbeat_alert_interval_mapping.get(process, ALERTING_INTERVAL_SECS)

def publish_events(events_handle, process_name, container_name):
params = swsscommon.FieldValueMap()
params["process_name"] = process_name
@@ -136,6 +156,8 @@ def main(argv):

critical_group_list, critical_process_list = get_group_and_process_list(CRITICAL_PROCESSES_FILE)

load_heartbeat_alert_interval(use_unix_socket_path)

# WATCH_PROCESSES_FILE is optional
watch_process_list = []
if os.path.exists(WATCH_PROCESSES_FILE):
@@ -211,8 +233,9 @@ def main(argv):
for process in process_heart_beat_info.keys():
epoch_time = time.time()
elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"]
if elapsed_secs >= ALERTING_INTERVAL_SECS:
elapsed_mins = elapsed_secs // 60
threshold = get_heartbeat_alert_interval(process)
if threshold > 0 and elapsed_secs >= threshold:
elapsed_mins = elapsed_secs
generate_alerting_message(process, "stuck", elapsed_mins, syslog.LOG_WARNING)

if __name__ == "__main__":
53 changes: 53 additions & 0 deletions src/sonic-yang-models/yang-models/sonic-heartbeat.yang
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
module sonic-heartbeat {

    yang-version 1.1;

    namespace "http://github.com/sonic-net/sonic-heartbeat";
    prefix heartbeat;

    organization
        "SONiC";

    contact
        "SONiC";

    description "HEARTBEAT YANG Module for SONiC OS";

    revision 2025-01-09 {
        description "First Revision";
    }

    container sonic-heartbeat {

        container HEARTBEAT {
            description "HEARTBEAT config TABLE part of config_db.json";

            list HEARTBEAT_LIST {
                // One entry per process, keyed by the process name.
                key "name";

                leaf name {
                    description "Process name in HEARTBEAT table";
                    type string {
                        length 1..32;
                    }
                }

                leaf heartbeat_interval {
                    description "Heartbeat interval in milliseconds";
                    type uint32;
                    default "10000";
                }

                leaf alert_interval {
                    description "Alert interval in milliseconds";
                    type uint32;
                    default "60000";
                }
            }
        }
    }
}