Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 936a7cc

Browse files
committed Feb 24, 2025 ·
Add Orchagent heartbeat config
1 parent 4608b26 commit 936a7cc

File tree

3 files changed

+82
-1
lines changed

3 files changed

+82
-1
lines changed
 

‎dockers/docker-orchagent/orchagent.sh

+6
Original file line numberDiff line numberDiff line change
@@ -111,4 +111,10 @@ if [[ x"${MGMT_VRF_ENABLED}" == x"true" ]]; then
111111
ORCHAGENT_ARGS+=" -v mgmt"
112112
fi
113113

114+
# Add heartbeat interval when enabled.
# The "-I" option is only appended when CONFIG_DB holds a usable value for
# HEARTBEAT|orchagent heartbeat_interval; an empty result or the literal
# string "null" leaves ORCHAGENT_ARGS untouched.
HEARTBEAT_INTERVAL=$(sonic-db-cli CONFIG_DB hget "HEARTBEAT|orchagent" "heartbeat_interval")
# Quote the variable in both tests so an unexpected value containing
# whitespace cannot break the comparison.
if [ -n "$HEARTBEAT_INTERVAL" ] && [ "$HEARTBEAT_INTERVAL" != "null" ]; then
    ORCHAGENT_ARGS+=" -I $HEARTBEAT_INTERVAL"
fi
119+
114120
exec /usr/bin/orchagent ${ORCHAGENT_ARGS}

‎files/scripts/supervisor-proc-exit-listener

+23-1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes'
3131
# The FEATURE table in config db contains auto-restart field
3232
FEATURE_TABLE_NAME = 'FEATURE'
3333

34+
# The HEARTBEAT table in config db contains the heartbeat configuration
35+
HEARTBEAT_TABLE_NAME = 'HEARTBEAT'
36+
3437
# Value of parameter 'timeout' in select(...) method
3538
SELECT_TIMEOUT_SECS = 1.0
3639

@@ -40,6 +43,8 @@ ALERTING_INTERVAL_SECS = 60
4043
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
4144
EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"
4245

46+
# Maps a process name to its heartbeat alert interval; entries are filled in by
# load_heartbeat_alert_interval() from the HEARTBEAT table and looked up by
# get_heartbeat_alert_interval().
heartbeat_alert_interval_mapping = defaultdict(dict)
47+
4348
def get_group_and_process_list(process_file):
4449
"""
4550
@summary: Read the critical processes/group names.
@@ -114,6 +119,21 @@ def get_autorestart_state(container_name, use_unix_socket_path):
114119

115120
return is_auto_restart
116121

122+
def load_heartbeat_alert_interval(use_unix_socket_path):
    """
    @summary: Read the HEARTBEAT table from CONFIG_DB and store each process's
              'alert_interval' in heartbeat_alert_interval_mapping, divided by
              1000 (the table holds milliseconds, the mapping holds seconds).
              Entries without a usable 'alert_interval' are skipped so that
              get_heartbeat_alert_interval() falls back to the default.
    @param use_unix_socket_path: Passed through to swsscommon.ConfigDBConnector.
    """
    config_db = swsscommon.ConfigDBConnector(use_unix_socket_path=use_unix_socket_path)
    config_db.connect()
    heartbeat_table = config_db.get_table(HEARTBEAT_TABLE_NAME)
    if not heartbeat_table:
        return

    for process, fields in heartbeat_table.items():
        raw_interval = fields.get('alert_interval')
        if raw_interval is None:
            # Field not configured for this process: keep the default interval.
            continue

        # Field values read from CONFIG_DB are typically strings, so convert
        # explicitly; dividing the raw value by 1000 would raise TypeError.
        try:
            interval_msecs = int(raw_interval)
        except (TypeError, ValueError):
            syslog.syslog(syslog.LOG_WARNING,
                          "Ignoring invalid alert_interval '{}' for process '{}' in {} table"
                          .format(raw_interval, process, HEARTBEAT_TABLE_NAME))
            continue

        heartbeat_alert_interval_mapping[process] = interval_msecs / 1000
130+
131+
def get_heartbeat_alert_interval(process):
    """
    @summary: Look up the alert interval recorded for a process.
    @param process: Key to look up in heartbeat_alert_interval_mapping.
    @return: The value stored for 'process' when present, otherwise
             ALERTING_INTERVAL_SECS.
    """
    # .get() never invokes the defaultdict factory, so a lookup miss does not
    # insert a new entry -- same as the explicit membership test.
    return heartbeat_alert_interval_mapping.get(process, ALERTING_INTERVAL_SECS)
136+
117137
def publish_events(events_handle, process_name, container_name):
118138
params = swsscommon.FieldValueMap()
119139
params["process_name"] = process_name
@@ -136,6 +156,8 @@ def main(argv):
136156

137157
critical_group_list, critical_process_list = get_group_and_process_list(CRITICAL_PROCESSES_FILE)
138158

159+
load_heartbeat_alert_interval(use_unix_socket_path)
160+
139161
# WATCH_PROCESSES_FILE is optional
140162
watch_process_list = []
141163
if os.path.exists(WATCH_PROCESSES_FILE):
@@ -211,7 +233,7 @@ def main(argv):
211233
for process in process_heart_beat_info.keys():
212234
epoch_time = time.time()
213235
elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"]
214-
if elapsed_secs >= ALERTING_INTERVAL_SECS:
236+
if elapsed_secs >= get_heartbeat_alert_interval(process):
215237
elapsed_mins = elapsed_secs // 60
216238
generate_alerting_message(process, "stuck", elapsed_mins, syslog.LOG_WARNING)
217239

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
// YANG model for the HEARTBEAT table of config_db.json: one list entry per
// process name, carrying its heartbeat and alert intervals.
module sonic-heartbeat {

    yang-version 1.1;

    namespace "http://github.com/sonic-net/sonic-heartbeat";
    prefix heartbeat;

    organization
        "SONiC";

    contact
        "SONiC";

    description "HEARTBEAT YANG Module for SONiC OS";

    revision 2025-01-09 {
        description "First Revision";
    }

    container sonic-heartbeat {

        container HEARTBEAT {
            description "HEARTBEAT config TABLE part of config_db.json";

            list HEARTBEAT_LIST {
                key "name";

                leaf name {
                    description "Process name in HEARTBEAT table";
                    type string {
                        length 1..32;
                    }
                }

                // NOTE(review): this default (100000 ms) is larger than the
                // alert_interval default below (60000 ms) -- confirm that the
                // intended value is not 10000.
                leaf heartbeat_interval {
                    description "Heartbeat interval in milliseconds";
                    type uint32;
                    default "100000";
                }

                leaf alert_interval {
                    description "Alert interval in milliseconds";
                    type uint32;
                    default "60000";
                }
            }
        }
    }
}

0 commit comments

Comments
 (0)
Please sign in to comment.