@@ -31,6 +31,9 @@ CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes'
31
31
# The FEATURE table in config db contains auto-restart field
32
32
FEATURE_TABLE_NAME = 'FEATURE'
33
33
34
+ # The HEARTBEAT table in config db contains the per-process heartbeat configuration (e.g. the 'alert_interval' field)
35
+ HEARTBEAT_TABLE_NAME = 'HEARTBEAT'
36
+
34
37
# Value of parameter 'timeout' in select(...) method
35
38
SELECT_TIMEOUT_SECS = 1.0
36
39
@@ -40,6 +43,8 @@ ALERTING_INTERVAL_SECS = 60
40
43
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
41
44
EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"
42
45
46
# Maps a HEARTBEAT table key (process name) to the alerting interval that is
# compared against the elapsed seconds since the last heartbeat.
# A plain dict is used on purpose: looking up a process without an entry must
# never create one implicitly, otherwise an empty placeholder could be taken
# for a configured interval.
heartbeat_alert_interval_mapping = {}
47
+
43
48
def get_group_and_process_list (process_file ):
44
49
"""
45
50
@summary: Read the critical processes/group names.
@@ -114,6 +119,21 @@ def get_autorestart_state(container_name, use_unix_socket_path):
114
119
115
120
return is_auto_restart
116
121
122
def load_heartbeat_alert_interval(use_unix_socket_path):
    """
    @summary: Read the HEARTBEAT table from config db and record, for every
              process listed there, its 'alert_interval' divided by 1000 in
              heartbeat_alert_interval_mapping.
    @param use_unix_socket_path: Forwarded unchanged to ConfigDBConnector.
    @return: None. Entries whose 'alert_interval' is missing or not numeric
             are logged and skipped, so callers fall back to the default
             alerting interval for those processes.
    """
    config_db = swsscommon.ConfigDBConnector(use_unix_socket_path=use_unix_socket_path)
    config_db.connect()

    heartbeat_table = config_db.get_table(HEARTBEAT_TABLE_NAME)
    if not heartbeat_table:
        return

    for process, fields in heartbeat_table.items():
        raw_interval = fields.get('alert_interval')
        try:
            # Field values may arrive as strings (or be absent); dividing the
            # raw value directly would raise TypeError and abort start-up.
            interval = float(raw_interval)
        except (TypeError, ValueError):
            syslog.syslog(syslog.LOG_WARNING,
                          "Ignoring invalid alert_interval '{}' for process '{}' in {} table"
                          .format(raw_interval, process, HEARTBEAT_TABLE_NAME))
            continue

        heartbeat_alert_interval_mapping[process] = interval / 1000
130
+
131
def get_heartbeat_alert_interval(process):
    """
    @summary: Look up the alerting interval to use for a process.
    @param process: Key to look up in heartbeat_alert_interval_mapping.
    @return: The value stored for the process if it has an entry,
             otherwise ALERTING_INTERVAL_SECS.
    """
    # dict.get never inserts a key, so a miss leaves the mapping untouched.
    return heartbeat_alert_interval_mapping.get(process, ALERTING_INTERVAL_SECS)
136
+
117
137
def publish_events (events_handle , process_name , container_name ):
118
138
params = swsscommon .FieldValueMap ()
119
139
params ["process_name" ] = process_name
@@ -136,6 +156,8 @@ def main(argv):
136
156
137
157
critical_group_list , critical_process_list = get_group_and_process_list (CRITICAL_PROCESSES_FILE )
138
158
159
+ load_heartbeat_alert_interval (use_unix_socket_path )
160
+
139
161
# WATCH_PROCESSES_FILE is optional
140
162
watch_process_list = []
141
163
if os .path .exists (WATCH_PROCESSES_FILE ):
@@ -211,7 +233,7 @@ def main(argv):
211
233
for process in process_heart_beat_info .keys ():
212
234
epoch_time = time .time ()
213
235
elapsed_secs = epoch_time - process_heart_beat_info [process ]["last_heart_beat" ]
214
- if elapsed_secs >= ALERTING_INTERVAL_SECS :
236
+ if elapsed_secs >= get_heartbeat_alert_interval ( process ) :
215
237
elapsed_mins = elapsed_secs // 60
216
238
generate_alerting_message (process , "stuck" , elapsed_mins , syslog .LOG_WARNING )
217
239
0 commit comments