Skip to content

Commit 4ae1b85

Browse files
committed
feat(system_monitor): add diagnostic feature for monitoring swap usage
1 parent efe7c62 commit 4ae1b85

File tree

3 files changed

+65
-0
lines changed

3 files changed

+65
-0
lines changed

system/system_monitor/config/mem_monitor.param.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,5 @@
33
available_size: 1024 # MB
44
usage_timeout: 5 # sec
55
ecc_timeout: 5 # sec
6+
swap_usage_warn: 0.25 # ratio of swap in use (0.0-1.0), i.e. 0.25 = 25%
7+
swap_usage_error: 0.75 # ratio of swap in use (0.0-1.0), i.e. 0.75 = 75%

system/system_monitor/include/system_monitor/mem_monitor/mem_monitor.hpp

+11
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,15 @@ class MemMonitor : public rclcpp::Node
5757
void checkUsage(
5858
diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references)
5959

60+
/**
61+
* @brief check Swap usage
62+
* @param [out] stat diagnostic message passed directly to diagnostic publish calls
63+
* @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference
64+
* to pass diagnostic message updated in this function to diagnostic publish calls.
65+
*/
66+
void checkSwapUsage(
67+
diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references)
68+
6069
/**
6170
* @brief check Memory ECC
6271
* @param [out] stat diagnostic message passed directly to diagnostic publish calls
@@ -97,6 +106,8 @@ class MemMonitor : public rclcpp::Node
97106
size_t available_size_; //!< @brief Memory available size to generate error
98107
int usage_timeout_; //!< @brief Timeout duration for executing readUsage
99108
int ecc_timeout_; //!< @brief Timeout duration for executing edac-util command
109+
float swap_usage_warn_; //!< @brief Swap usage ratio (0.0-1.0) to generate warning
110+
float swap_usage_error_; //!< @brief Swap usage ratio (0.0-1.0) to generate error
100111

101112
rclcpp::TimerBase::SharedPtr
102113
timer_; //!< @brief Timer to execute readUsage and edac-utils command

system/system_monitor/src/mem_monitor/mem_monitor.cpp

+52
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ MemMonitor::MemMonitor(const rclcpp::NodeOptions & options)
3838
available_size_(declare_parameter<int>("available_size", 1024) * 1024 * 1024),
3939
usage_timeout_(declare_parameter<int>("usage_timeout", 5)),
4040
ecc_timeout_(declare_parameter<int>("ecc_timeout", 5)),
41+
swap_usage_warn_(declare_parameter<float>("swap_usage_warn", 0.25)),
42+
swap_usage_error_(declare_parameter<float>("swap_usage_error", 0.75)),
4143
usage_elapsed_ms_(0.0),
4244
ecc_elapsed_ms_(0.0),
4345
use_edac_util_(false)
@@ -47,6 +49,7 @@ MemMonitor::MemMonitor(const rclcpp::NodeOptions & options)
4749
gethostname(hostname_, sizeof(hostname_));
4850
updater_.setHardwareID(hostname_);
4951
updater_.add("Memory Usage", this, &MemMonitor::checkUsage);
52+
updater_.add("Swap Usage", this, &MemMonitor::checkSwapUsage);
5053

5154
// Start timer to execute checkUsage and checkEcc
5255
timer_callback_group_ = this->create_callback_group(rclcpp::CallbackGroupType::MutuallyExclusive);
@@ -116,6 +119,54 @@ void MemMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat)
116119
stat.addf("execution time", "%f ms", elapsed_ms);
117120
}
118121

122+
/**
 * Fills `stat` with the swap usage diagnostic: the "Swap: usage/total/used/free" entries,
 * a summary level and message, and the "execution time" entry.
 * All inputs come from a copy of usage_error_str_, usage_map_ and usage_elapsed_ms_
 * taken while holding usage_mutex_.
 * @param [out] stat diagnostic status that receives the summary and key/value entries
 */
void MemMonitor::checkSwapUsage(diagnostic_updater::DiagnosticStatusWrapper & stat)
123+
{
124+
std::string error_str;
125+
std::map<std::string, size_t> map;
126+
double elapsed_ms;
127+
128+
// Copy the shared usage results into locals while holding usage_mutex_,
// so the rest of this function works on the copies without the lock.
129+
{
130+
std::lock_guard<std::mutex> lock(usage_mutex_);
131+
error_str = usage_error_str_;
132+
map = usage_map_;
133+
elapsed_ms = usage_elapsed_ms_;
134+
}
135+
136+
// A non-empty error string is reported as ERROR and no swap values are added.
if (!error_str.empty()) {
137+
stat.summary(DiagStatus::ERROR, "readUsage error");
138+
stat.add("readUsage", error_str);
139+
return;
140+
}
141+
142+
// Divide the stored "Swap: usage" value by 100 to get a ratio, then compare it
// with swap_usage_error_ / swap_usage_warn_ to choose the diagnostic level.
// NOTE(review): the map holds size_t values, so the value read here is a whole
// number — confirm that this resolution is sufficient for the thresholds.
143+
const auto swap_usage = static_cast<double>(map["Swap: usage"]) / 1e+2;
144+
int level = DiagStatus::OK;
145+
146+
if (swap_usage >= swap_usage_error_) {
147+
level = std::max(level, static_cast<int>(DiagStatus::ERROR));
148+
} else if (swap_usage >= swap_usage_warn_) {
149+
level = std::max(level, static_cast<int>(DiagStatus::WARN));
150+
}
151+
152+
// "Swap: usage" is printed with a trailing '%'; the other entries go through toHumanReadable.
stat.addf("Swap: usage", "%.2f%%", static_cast<double>(map["Swap: usage"]));
153+
stat.add("Swap: total", toHumanReadable(std::to_string(map["Swap: total"])));
154+
stat.add("Swap: used", toHumanReadable(std::to_string(map["Swap: used"])));
155+
stat.add("Swap: free", toHumanReadable(std::to_string(map["Swap: free"])));
156+
157+
// An ERROR level is always reported as-is. Otherwise a zero elapsed time, or an
// elapsed time above usage_timeout_ * 1000, replaces the summary with a WARN message;
// only when neither applies is the computed level (OK or WARN) reported.
if (level == DiagStatus::ERROR) {
158+
stat.summary(level, usage_dict_.at(level));
159+
} else if (elapsed_ms == 0.0) {
160+
stat.summary(DiagStatus::WARN, "do not execute readUsage yet");
161+
} else if (elapsed_ms > usage_timeout_ * 1000) {
162+
stat.summary(DiagStatus::WARN, "readUsage timeout expired");
163+
} else {
164+
stat.summary(level, usage_dict_.at(level));
165+
}
166+
167+
stat.addf("execution time", "%f ms", elapsed_ms);
168+
}
169+
119170
void MemMonitor::checkEcc(diagnostic_updater::DiagnosticStatusWrapper & stat)
120171
{
121172
std::string error_str;
@@ -252,6 +303,7 @@ std::string MemMonitor::readUsage(std::map<std::string, size_t> & map)
252303
map["Swap: total"] = swap_total;
253304
map["Swap: used"] = swap_used;
254305
map["Swap: free"] = swap_free;
306+
map["Swap: usage"] = (swap_total > 0) ? static_cast<double>(swap_used) / swap_total * 1e+2 : 0.0;
255307

256308
size_t total_total = mem_total + swap_total;
257309
size_t total_used = mem_used + swap_used;

0 commit comments

Comments
 (0)