Skip to content

Commit 4ff8153

Browse files
authored
Add a delay between killing teamd processes (sonic-net#3325)
* Add a delay between killing teamd processes When killing 10 or more teamd processes, add a delay of 0.1 seconds after every 10 kill signals/proceses. This is because in the LAG scale tests (in `ecmp/inner_hashing/test_inner_hashing_lag.py` in sonic-mgmt), it may create 100 LAGs, and when destroying them all, some of those LAGs may fail to be properly destroyed, leaving some stale port channels around. This seems to be because the netlink socket buffers on which the teamd processes get notifications become full with events of the other port channels/interfaces going down. As a workaround, add some delays in killing the teamd processes, so that the netlink buffers don't become full, causing messages to get dropped. This delay was randomly chosen, and it seems to work well with 100 LAGs on a KVM. It can probably made to be a bit more aggressive if needed (i.e. maybe 0.05 seconds every 20 processes).
1 parent 54a499b commit 4ff8153

File tree

3 files changed

+241
-74
lines changed

3 files changed

+241
-74
lines changed

cfgmgr/teammgr.cpp

+39-61
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
#include <net/if.h>
1717
#include <sys/ioctl.h>
1818
#include <sys/stat.h>
19+
#include <sys/wait.h>
20+
#include <sys/types.h>
1921
#include <signal.h>
2022

2123

@@ -171,18 +173,29 @@ void TeamMgr::cleanTeamProcesses()
171173
SWSS_LOG_ENTER();
172174
SWSS_LOG_NOTICE("Cleaning up LAGs during shutdown...");
173175

174-
std::unordered_map<std::string, pid_t> aliasPidMap;
176+
std::unordered_map<std::string, int> aliasPidMap;
175177

176178
for (const auto& alias: m_lagList)
177179
{
178-
std::string res;
179180
pid_t pid;
181+
// Sleep for 10 milliseconds so as to not overwhelm the netlink
182+
// socket buffers with events about interfaces going down
183+
std::this_thread::sleep_for(std::chrono::milliseconds(10));
180184

181185
try
182186
{
183-
std::stringstream cmd;
184-
cmd << "cat " << shellquote("/var/run/teamd/" + alias + ".pid");
185-
EXEC_WITH_ERROR_THROW(cmd.str(), res);
187+
ifstream pidFile("/var/run/teamd/" + alias + ".pid");
188+
if (pidFile.is_open())
189+
{
190+
pidFile >> pid;
191+
aliasPidMap[alias] = pid;
192+
SWSS_LOG_INFO("Read port channel %s pid %d", alias.c_str(), pid);
193+
}
194+
else
195+
{
196+
SWSS_LOG_NOTICE("Unable to read pid file for %s, skipping...", alias.c_str());
197+
continue;
198+
}
186199
}
187200
catch (const std::exception &e)
188201
{
@@ -191,46 +204,28 @@ void TeamMgr::cleanTeamProcesses()
191204
continue;
192205
}
193206

194-
try
195-
{
196-
pid = static_cast<pid_t>(std::stoul(res, nullptr, 10));
197-
aliasPidMap[alias] = pid;
198-
199-
SWSS_LOG_INFO("Read port channel %s pid %d", alias.c_str(), pid);
200-
}
201-
catch (const std::exception &e)
207+
if (kill(pid, SIGTERM))
202208
{
203-
SWSS_LOG_ERROR("Failed to read port channel %s pid: %s", alias.c_str(), e.what());
204-
continue;
209+
SWSS_LOG_ERROR("Failed to send SIGTERM to port channel %s pid %d: %s", alias.c_str(), pid, strerror(errno));
210+
aliasPidMap.erase(alias);
205211
}
206-
207-
try
212+
else
208213
{
209-
std::stringstream cmd;
210-
cmd << "kill -TERM " << pid;
211-
EXEC_WITH_ERROR_THROW(cmd.str(), res);
212-
213214
SWSS_LOG_NOTICE("Sent SIGTERM to port channel %s pid %d", alias.c_str(), pid);
214215
}
215-
catch (const std::exception &e)
216-
{
217-
SWSS_LOG_ERROR("Failed to send SIGTERM to port channel %s pid %d: %s", alias.c_str(), pid, e.what());
218-
aliasPidMap.erase(alias);
219-
}
220216
}
221217

222218
for (const auto& cit: aliasPidMap)
223219
{
224220
const auto &alias = cit.first;
225221
const auto &pid = cit.second;
226222

227-
std::stringstream cmd;
228-
std::string res;
229-
230223
SWSS_LOG_NOTICE("Waiting for port channel %s pid %d to stop...", alias.c_str(), pid);
231224

232-
cmd << "tail -f --pid=" << pid << " /dev/null";
233-
EXEC_WITH_ERROR_THROW(cmd.str(), res);
225+
while (!kill(pid, 0))
226+
{
227+
std::this_thread::sleep_for(std::chrono::milliseconds(10));
228+
}
234229
}
235230

236231
SWSS_LOG_NOTICE("LAGs cleanup is done");
@@ -658,42 +653,25 @@ bool TeamMgr::removeLag(const string &alias)
658653
{
659654
SWSS_LOG_ENTER();
660655

661-
stringstream cmd;
662-
string res;
663656
pid_t pid;
664657

665-
try
666-
{
667-
std::stringstream cmd;
668-
cmd << "cat " << shellquote("/var/run/teamd/" + alias + ".pid");
669-
EXEC_WITH_ERROR_THROW(cmd.str(), res);
670-
}
671-
catch (const std::exception &e)
672658
{
673-
SWSS_LOG_NOTICE("Failed to remove non-existent port channel %s pid...", alias.c_str());
674-
return false;
675-
}
676-
677-
try
678-
{
679-
pid = static_cast<pid_t>(std::stoul(res, nullptr, 10));
680-
SWSS_LOG_INFO("Read port channel %s pid %d", alias.c_str(), pid);
681-
}
682-
catch (const std::exception &e)
683-
{
684-
SWSS_LOG_ERROR("Failed to read port channel %s pid: %s", alias.c_str(), e.what());
685-
return false;
659+
ifstream pidfile("/var/run/teamd/" + alias + ".pid");
660+
if (pidfile.is_open())
661+
{
662+
pidfile >> pid;
663+
SWSS_LOG_INFO("Read port channel %s pid %d", alias.c_str(), pid);
664+
}
665+
else
666+
{
667+
SWSS_LOG_NOTICE("Failed to remove non-existent port channel %s pid...", alias.c_str());
668+
return false;
669+
}
686670
}
687671

688-
try
689-
{
690-
std::stringstream cmd;
691-
cmd << "kill -TERM " << pid;
692-
EXEC_WITH_ERROR_THROW(cmd.str(), res);
693-
}
694-
catch (const std::exception &e)
672+
if (kill(pid, SIGTERM))
695673
{
696-
SWSS_LOG_ERROR("Failed to send SIGTERM to port channel %s pid %d: %s", alias.c_str(), pid, e.what());
674+
SWSS_LOG_ERROR("Failed to send SIGTERM to port channel %s pid %d: %s", alias.c_str(), pid, strerror(errno));
697675
return false;
698676
}
699677

tests/mock_tests/Makefile.am

+2-2
Original file line numberDiff line numberDiff line change
@@ -227,8 +227,8 @@ tests_teammgrd_SOURCES = teammgrd/teammgr_ut.cpp \
227227
tests_teammgrd_INCLUDES = $(tests_INCLUDES) -I$(top_srcdir)/cfgmgr -I$(top_srcdir)/lib
228228
tests_teammgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_GTEST) $(CFLAGS_SAI)
229229
tests_teammgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_GTEST) $(CFLAGS_SAI) $(tests_teammgrd_INCLUDES)
230-
tests_teammgrd_LDADD = $(LDADD_GTEST) $(LDADD_SAI) -lnl-genl-3 -lhiredis -lhiredis \
231-
-lswsscommon -lswsscommon -lgtest -lgtest_main -lzmq -lnl-3 -lnl-route-3 -lpthread -lgmock -lgmock_main
230+
tests_teammgrd_LDADD = $(LDADD_GTEST) $(LDADD_SAI) -ldl -lhiredis \
231+
-lswsscommon -lgtest -lgtest_main -lzmq -lpthread -lgmock -lgmock_main
232232

233233
## fpmsyncd unit tests
234234

0 commit comments

Comments
 (0)