Skip to content

Commit

Permalink
detect netdata exit reasons (netdata#19617)
Browse files Browse the repository at this point in the history
* detect netdata exit reasons

* log exit initiated

* commented debug logs

* commented debug logs again

* fix windows system shutdown detection

* commented debug logs again

* added exit reason msgid

* test shutdown detection by writing to exit.reason

* implement status file loading/saving

* accept also the shutdown event

* fix windows logs

* run as service from the script - not working yet

* save the first fatal message into the status file

* save memory information in the status file

* load machine guid early enough

* fix loading sequence

* simplify function run once logic; add dependencies on netdata.conf loading when required

* accept service parameter

* build for packaging is required for services

* log last exit status with a proper message; log node id and claim id in the status file

* added /var/cache disk space; fixed bug in rfc3339 parsing

* change log priority based on condition

* SIGINT is normal exit under windows

* wait for the wevt provider to be initialized before logging

* Revert "fix windows logs (netdata#19632)"

This reverts commit d8c3dc0.

* fix windows logs - the right way

* set default event log sizes

* added detection of netdata update

* added systemd dbus watcher for systemd shutdown/suspend events

* log system shutdown

* detect system reboot in a better way

* cleanup static thread

* on fatal, call _exit(); libunwind should not skip top calls on the stack

* make the sd bus watcher exit on netdata shutdown

* make the netdata agent version log also print the last exit status

* start watcher when shutdown is initiated; prevent double logging of shutdown initiation

* prepare for sending reports

* a single read per receiver

* track memory calls per worker

* use 4 malloc arenas on parents

* spread higher tiers flushing over time

* pgc and replication tuning

* on child disconnect, get retention from the rrdcontexts worker

* BUFFER: the default size is now 1024 bytes

* use dedicated jemalloc arena for judy allocations

* ARAL: do not double the page size unconditionally; cleanup old members

* double pgc partitions

* fix compiler warning

* make the default replication commit buffer big enough to avoid constant realloc

* post crash reports

* revert log2journal changes

* log2journal minor

* disable the crash report when there was no status file

* increase buffer sizes

* added os_boottime() and os_boot_id(), which are now used in the status file

* log2journal: convert \u000A to \n

* fix headers includes

* fix compilation on non-linux

* for host prefix when getting boot_id and boottime

* write status file to /run/netdata too

* fix /run/netdata on startup

* move the IPC pipe inside the run directory

* exclusive file lock to avoid running concurrently

* allow netdatacli to run from any user and still find the run dir of netdata

* fix pipe failure message

* fix nested loop sharing same variable in ADCS

* fix run_dir and netdatacli on windows

* fix status files on windows

* initialize nd_threads early enough to allow creating threads during initialization

* fix compiler warnings

* on shutdown ignore points with delayed flushing

* fix macos compilation

* added os_type to daemon status

* make daemon status schema ecs compatible

* save daemon status file on every signal

* fix external plugins log to journal

* use special allocators for judy, only on netdata - not the external plugins

* systemd-cat-native: default newline string is \n

* when generating json, prefer special 2 character sequences for common control characters

* fix daemon-status filenames

* log errors when the status file cannot be opened/saved/parsed

* make status file world readable

* do not write status file in /run/netdata; add fall back locations when the file cannot be saved in the cache dir

* move ram and disk into host

* simplified inline subobject parsing for jsonc

* ensure path is an array of at least 128 bytes

* fix non-linux compilation
  • Loading branch information
ktsaou authored Feb 24, 2025
1 parent 84f48b5 commit d28b61e
Show file tree
Hide file tree
Showing 136 changed files with 3,510 additions and 934 deletions.
73 changes: 73 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,41 @@ if(ENABLE_MIMALLOC)
netdata_add_mimalloc()
endif()

# Optional jemalloc support (off by default).
#
# When ENABLE_JEMALLOC is ON, jemalloc is located through pkg-config and then
# probed for the mallctl() arena API. The probe result is stored in
# HAVE_JEMALLOC_ARENA_API. Since jemalloc can only be requested explicitly,
# a missing library or a missing arena API aborts the configuration.
option(ENABLE_JEMALLOC "Enable jemalloc allocator" OFF)

if(ENABLE_JEMALLOC)
        pkg_check_modules(JEMALLOC QUIET jemalloc)

        if(NOT JEMALLOC_FOUND)
                message(FATAL_ERROR "Jemalloc support was explicitly enabled but jemalloc was not found")
        endif()

        include(CheckCSourceCompiles)
        include(CMakePushCheckState)

        # Keep the CMAKE_REQUIRED_* changes local to this probe, so the jemalloc
        # include paths and libraries do not leak into the checks that follow.
        cmake_push_check_state()
        set(CMAKE_REQUIRED_INCLUDES ${JEMALLOC_INCLUDE_DIRS})
        set(CMAKE_REQUIRED_LIBRARIES ${JEMALLOC_LIBRARIES})

        # Check if jemalloc has arena API
        check_c_source_compiles("
                #include <jemalloc/jemalloc.h>
                int main() {
                        unsigned narenas;
                        size_t sz = sizeof(narenas);
                        mallctl(\"arenas.narenas\", &narenas, &sz, NULL, 0);
                        return 0;
                }
        " HAVE_JEMALLOC_ARENA_API)

        cmake_pop_check_state()

        if(NOT HAVE_JEMALLOC_ARENA_API)
                message(FATAL_ERROR "Jemalloc was found but does not have arena API support")
        endif()

        message(STATUS "Jemalloc found with arena API support - enabling")
endif()

if(ENABLE_PLUGIN_GO)
include(NetdataGoTools)

Expand Down Expand Up @@ -449,13 +484,24 @@ check_function_exists(arc4random_uniform HAVE_ARC4RANDOM_UNIFORM)
check_function_exists(getrandom HAVE_GETRANDOM)
check_function_exists(sysinfo HAVE_SYSINFO)

check_function_exists(timegm HAVE_TIMEGM)

#
# check source compilation
#

include(CheckCSourceCompiles)
include(CheckCXXSourceCompiles)

# Probe whether 'struct tm' from <time.h> has a 'tm_gmtoff' member.
# The snippet only compiles when the member exists; the outcome is stored
# in HAVE_TM_GMTOFF.
check_c_source_compiles("
#include <time.h>
int main(void) {
struct tm t;
(void)t.tm_gmtoff;
return 0;
}
" HAVE_TM_GMTOFF)

set(CMAKE_REQUIRED_LIBRARIES pthread)
check_c_source_compiles("
#define _GNU_SOURCE
Expand Down Expand Up @@ -994,6 +1040,22 @@ set(LIBNETDATA_FILES
src/libnetdata/os/get_system_pagesize.h
src/libnetdata/os/hostname.c
src/libnetdata/os/hostname.h
src/libnetdata/exit/exit_initiated.c
src/libnetdata/exit/exit_initiated.h
src/libnetdata/os/disk_space.c
src/libnetdata/os/disk_space.h
src/libnetdata/os/file_metadata.c
src/libnetdata/os/file_metadata.h
src/libnetdata/os/process_path.c
src/libnetdata/os/process_path.h
src/libnetdata/os/boottime.c
src/libnetdata/os/boottime.h
src/libnetdata/os/boot_id.c
src/libnetdata/os/boot_id.h
src/libnetdata/os/run_dir.c
src/libnetdata/os/run_dir.h
src/libnetdata/os/file_lock.c
src/libnetdata/os/file_lock.h
)

list(APPEND LIBNETDATA_FILES ${INICFG_FILES})
Expand Down Expand Up @@ -1182,8 +1244,12 @@ set(DAEMON_FILES
src/daemon/pulse/pulse-db-dbengine-retention.h
src/daemon/pulse/pulse-parents.c
src/daemon/pulse/pulse-parents.h
src/daemon/daemon-status-file.c
src/daemon/daemon-status-file.h
src/daemon/config/netdata-conf-ssl.c
src/daemon/config/netdata-conf-ssl.h
src/daemon/daemon-systemd-watcher.c
src/daemon/daemon-systemd-watcher.h
)

set(H2O_FILES
Expand Down Expand Up @@ -2170,6 +2236,13 @@ netdata_add_jsonc_to_target(libnetdata)

netdata_add_libyaml_to_target(libnetdata)

# jemalloc: when enabled, attach the pkg-config results (include paths,
# extra compiler flags and libraries) to libnetdata as PUBLIC usage
# requirements, so that targets linking libnetdata inherit them.
if(ENABLE_JEMALLOC)
        target_include_directories(libnetdata PUBLIC ${JEMALLOC_INCLUDE_DIRS})
        target_compile_options(libnetdata PUBLIC ${JEMALLOC_CFLAGS_OTHER})
        target_link_libraries(libnetdata PUBLIC ${JEMALLOC_LIBRARIES})
endif()

# libunwind
if(ENABLE_LIBUNWIND)
pkg_check_modules(LIBUNWIND libunwind IMPORTED_TARGET)
Expand Down
3 changes: 3 additions & 0 deletions packaging/cmake/config.cmake.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@
#cmakedefine HAVE_RAND_S
#cmakedefine HAVE_GETRANDOM
#cmakedefine HAVE_SYSINFO
#cmakedefine HAVE_TIMEGM
#cmakedefine HAVE_TM_GMTOFF

#cmakedefine HAVE_LIBUNWIND
#cmakedefine HAVE_BACKTRACE
Expand Down Expand Up @@ -116,6 +118,7 @@
#cmakedefine HAVE_FUNC_ATTRIBUTE_NORETURN
#cmakedefine HAVE_FUNC_ATTRIBUTE_RETURNS_NONNULL
#cmakedefine HAVE_FUNC_ATTRIBUTE_WARN_UNUSED_RESULT
#cmakedefine HAVE_JEMALLOC_ARENA_API

// enabled features

Expand Down
59 changes: 49 additions & 10 deletions packaging/utils/compile-and-run-windows.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#!/bin/sh
#!/bin/bash

RUN_AS_SERVICE=0

# On MSYS2, install these dependencies to build netdata:
install_dependencies() {
Expand All @@ -21,13 +23,21 @@ install_dependencies() {
msys/libcurl msys/libcurl-devel
}

BUILD_FOR_PACKAGING="Off"

if [ "${1}" = "install" ]
then
install_dependencies || exit 1
exit 0
fi

BUILD_FOR_PACKAGING="Off"
if [ "${1}" = "service" ]
then
RUN_AS_SERVICE=1
BUILD_FOR_PACKAGING="On"
shift
fi

if [ "${1}" = "package" ]
then
BUILD_FOR_PACKAGING="On"
Expand Down Expand Up @@ -70,12 +80,18 @@ then
${NULL}
fi

echo "Compiling Netdata..."
ninja -v -C "${build}" || ninja -v -C "${build}" -j 1

echo "Stopping service Netdata"
sc stop "Netdata" || echo "Failed"
echo "Stopping service Netdata..."
sc stop "Netdata" || echo "stop Failed, ok"

if [ $RUN_AS_SERVICE -eq 1 ]; then
sc delete "Netdata" || echo "delete Failed, ok"
fi

ninja -v -C "${build}" install || ninja -v -C "${build}" -j 1
rm -f /opt/netdata/usr/bin/*.dll || echo "deleting old .dll files failed, ok"
ninja -v -C "${build}" install

# register the event log publisher
cmd.exe //c "$(cygpath -w -a "/opt/netdata/usr/bin/wevt_netdata_install.bat")"
Expand All @@ -84,9 +100,32 @@ cmd.exe //c "$(cygpath -w -a "/opt/netdata/usr/bin/wevt_netdata_install.bat")"
#echo "Compile with:"
#echo "ninja -v -C \"${build}\" install || ninja -v -C \"${build}\" -j 1"

echo "starting netdata..."
# enable JIT debug with gdb
export MSYS="error_start:$(cygpath -w /usr/bin/gdb)"
if [ $RUN_AS_SERVICE -eq 1 ]; then
echo
echo "Copying library files to /opt/netdata/usr/bin ..."
# Copy the shared libraries that netdata resolves from /usr/bin into the
# install directory: ldd lists the dependencies, grep keeps the ones that
# resolve under /usr/bin/, sed normalizes the whitespace and cut extracts
# the resolved path (third field).
ldd /opt/netdata/usr/bin/netdata |\
  grep " => /usr/bin/" |\
  sed -e 's|\s\+| |g' -e 's|^ ||g' |\
  cut -d ' ' -f 3 |\
  while read -r lib; do
    # -r keeps backslashes intact; quoting prevents word splitting and
    # globbing of the library path.
    cp "$lib" /opt/netdata/usr/bin/
  done

rm -rf /opt/netdata/var/log/netdata/*.log || echo
/opt/netdata/usr/bin/netdata -D
echo
echo "Registering Netdata service..."
sc create "Netdata" binPath= "$(cygpath.exe -w /opt/netdata/usr/bin/netdata.exe)" start= auto

echo "Starting Netdata service..."
sc start "Netdata"

else

echo "Starting netdata..."

# enable JIT debug with gdb
export MSYS="error_start:$(cygpath -w /usr/bin/gdb)"

rm -rf /opt/netdata/var/log/netdata/*.log || echo
/opt/netdata/usr/bin/netdata -D

fi
14 changes: 7 additions & 7 deletions src/aclk/aclk.c
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ static int wait_till_agent_claimed(void)
* @param aclk_hostname points to location where string pointer to hostname will be set
* @param aclk_port port to int where port will be saved
*
* @return If non 0 returned irrecoverable error happened (or netdata_exit) and ACLK should be terminated
* @return If non 0 returned irrecoverable error happened (or exit_initiated) and ACLK should be terminated
*/
static int wait_till_agent_claim_ready()
{
Expand Down Expand Up @@ -306,7 +306,7 @@ static int handle_connection(mqtt_wss_client client)
{
while (service_running(SERVICE_ACLK)) {
// timeout 1000 to check at least once a second
// for netdata_exit
// for exit_initiated
int rc = mqtt_wss_service(client, 1000);
if (rc < 0){
worker_is_busy(WORKER_ACLK_DISCONNECTED);
Expand Down Expand Up @@ -452,9 +452,9 @@ static unsigned long aclk_reconnect_delay() {
return aclk_tbeb_delay(0, aclk_env->backoff.base, aclk_env->backoff.min_s, aclk_env->backoff.max_s);
}

/* Block till aclk_reconnect_delay is satisfied or netdata_exit is signalled
/* Block till aclk_reconnect_delay is satisfied or exit_initiated is signalled
* @return 0 - Go ahead and connect (delay expired)
* 1 - netdata_exit
* 1 - exit_initiated
*/
#define NETDATA_EXIT_POLL_MS (MSEC_PER_SEC/4)
static int aclk_block_till_recon_allowed() {
Expand All @@ -466,7 +466,7 @@ static int aclk_block_till_recon_allowed() {
nd_log(NDLS_DAEMON, NDLP_DEBUG,
"Wait before attempting to reconnect in %.3f seconds", recon_delay / (float)MSEC_PER_SEC);

// we want to wake up from time to time to check netdata_exit
// we want to wake up from time to time to check exit_initiated
worker_is_busy(WORKER_ACLK_WAITING_TO_CONNECT);
while (recon_delay)
{
Expand Down Expand Up @@ -602,7 +602,7 @@ const char *aclk_cloud_base_url = NULL;
* @param client instance of mqtt_wss_client
* @return 0 - Successful Connection,
* <0 - Irrecoverable Error -> Kill ACLK,
* >0 - netdata_exit
* >0 - exit_initiated
*/
#define CLOUD_BASE_URL_READ_RETRY 30
#ifdef ACLK_SSL_ALLOW_SELF_SIGNED
Expand Down Expand Up @@ -865,7 +865,7 @@ void *aclk_main(void *ptr)
mqtt_wss_set_max_buf_size(mqttwss_client, 25*1024*1024);

// Keep reconnecting and talking until our time has come
// and the Grim Reaper (netdata_exit) calls
// and the Grim Reaper (exit_initiated) calls
netdata_log_info("ACLK: Starting ACLK query event loop");
aclk_query_init(mqttwss_client);
do {
Expand Down
2 changes: 2 additions & 0 deletions src/claim/cloud-conf.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ static void cloud_conf_load_defaults(void) {
}

void cloud_conf_load(int silent) {
netdata_conf_section_directories();

errno_clear();
char *filename = filename_from_path_entry_strdupz(netdata_configured_cloud_dir, "cloud.conf");
int ret = inicfg_load(&cloud_config, filename, 1, NULL);
Expand Down
2 changes: 1 addition & 1 deletion src/cli/cli.c
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ static void connect_cb(uv_connect_t* req, int status)
(void)req;
if (status) {
fprintf(stderr, "uv_pipe_connect(): %s\n", uv_strerror(status));
fprintf(stderr, "Make sure the netdata service is running.\n");
fprintf(stderr, "Cannot connect to '%s'.\nMake sure the netdata service is running.\n", daemon_pipename());
exit(-1);
}
if (0 == command_string_size) {
Expand Down
4 changes: 2 additions & 2 deletions src/collectors/cgroups.plugin/cgroup-name.sh.in
Original file line number Diff line number Diff line change
Expand Up @@ -72,14 +72,14 @@ log() {

[[ -n "$level" && -n "$LOG_LEVEL" && "$level" -gt "$LOG_LEVEL" ]] && return

systemd-cat-native --log-as-netdata --newline="--NEWLINE--" <<EOFLOG
systemd-cat-native --log-as-netdata <<EOFLOG
INVOCATION_ID=${NETDATA_INVOCATION_ID}
SYSLOG_IDENTIFIER=${PROGRAM_NAME}
PRIORITY=${level}
THREAD_TAG=cgroup-name
ND_LOG_SOURCE=collector
ND_REQUEST=${cmd_line}
MESSAGE=${*//\\n/--NEWLINE--}
MESSAGE=${*//$'\n'/\\n}
EOFLOG
# AN EMPTY LINE IS NEEDED ABOVE
Expand Down
4 changes: 2 additions & 2 deletions src/collectors/cgroups.plugin/cgroup-network-helper.sh.in
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,14 @@ log() {

[[ -n "$level" && -n "$LOG_LEVEL" && "$level" -gt "$LOG_LEVEL" ]] && return

systemd-cat-native --log-as-netdata --newline="--NEWLINE--" <<EOFLOG
systemd-cat-native --log-as-netdata <<EOFLOG
INVOCATION_ID=${NETDATA_INVOCATION_ID}
SYSLOG_IDENTIFIER=${PROGRAM_NAME}
PRIORITY=${level}
THREAD_TAG=cgroup-network-helper
ND_LOG_SOURCE=collector
ND_REQUEST=${cmd_line}
MESSAGE=${*//\\n/--NEWLINE--}
MESSAGE=${*//$'\n'/\\n}
EOFLOG
# AN EMPTY LINE IS NEEDED ABOVE
Expand Down
4 changes: 2 additions & 2 deletions src/collectors/charts.d.plugin/charts.d.plugin.in
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,13 @@ log() {

[[ -n "$level" && -n "$LOG_LEVEL" && "$level" -gt "$LOG_LEVEL" ]] && return

systemd-cat-native --log-as-netdata --newline="--NEWLINE--" <<EOFLOG
systemd-cat-native --log-as-netdata <<EOFLOG
INVOCATION_ID=${NETDATA_INVOCATION_ID}
SYSLOG_IDENTIFIER=${PROGRAM_NAME}
PRIORITY=${level}
THREAD_TAG=charts.d.plugin
ND_LOG_SOURCE=collector
MESSAGE=${MODULE_NAME}: ${*//\\n/--NEWLINE--}
MESSAGE=${MODULE_NAME}: ${*//$'\n'/\\n}
EOFLOG
# AN EMPTY LINE IS NEEDED ABOVE
Expand Down
6 changes: 3 additions & 3 deletions src/collectors/cups.plugin/cups_plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ int main(int argc, char **argv) {
for (iteration = 0; 1; iteration++) {
heartbeat_next(&hb);

if (unlikely(netdata_exit))
if (unlikely(exit_initiated))
break;

reset_metrics();
Expand Down Expand Up @@ -315,7 +315,7 @@ int main(int argc, char **argv) {
}
cupsFreeDests(num_dest_total, dests);

if (unlikely(netdata_exit))
if (unlikely(exit_initiated))
break;

cups_job_t *jobs, *curr_job;
Expand Down Expand Up @@ -410,7 +410,7 @@ int main(int argc, char **argv) {

fflush(stdout);

if (unlikely(netdata_exit))
if (unlikely(exit_initiated))
break;

// restart check (14400 seconds)
Expand Down
2 changes: 1 addition & 1 deletion src/collectors/freebsd.plugin/plugin_freebsd.c
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ void *freebsd_main(void *ptr)

// initialize FreeBSD plugin
if (freebsd_plugin_init())
netdata_cleanup_and_exit(1, NULL, NULL, NULL);
netdata_cleanup_and_exit(EXIT_REASON_FATAL, NULL, NULL, NULL);

// check the enabled status for each module
int i;
Expand Down
Loading

0 comments on commit d28b61e

Please sign in to comment.