Skip to content

Commit

Permalink
daemon status improvements 3 (netdata#19707)
Browse files Browse the repository at this point in the history
* spawn an init spawn server while netdata runs; then stop it and run the final one

* stop the old one before dropping permissions

* remove the leading dot from spawn server filenames

* save the status file on every step during startup

* minor update

* add clarity about the double use of the function
  • Loading branch information
ktsaou authored Feb 25, 2025
1 parent f8e9791 commit 728e365
Show file tree
Hide file tree
Showing 10 changed files with 93 additions and 87 deletions.
2 changes: 1 addition & 1 deletion src/claim/claim-with-api.c
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ bool claim_agent(const char *url, const char *token, const char *rooms, const ch
bool done = false, can_retry = true;
size_t retries = 0;
do {
done = send_curl_request(registry_get_this_machine_guid(), registry_get_this_machine_hostname(), token, rooms, url, proxy, insecure, &can_retry);
done = send_curl_request(registry_get_this_machine_guid(true), registry_get_this_machine_hostname(), token, rooms, url, proxy, insecure, &can_retry);
if (done) break;
sleep_usec(300 * USEC_PER_MS + 100 * retries * USEC_PER_MS);
retries++;
Expand Down
2 changes: 1 addition & 1 deletion src/claim/cloud-conf.c
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ void cloud_conf_init_after_registry(void) {

// for machine guid and hostname we have to use inicfg_set(), so that they will be saved uncommented
if(!machine_guid || !*machine_guid)
inicfg_set(&cloud_config, CONFIG_SECTION_GLOBAL, "machine_guid", registry_get_this_machine_guid());
inicfg_set(&cloud_config, CONFIG_SECTION_GLOBAL, "machine_guid", registry_get_this_machine_guid(true));

if(!hostname || !*hostname)
inicfg_set(&cloud_config, CONFIG_SECTION_GLOBAL, "hostname", registry_get_this_machine_hostname());
Expand Down
18 changes: 13 additions & 5 deletions src/daemon/daemon-status-file.c
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ static DAEMON_STATUS_FILE daemon_status_file_get(DAEMON_STATUS status) {
else if(!UUIDiszero(last_session_status.host_id))
session_status.host_id = last_session_status.host_id;
else {
const char *machine_guid = registry_get_this_machine_guid();
const char *machine_guid = registry_get_this_machine_guid(false);
if(machine_guid && *machine_guid) {
if (uuid_parse_flexi(machine_guid, session_status.host_id.uuid) != 0)
session_status.host_id = UUID_ZERO;
Expand Down Expand Up @@ -296,8 +296,7 @@ static DAEMON_STATUS_FILE daemon_status_file_get(DAEMON_STATUS status) {
if(!session_status.os_id_like && last_session_status.os_id_like)
session_status.os_id_like = strdupz(last_session_status.os_id_like);

if(status == DAEMON_STATUS_RUNNING)
get_daemon_status_fields_from_system_info(&session_status);
get_daemon_status_fields_from_system_info(&session_status);

session_status.exit_reason = exit_initiated;
session_status.profile = nd_profile_detect_and_configure(false);
Expand Down Expand Up @@ -373,7 +372,7 @@ DAEMON_STATUS_FILE daemon_status_file_load(void) {
char current_filename[FILENAME_MAX];
time_t newest_mtime = 0, current_mtime;

// Check primary directory first
// Check the primary directory first
if(check_status_file(netdata_configured_cache_dir, current_filename, sizeof(current_filename), &current_mtime)) {
strncpyz(newest_filename, current_filename, sizeof(newest_filename) - 1);
newest_mtime = current_mtime;
Expand Down Expand Up @@ -675,14 +674,22 @@ bool daemon_status_file_was_incomplete_shutdown(void) {
return last_session_status.status == DAEMON_STATUS_EXITING;
}

/*
 * Record the startup step that is currently executing.
 *
 * The step label is kept in session_status.fatal.function, so that a status
 * file written while startup is still in progress shows which step was
 * running. daemon_status_file_register_fatal() knows about this double use
 * of the field and replaces the label when a real fatal is registered.
 *
 * step: label of the step now starting; a private copy is stored, so the
 *       caller keeps ownership of its string.
 *       NULL clears the stored label without saving the status file
 *       (the caller is expected to save the final status itself).
 */
void daemon_status_file_startup_step(const char *step) {
    // release the label of the previous step (or whatever was stored there)
    freez((char *)session_status.fatal.function);

    // never hand NULL to strdupz(): duplicating a NULL string is invalid
    // for strdup()-style functions, and NULL is a legitimate argument here
    session_status.fatal.function = step ? strdupz(step) : NULL;

    // persist the progress only while a step is actually in flight
    if(step != NULL)
        daemon_status_file_save(DAEMON_STATUS_NONE);
}

// --------------------------------------------------------------------------------------------------------------------
// ng_log() hook for receiving fatal message information

void daemon_status_file_register_fatal(const char *filename, const char *function, const char *message, const char *stack_trace, long line) {
static SPINLOCK spinlock = SPINLOCK_INITIALIZER;
spinlock_lock(&spinlock);

if(session_status.fatal.filename || session_status.fatal.function || session_status.fatal.message || session_status.fatal.stack_trace) {
// do not check the function, because it may have a startup step in it
if(session_status.fatal.filename || session_status.fatal.message || session_status.fatal.stack_trace) {
spinlock_unlock(&spinlock);
freez((void *)filename);
freez((void *)function);
Expand All @@ -692,6 +699,7 @@ void daemon_status_file_register_fatal(const char *filename, const char *functio
}

session_status.fatal.filename = filename;
freez((char *)session_status.fatal.function); // it may have a startup step
session_status.fatal.function = function;
session_status.fatal.message = message;
session_status.fatal.stack_trace = stack_trace;
Expand Down
1 change: 1 addition & 0 deletions src/daemon/daemon-status-file.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ void daemon_status_file_check_crash(void);

bool daemon_status_file_has_last_crashed(void);
bool daemon_status_file_was_incomplete_shutdown(void);
void daemon_status_file_startup_step(const char *step);

void daemon_status_file_register_fatal(const char *filename, const char *function, const char *message, const char *stack_trace, long line);

Expand Down
137 changes: 69 additions & 68 deletions src/daemon/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -184,15 +184,16 @@ int help(int exitcode) {
be set in this procedure to be called in all the relevant code paths.
*/

/*
 * delta_startup_time(msg) - mark the beginning of the next startup step.
 *
 * Logs how long the previous step took (when there was one) together with
 * the name of the next step, then remembers the current monotonic time and
 * msg for the following invocation, and finally reports the step to the
 * daemon status file via daemon_status_file_startup_step().
 *
 * Requirements at the expansion site:
 *  - 'last_ut' and 'prev_msg' must be visible and assignable; the macro
 *    reads and updates both without declaring them.
 *  - msg must be a string literal: it is pasted between "startup(" and ")"
 *    by adjacent string literal concatenation, which does not work with a
 *    char pointer variable.
 *  - the macro expands to a bare { ... } block, not do { ... } while(0);
 *    do not use it as the unbraced body of an if that has an else.
 */
#define delta_startup_time(msg) \
{ \
usec_t now_ut = now_monotonic_usec(); \
if(prev_msg) \
#define delta_startup_time(msg) \
{ \
usec_t now_ut = now_monotonic_usec(); \
if(prev_msg) \
netdata_log_info("NETDATA STARTUP: in %7llu ms, %s - next: %s", (now_ut - last_ut) / USEC_PER_MS, prev_msg, msg); \
else \
netdata_log_info("NETDATA STARTUP: next: %s", msg); \
last_ut = now_ut; \
prev_msg = msg; \
else \
netdata_log_info("NETDATA STARTUP: next: %s", msg); \
last_ut = now_ut; \
prev_msg = msg; \
daemon_status_file_startup_step("startup(" msg ")"); \
}

int buffer_unittest(void);
Expand Down Expand Up @@ -782,18 +783,21 @@ int netdata_main(int argc, char **argv) {

nd_profile_setup();

// start a temporary spawn server
netdata_main_spawn_server_init("init", argc, (const char **)argv);

// status and crash/update/exit detection
exit_initiated_reset();
daemon_status_file_check_crash();

// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("initialize environment");

netdata_conf_ssl();

// Get execution path before switching user to avoid permission issues
// Get the execution path before switching user to avoid permission issues
get_netdata_execution_path();

// ----------------------------------------------------------------------------------------------------------------
// data collection plugins

// prepare configuration environment variables for the plugins
set_environment_for_plugins_and_scripts();

Expand All @@ -802,13 +806,13 @@ int netdata_main(int argc, char **argv) {
fatal("Cannot cd to '%s'", netdata_configured_user_config_dir);

// ----------------------------------------------------------------------------------------------------------------
// analytics
delta_startup_time("initialize analytics");

analytics_reset();
get_system_timezone();

// ----------------------------------------------------------------------------------------------------------------
// pulse (internal netdata instrumentation)
delta_startup_time("initialize pulse");

#ifdef NETDATA_INTERNAL_CHECKS
pulse_enabled = true;
Expand All @@ -823,29 +827,23 @@ int netdata_main(int argc, char **argv) {
workers_utilization_enable();

// ----------------------------------------------------------------------------------------------------------------
// streaming, replication, functions initialization
delta_startup_time("initialize streaming and replication");

replication_initialize();
rrd_functions_inflight_init();

// --------------------------------------------------------------------
// alerts SILENCERS
// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("initialize silencers");

health_set_silencers_filename();
health_initialize_global_silencers();

// --------------------------------------------------------------------
// setup process signals

// block signals while initializing threads.
// this causes the threads to block signals.

// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("initialize signals");
nd_initialize_signals(); // setup the signals we want to use

// --------------------------------------------------------------------
// check which threads are enabled and initialize them
nd_initialize_signals();

// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("initialize static threads");

for (i = 0; static_threads[i].name != NULL ; i++) {
Expand All @@ -867,9 +865,7 @@ int netdata_main(int argc, char **argv) {
*st->global_variable = (st->enabled) ? true : false;
}

// --------------------------------------------------------------------
// create the listening sockets

// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("initialize web server");

// get the certificate and start security
Expand All @@ -884,15 +880,19 @@ int netdata_main(int argc, char **argv) {
exit(1);
}
}

// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("initialize sqlite");

if (sqlite_library_init())
fatal("Failed to initialize sqlite library");

// --------------------------------------------------------------------
// Initialize ML configuration

// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("initialize ML");

ml_init();

// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("set resource limits");

#ifdef NETDATA_INTERNAL_CHECKS
Expand All @@ -908,10 +908,14 @@ int netdata_main(int argc, char **argv) {

set_nofile_limit(&rlimit_nofile);

// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("become daemon");

// stop the old server and later start a new one under the new permissions
netdata_main_spawn_server_cleanup();

#if defined(OS_LINUX) || defined(OS_MACOS) || defined(OS_FREEBSD)
// fork, switch user, create pid file, set process priority
// fork, switch user, create the pid file, set process priority
if(become_daemon(dont_fork, user) == -1)
fatal("Cannot daemonize myself.");
#else
Expand All @@ -920,52 +924,59 @@ int netdata_main(int argc, char **argv) {

netdata_main_spawn_server_init("plugins", argc, (const char **)argv);

// init sentry
#ifdef ENABLE_SENTRY
nd_sentry_init();
// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("initialize sentry");

nd_sentry_init();
#endif

// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("initialize home");

// The "HOME" env var points to the root's home dir because Netdata starts as root. Can't use "HOME".
struct passwd *pw = getpwuid(getuid());
if (inicfg_exists(&netdata_config, CONFIG_SECTION_DIRECTORIES, "home") || !pw || !pw->pw_dir) {
netdata_configured_home_dir = inicfg_get(&netdata_config, CONFIG_SECTION_DIRECTORIES, "home", netdata_configured_home_dir);
} else {
netdata_configured_home_dir = inicfg_get(&netdata_config, CONFIG_SECTION_DIRECTORIES, "home", pw->pw_dir);
}
else
netdata_configured_home_dir = inicfg_get(&netdata_config, CONFIG_SECTION_DIRECTORIES, "home", pw->pw_dir);

nd_setenv("HOME", netdata_configured_home_dir, 1);

dyncfg_init(true);
// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("initialize dyncfg");

netdata_log_info("netdata started on pid %d.", getpid());
dyncfg_init(true);

// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("initialize threads after fork");

netdata_threads_init_after_fork((size_t)inicfg_get_size_bytes(&netdata_config, CONFIG_SECTION_GLOBAL, "pthread stack size", default_stacksize));

// initialize internal registry
// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("initialize registry");

registry_load();
cloud_conf_init_after_registry();
netdata_random_session_id_generate();

// ------------------------------------------------------------------------
// initialize rrd, registry, health, streaming, etc.

delta_startup_time("collecting system info");

struct rrdhost_system_info *system_info = rrdhost_system_info_create();
rrdhost_system_info_detect(system_info);

const char *guid = registry_get_this_machine_guid();
const char *guid = registry_get_this_machine_guid(true);
#ifdef ENABLE_SENTRY
nd_sentry_set_user(guid);
#else
UNUSED(guid);
#endif

// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("collecting system info");

struct rrdhost_system_info *system_info = rrdhost_system_info_create();
rrdhost_system_info_detect(system_info);

get_install_type(system_info);

// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("initialize RRD structures");

abort_on_fatal_disable();
Expand All @@ -974,29 +985,19 @@ int netdata_main(int argc, char **argv) {
fatal("Cannot initialize localhost instance with name '%s'.", netdata_configured_hostname);
}
abort_on_fatal_enable();
reload_host_labels();

// ------------------------------------------------------------------------
// Claim netdata agent to a cloud endpoint

// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("collect claiming info");

bearer_tokens_init();
load_claiming_state();

// ------------------------------------------------------------------------
// enable log flood protection
// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("start the static threads");

nd_log_limits_reset();

// Load host labels
delta_startup_time("collect host labels");
reload_host_labels();

// ------------------------------------------------------------------------
// spawn the threads

get_agent_event_time_median_init();
bearer_tokens_init();

delta_startup_time("start the static threads");

netdata_conf_section_web();

Expand All @@ -1013,13 +1014,12 @@ int netdata_main(int argc, char **argv) {
}
ml_start_threads();

// ------------------------------------------------------------------------
// Initialize netdata agent command serving from cli and signals

// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("initialize commands API");

commands_init();

// ----------------------------------------------------------------------------------------------------------------
delta_startup_time("ready");

usec_t ready_ut = now_monotonic_usec();
Expand Down Expand Up @@ -1057,6 +1057,7 @@ int netdata_main(int argc, char **argv) {

webrtc_initialize();

daemon_status_file_startup_step(NULL);
daemon_status_file_save(DAEMON_STATUS_RUNNING);
return 10;
}
Expand Down
2 changes: 1 addition & 1 deletion src/database/rrd.c
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ int rrd_init(const char *hostname, struct rrdhost_system_info *system_info, bool
localhost = rrdhost_create(
hostname
, registry_get_this_machine_hostname()
, registry_get_this_machine_guid()
, registry_get_this_machine_guid(true)
, os_type
, netdata_configured_timezone
, netdata_configured_abbrev_timezone
Expand Down
5 changes: 0 additions & 5 deletions src/database/rrdhost-system-info.c
Original file line number Diff line number Diff line change
Expand Up @@ -562,12 +562,7 @@ void rrdhost_system_info_to_streaming_function_array(BUFFER *wb, struct rrdhost_
void get_daemon_status_fields_from_system_info(DAEMON_STATUS_FILE *ds) {
if(ds->read_system_info) return;

struct rrdhost_system_info tmp = { 0 };
struct rrdhost_system_info *ri = (localhost && localhost->system_info) ? localhost->system_info : NULL;

if(!ri && rrdhost_system_info_detect(&tmp) == 0)
ri = &tmp;

if(!ri) {
// nothing we can do, let it be
return;
Expand Down
Loading

0 comments on commit 728e365

Please sign in to comment.