From 14dafed8f827aed171f252842e2adf5fbb5beb1d Mon Sep 17 00:00:00 2001 From: Costa Tsaousis Date: Wed, 22 Jan 2025 18:31:34 +0000 Subject: [PATCH] Pulse stream-parents (#19445) * cleanup stream handshake reasons; introduce pulse-parents * mark function params as not used * more work towards pulse parents * added 2 charts with status per node (inbound, outbound) * log the reason the receiver left, when the sender is disconnected * fix receiver exit reasons * reject a duplicate streaming request with a different hostname * log already connected on children * do not retry too frequently * fix log * insist on connecting when the parent says already connected, but it is not in the stream path * fix last commit * log already connected on parent * added streaming events charts * streaming events are available in extended pulse * archived to stale * unify aclk connection status and connection failures/disconnection reasons --- CMakeLists.txt | 2 + src/aclk/aclk.c | 271 +++++------ src/aclk/aclk.h | 34 +- src/aclk/aclk_otp.c | 207 +++++---- src/aclk/aclk_otp.h | 6 +- src/aclk/https_client.c | 279 +++++++++--- src/aclk/https_client.h | 42 +- src/aclk/mqtt_websockets/aclk_mqtt_workers.h | 1 + src/aclk/mqtt_websockets/mqtt_wss_client.c | 21 +- src/aclk/mqtt_websockets/mqtt_wss_client.h | 2 + src/claim/cloud-status.c | 1 + .../systemd-journal-annotations.c | 1 + src/daemon/pulse/pulse-parents.c | 422 ++++++++++++++++++ src/daemon/pulse/pulse-parents.h | 44 ++ src/daemon/pulse/pulse.c | 7 +- src/daemon/pulse/pulse.h | 1 + src/database/rrd.c | 1 + src/database/rrdhost.c | 3 +- src/database/sqlite/sqlite_aclk.c | 2 + src/database/sqlite/sqlite_metadata.c | 1 + src/libnetdata/socket/nd-sock.c | 26 +- src/libnetdata/socket/nd-sock.h | 3 + src/libnetdata/uuid/uuid.h | 1 + src/plugins.d/pluginsd_parser.c | 1 + src/plugins.d/pluginsd_parser.h | 4 +- src/plugins.d/pluginsd_replication.c | 11 +- .../protocol/command-chart-definition.c | 6 +- src/streaming/stream-connector.c | 63 ++- 
src/streaming/stream-handshake.c | 111 +++-- src/streaming/stream-handshake.h | 120 +++-- src/streaming/stream-parents.c | 60 ++- src/streaming/stream-receiver-connection.c | 84 ++-- src/streaming/stream-receiver-internals.h | 4 +- src/streaming/stream-receiver.c | 136 +++--- src/streaming/stream-replication-sender.c | 3 +- src/streaming/stream-sender-api.c | 10 +- src/streaming/stream-sender-commit.c | 3 + src/streaming/stream-sender-internals.h | 2 +- src/streaming/stream-sender.c | 78 ++-- src/streaming/stream-thread.c | 23 +- src/streaming/stream-thread.h | 66 +-- 41 files changed, 1478 insertions(+), 685 deletions(-) create mode 100644 src/daemon/pulse/pulse-parents.c create mode 100644 src/daemon/pulse/pulse-parents.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 1adb520ba69350..a3d31f05767b34 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1174,6 +1174,8 @@ set(DAEMON_FILES src/daemon/pulse/pulse-network.h src/daemon/pulse/pulse-db-dbengine-retention.c src/daemon/pulse/pulse-db-dbengine-retention.h + src/daemon/pulse/pulse-parents.c + src/daemon/pulse/pulse-parents.h ) set(H2O_FILES diff --git a/src/aclk/aclk.c b/src/aclk/aclk.c index 1d8e662ba6c544..61c1a4b5c4c342 100644 --- a/src/aclk/aclk.c +++ b/src/aclk/aclk.c @@ -93,7 +93,7 @@ static void aclk_ssl_keylog_cb(const SSL *ssl, const char *line) if (!ssl_log_file) ssl_log_file = fopen(ssl_log_filename, "a"); if (!ssl_log_file) { - netdata_log_error("Couldn't open ssl_log file (%s) for append.", ssl_log_filename); + netdata_log_error("ACLK: Couldn't open ssl_log file (%s) for append.", ssl_log_filename); return; } fputs(line, ssl_log_file); @@ -128,14 +128,14 @@ static int load_private_key() long bytes_read; char *private_key = read_by_filename(filename, &bytes_read); if (!private_key) { - netdata_log_error("Claimed agent cannot establish ACLK - unable to load private key '%s' failed.", filename); + netdata_log_error("ACLK: Claimed agent cannot establish ACLK - unable to load private key '%s' 
failed.", filename); return 1; } netdata_log_debug(D_ACLK, "Claimed agent loaded private key len=%ld bytes", bytes_read); BIO *key_bio = BIO_new_mem_buf(private_key, -1); if (key_bio==NULL) { - netdata_log_error("Claimed agent cannot establish ACLK - failed to create BIO for key"); + netdata_log_error("ACLK: Claimed agent cannot establish ACLK - failed to create BIO for key"); goto biofailed; } @@ -146,13 +146,13 @@ static int load_private_key() NULL, NULL); if (!aclk_dctx) { - netdata_log_error("Loading private key (from claiming) failed - no OpenSSL Decoders found"); + netdata_log_error("ACLK: Loading private key (from claiming) failed - no OpenSSL Decoders found"); goto biofailed; } // this is necesseary to avoid RSA key with wrong size if (!OSSL_DECODER_from_bio(aclk_dctx, key_bio)) { - netdata_log_error("Decoding private key (from claiming) failed - invalid format."); + netdata_log_error("ACLK: Decoding private key (from claiming) failed - invalid format."); goto biofailed; } #else @@ -166,7 +166,7 @@ static int load_private_key() } char err[512]; ERR_error_string_n(ERR_get_error(), err, sizeof(err)); - netdata_log_error("Claimed agent cannot establish ACLK - cannot create private key: %s", err); + netdata_log_error("ACLK: Claimed agent cannot establish ACLK - cannot create private key: %s", err); biofailed: freez(private_key); @@ -212,7 +212,7 @@ static int wait_till_agent_claim_ready() // We trap the impossible NULL here to keep the linter happy without using a fatal() in the code. 
const char *cloud_base_url = cloud_config_url_get(); if (cloud_base_url == NULL) { - netdata_log_error("Do not move the \"url\" out of netdata_conf_section_global_run_as_user!!"); + netdata_log_error("ACLK: Do not move the \"url\" out of netdata_conf_section_global_run_as_user!!"); return 1; } @@ -220,7 +220,7 @@ static int wait_till_agent_claim_ready() // TODO make it without malloc/free memset(&url, 0, sizeof(url_t)); if (url_parse(cloud_base_url, &url)) { - netdata_log_error("Agent is claimed but the URL in configuration key \"url\" is invalid, please fix"); + netdata_log_error("ACLK: Agent is claimed but the URL in configuration key \"url\" is invalid, please fix"); url_t_destroy(&url); sleep(5); continue; @@ -244,7 +244,7 @@ static void msg_callback(const char *topic, const void *msg, size_t msglen, int netdata_log_debug(D_ACLK, "Got Message From Broker Topic \"%s\" QOS %d", topic, qos); if (aclk_shared_state.mqtt_shutdown_msg_id > 0) { - netdata_log_error("Link is shutting down. Ignoring incoming message."); + netdata_log_error("ACLK: Link is shutting down. 
Ignoring incoming message."); return; } @@ -266,7 +266,7 @@ static void msg_callback(const char *topic, const void *msg, size_t msglen, int snprintf(filename, FN_MAX_LEN, ACLK_LOG_CONVERSATION_DIR "/%010d-rx-%s.bin", ACLK_GET_CONV_LOG_NEXT(), msgtype); logfd = open(filename, O_CREAT | O_TRUNC | O_WRONLY, S_IRUSR | S_IWUSR ); if(logfd < 0) - netdata_log_error("Error opening ACLK Conversation logfile \"%s\" for RX message.", filename); + netdata_log_error("ACLK: Error opening ACLK Conversation logfile \"%s\" for RX message.", filename); write(logfd, msg, msglen); close(logfd); #endif @@ -307,9 +307,24 @@ static int handle_connection(mqtt_wss_client client) while (service_running(SERVICE_ACLK)) { // timeout 1000 to check at least once a second // for netdata_exit - if (mqtt_wss_service(client, 1000) < 0){ + int rc = mqtt_wss_service(client, 1000); + if (rc < 0){ worker_is_busy(WORKER_ACLK_DISCONNECTED); error_report("Connection Error or Dropped"); + + if(rc == MQTT_WSS_ERR_REMOTE_CLOSED) + aclk_status_set(ACLK_STATUS_OFFLINE_CLOSED_BY_REMOTE); + else if(rc == MQTT_WSS_ERR_PROTO_MQTT) + aclk_status_set(ACLK_STATUS_OFFLINE_MQTT_PROTOCOL_ERROR); + else if(rc == MQTT_WSS_ERR_PROTO_WS) + aclk_status_set(ACLK_STATUS_OFFLINE_WS_PROTOCOL_ERROR); + else if(rc == MQTT_WSS_ERR_MSG_TOO_BIG) + aclk_status_set(ACLK_STATUS_OFFLINE_MESSAGE_TOO_BIG); + else if(rc == MQTT_WSS_ERR_POLL_FAILED) + aclk_status_set(ACLK_STATUS_OFFLINE_POLL_ERROR); + else /* if(rc == MQTT_WSS_ERR_CONN_DROP) */ + aclk_status_set(ACLK_STATUS_OFFLINE_SOCKET_ERROR); + return 1; } @@ -319,19 +334,23 @@ static int handle_connection(mqtt_wss_client client) case ACLK_CLOUD_DISCONNECT: worker_is_busy(WORKER_ACLK_CMD_DISCONNECT); reason = "cloud request"; + aclk_status_set(ACLK_STATUS_OFFLINE_CLOUD_REQUESTED_DISCONNECT); break; case ACLK_PING_TIMEOUT: worker_is_busy(WORKER_ACLK_CMD_TIMEOUT); reason = "ping timeout"; schedule_node_update = true; + aclk_status_set(ACLK_STATUS_OFFLINE_PING_TIMEOUT); break; case 
ACLK_RELOAD_CONF: worker_is_busy(WORKER_ACLK_CMD_RELOAD_CONF); reason = "reclaim"; + aclk_status_set(ACLK_STATUS_OFFLINE_RELOADING_CONFIG); break; default: worker_is_busy(WORKER_ACLK_CMD_UNKNOWN); reason = "unknown"; + aclk_status_set(ACLK_STATUS_OFFLINE); break; } @@ -352,13 +371,13 @@ static inline void mqtt_connected_actions(mqtt_wss_client client) char *topic = (char*)aclk_get_topic(ACLK_TOPICID_COMMAND); if (!topic) - netdata_log_error("Unable to fetch topic for COMMAND (to subscribe)"); + netdata_log_error("ACLK: Unable to fetch topic for COMMAND (to subscribe)"); else mqtt_wss_subscribe(client, topic, 1); topic = (char*)aclk_get_topic(ACLK_TOPICID_CMD_NG_V1); if (!topic) - netdata_log_error("Unable to fetch topic for protobuf COMMAND (to subscribe)"); + netdata_log_error("ACLK: Unable to fetch topic for protobuf COMMAND (to subscribe)"); else mqtt_wss_subscribe(client, topic, 1); @@ -384,7 +403,7 @@ void aclk_graceful_disconnect(mqtt_wss_client client) time_t t = now_monotonic_sec(); while (!mqtt_wss_service(client, 100)) { if (now_monotonic_sec() - t >= 2) { - netdata_log_error("Wasn't able to gracefully shutdown ACLK in time!"); + netdata_log_error("ACLK: Wasn't able to gracefully shutdown ACLK in time!"); break; } if (aclk_shared_state.mqtt_shutdown_msg_rcvd) { @@ -448,6 +467,7 @@ static int aclk_block_till_recon_allowed() { "Wait before attempting to reconnect in %.3f seconds", recon_delay / (float)MSEC_PER_SEC); // we want to wake up from time to time to check netdata_exit + worker_is_busy(WORKER_ACLK_WAITING_TO_CONNECT); while (recon_delay) { if (!service_running(SERVICE_ACLK)) @@ -460,6 +480,8 @@ static int aclk_block_till_recon_allowed() { sleep_usec(recon_delay * USEC_PER_MS); recon_delay = 0; } + + worker_is_busy(WORKER_ACLK_CONNECT); return !service_running(SERVICE_ACLK); } @@ -480,72 +502,100 @@ static int aclk_get_transport_idx(aclk_env_t *env) { } #endif -ACLK_STATUS aclk_status = ACLK_STATUS_NONE; +ACLK_STATUS aclk_status = 
ACLK_STATUS_OFFLINE; const char *aclk_status_to_string(void) { + if(aclk_status == ACLK_STATUS_CONNECTED) + return "connected"; + + if((int)aclk_status < (int)ND_SOCK_ERR_MAX) + return ND_SOCK_ERROR_2str((ND_SOCK_ERROR)aclk_status); + + if((int)aclk_status < (int)HTTPS_CLIENT_RESP_MAX) + return https_client_resp_t_2str((https_client_resp_t)aclk_status); + switch(aclk_status) { case ACLK_STATUS_CONNECTED: return "connected"; - case ACLK_STATUS_NONE: - return "none"; + case ACLK_STATUS_OFFLINE: + return "offline"; case ACLK_STATUS_DISABLED: return "disabled"; - case ACLK_STATUS_NO_CLOUD_URL: - return "no_cloud_url"; - - case ACLK_STATUS_INVALID_CLOUD_URL: - return "invalid_cloud_url"; - - case ACLK_STATUS_NOT_CLAIMED: - return "not_claimed"; - - case ACLK_STATUS_ENV_ENDPOINT_UNREACHABLE: - return "env_endpoint_unreachable"; - - case ACLK_STATUS_ENV_RESPONSE_NOT_200: - return "env_response_not_200"; - - case ACLK_STATUS_ENV_RESPONSE_EMPTY: - return "env_response_empty"; + case ACLK_STATUS_CANT_CONNECT_NO_CLOUD_URL: + return "configuration error, no cloud url"; - case ACLK_STATUS_ENV_RESPONSE_NOT_JSON: - return "env_response_not_json"; - - case ACLK_STATUS_ENV_FAILED: - return "env_failed"; + case ACLK_STATUS_CANT_CONNECT_INVALID_CLOUD_URL: + return "configuration error, invalid cloud url"; case ACLK_STATUS_BLOCKED: - return "blocked"; + return "agent is blocked"; case ACLK_STATUS_NO_OLD_PROTOCOL: - return "no_old_protocol"; + return "can't connect, old protocol not supported"; case ACLK_STATUS_NO_PROTOCOL_CAPABILITY: - return "no_protocol_capability"; + return "can't connect, protocol capability not supported"; case ACLK_STATUS_INVALID_ENV_AUTH_URL: - return "invalid_env_auth_url"; + return "can't connect, invalid /env auth url"; case ACLK_STATUS_INVALID_ENV_TRANSPORT_IDX: - return "invalid_env_transport_idx"; + return "can't connect, invalid /env transport idx"; case ACLK_STATUS_INVALID_ENV_TRANSPORT_URL: - return "invalid_env_transport_url"; - - case 
ACLK_STATUS_INVALID_OTP: - return "invalid_otp"; + return "can't connect, invalid /env transport URL"; case ACLK_STATUS_NO_LWT_TOPIC: - return "no_lwt_topic"; + return "can't connect, no LWT topic"; + + case ACLK_STATUS_OFFLINE_CLOUD_REQUESTED_DISCONNECT: + return "disconnected, due to remote request"; + + case ACLK_STATUS_OFFLINE_PING_TIMEOUT: + return "disconnected, ping timed out"; + + case ACLK_STATUS_OFFLINE_RELOADING_CONFIG: + return "disconnected, to reload config"; + + case ACLK_STATUS_OFFLINE_POLL_ERROR: + return "disconnected, poll() failed"; + + case ACLK_STATUS_OFFLINE_CLOSED_BY_REMOTE: + return "disconnected, closed by remote end"; + + case ACLK_STATUS_OFFLINE_SOCKET_ERROR: + return "disconnected, socket error"; + + case ACLK_STATUS_OFFLINE_MQTT_PROTOCOL_ERROR: + return "disconnected, MQTT protocol error"; + + case ACLK_STATUS_OFFLINE_WS_PROTOCOL_ERROR: + return "disconnected, WebSockets protocol error"; + + case ACLK_STATUS_OFFLINE_MESSAGE_TOO_BIG: + return "disconnected, message too big"; default: - return "unknown"; + return "unknown status"; } } +void aclk_status_set(ACLK_STATUS status) { + aclk_status = status; + + ND_LOG_STACK lgs[] = { + ND_LOG_FIELD_UUID(NDF_MESSAGE_ID, &aclk_connection_msgid), + ND_LOG_FIELD_END(), + }; + ND_LOG_STACK_PUSH(lgs); + + nd_log(NDLS_DAEMON, status == ACLK_STATUS_CONNECTED ? 
NDLP_INFO : NDLP_ERR, + "Netdata Cloud, ACLK connection status: %s", aclk_status_to_string()); +} + const char *aclk_cloud_base_url = NULL; /* Attempts to make a connection to MQTT broker over WSS @@ -562,7 +612,7 @@ const char *aclk_cloud_base_url = NULL; #endif static int aclk_attempt_to_connect(mqtt_wss_client client) { - int ret; + https_client_resp_t rc; url_t base_url; @@ -575,23 +625,22 @@ static int aclk_attempt_to_connect(mqtt_wss_client client) while (service_running(SERVICE_ACLK)) { aclk_cloud_base_url = cloud_config_url_get(); if (aclk_cloud_base_url == NULL) { - error_report("Do not move the \"url\" out of netdata_conf_section_global_run_as_user!!"); - aclk_status = ACLK_STATUS_NO_CLOUD_URL; + error_report("ACLK: cloud base URL is empty."); + aclk_status_set(ACLK_STATUS_CANT_CONNECT_NO_CLOUD_URL); return -1; } if (aclk_block_till_recon_allowed()) { - aclk_status = ACLK_STATUS_BLOCKED; + aclk_status_set(ACLK_STATUS_BLOCKED); return 1; } - nd_log(NDLS_DAEMON, NDLP_DEBUG, - "Attempting connection now"); + nd_log(NDLS_DAEMON, NDLP_DEBUG, "ACLK: attempting to connect now"); memset(&base_url, 0, sizeof(url_t)); if (url_parse(aclk_cloud_base_url, &base_url)) { - aclk_status = ACLK_STATUS_INVALID_CLOUD_URL; - error_report("ACLK base URL configuration key could not be parsed. 
Will retry in %d seconds.", CLOUD_BASE_URL_READ_RETRY); + aclk_status_set(ACLK_STATUS_CANT_CONNECT_INVALID_CLOUD_URL); + error_report("ACLK: base URL '%s' cannot be parsed.", aclk_cloud_base_url); sleep(CLOUD_BASE_URL_READ_RETRY); url_t_destroy(&base_url); continue; @@ -618,79 +667,46 @@ static int aclk_attempt_to_connect(mqtt_wss_client client) } aclk_env = callocz(1, sizeof(aclk_env_t)); - ret = aclk_get_env(aclk_env, base_url.host, base_url.port, &fallback_ipv4); + rc = aclk_get_env(aclk_env, base_url.host, base_url.port, &fallback_ipv4); url_t_destroy(&base_url); - if(ret) switch(ret) { - case 1: - aclk_status = ACLK_STATUS_NOT_CLAIMED; - error_report("Failed to Get ACLK environment (agent is not claimed)"); - // delay handled by aclk_block_till_recon_allowed - continue; - - case 2: - aclk_status = ACLK_STATUS_ENV_ENDPOINT_UNREACHABLE; - error_report("Failed to Get ACLK environment (cannot contact ENV endpoint)"); - // delay handled by aclk_block_till_recon_allowed - continue; - - case 3: - aclk_status = ACLK_STATUS_ENV_RESPONSE_NOT_200; - error_report("Failed to Get ACLK environment (ENV response code is not 200)"); - // delay handled by aclk_block_till_recon_allowed - continue; - - case 4: - aclk_status = ACLK_STATUS_ENV_RESPONSE_EMPTY; - error_report("Failed to Get ACLK environment (ENV response is empty)"); - // delay handled by aclk_block_till_recon_allowed - continue; - - case 5: - aclk_status = ACLK_STATUS_ENV_RESPONSE_NOT_JSON; - error_report("Failed to Get ACLK environment (ENV response is not JSON)"); - // delay handled by aclk_block_till_recon_allowed - continue; - - default: - aclk_status = ACLK_STATUS_ENV_FAILED; - error_report("Failed to Get ACLK environment (unknown error)"); - // delay handled by aclk_block_till_recon_allowed - continue; + if (rc != HTTPS_CLIENT_RESP_OK) { + aclk_status_set((ACLK_STATUS)rc); + continue; } if (!service_running(SERVICE_ACLK)) { - aclk_status = ACLK_STATUS_DISABLED; + aclk_status_set(ACLK_STATUS_DISABLED); return 
1; } if (aclk_env->encoding != ACLK_ENC_PROTO) { - aclk_status = ACLK_STATUS_NO_OLD_PROTOCOL; - error_report("This agent can only use the new cloud protocol but cloud requested old one."); + aclk_status_set(ACLK_STATUS_NO_OLD_PROTOCOL); + error_report("ACLK: this agent can only use the new cloud protocol but cloud requested old one."); continue; } if (!aclk_env_has_capa("proto")) { - aclk_status = ACLK_STATUS_NO_PROTOCOL_CAPABILITY; - error_report("Can't use encoding=proto without at least \"proto\" capability."); + aclk_status_set(ACLK_STATUS_NO_PROTOCOL_CAPABILITY); + error_report("ACLK: can't use encoding=proto without at least \"proto\" capability."); continue; } nd_log(NDLS_DAEMON, NDLP_DEBUG, - "New ACLK protobuf protocol negotiated successfully (/env response)."); + "ACLK: new ACLK protobuf protocol negotiated successfully (/env response)."); memset(&auth_url, 0, sizeof(url_t)); if (url_parse(aclk_env->auth_endpoint, &auth_url)) { - aclk_status = ACLK_STATUS_INVALID_ENV_AUTH_URL; - error_report("Parsing URL returned by env endpoint for authentication failed. \"%s\"", aclk_env->auth_endpoint); + aclk_status_set(ACLK_STATUS_INVALID_ENV_AUTH_URL); + error_report("ACLK: parsing URL returned by env endpoint for authentication failed. 
\"%s\"", aclk_env->auth_endpoint); url_t_destroy(&auth_url); continue; } - ret = aclk_get_mqtt_otp(aclk_private_key, (char **)&mqtt_conn_params.clientid, (char **)&mqtt_conn_params.username, (char **)&mqtt_conn_params.password, &auth_url, &fallback_ipv4); + rc = aclk_get_mqtt_otp(aclk_private_key, (char **)&mqtt_conn_params.clientid, (char **)&mqtt_conn_params.username, (char **)&mqtt_conn_params.password, &auth_url, &fallback_ipv4); url_t_destroy(&auth_url); - if (ret) { - aclk_status = ACLK_STATUS_INVALID_OTP; - error_report("Error passing Challenge/Response to get OTP"); + if (rc != HTTPS_CLIENT_RESP_OK) { + aclk_status_set((ACLK_STATUS)rc); + error_report("ACLK: error passing Challenge/Response to get OTP"); continue; } @@ -699,23 +715,23 @@ static int aclk_attempt_to_connect(mqtt_wss_client client) mqtt_conn_params.will_topic = aclk_get_topic(ACLK_TOPICID_AGENT_CONN); if (!mqtt_conn_params.will_topic) { - aclk_status = ACLK_STATUS_NO_LWT_TOPIC; - error_report("Couldn't get LWT topic. Will not send LWT."); + aclk_status_set(ACLK_STATUS_NO_LWT_TOPIC); + error_report("ACLK: couldn't get LWT topic. 
Will not send LWT."); continue; } // Do the MQTT connection - ret = aclk_get_transport_idx(aclk_env); - if (ret < 0) { - aclk_status = ACLK_STATUS_INVALID_ENV_TRANSPORT_IDX; - error_report("Cloud /env endpoint didn't return any transport usable by this Agent."); + int trp = aclk_get_transport_idx(aclk_env); + if (trp < 0) { + aclk_status_set(ACLK_STATUS_INVALID_ENV_TRANSPORT_IDX); + error_report("ACLK: cloud /env endpoint didn't return any transport usable by this agent."); continue; } memset(&mqtt_url, 0, sizeof(url_t)); - if (url_parse(aclk_env->transports[ret]->endpoint, &mqtt_url)){ - aclk_status = ACLK_STATUS_INVALID_ENV_TRANSPORT_URL; - error_report("Failed to parse target URL for /env trp idx %d \"%s\"", ret, aclk_env->transports[ret]->endpoint); + if (url_parse(aclk_env->transports[trp]->endpoint, &mqtt_url)){ + aclk_status_set(ACLK_STATUS_INVALID_ENV_TRANSPORT_URL); + error_report("ACLK: failed to parse target URL for /env trp idx %d \"%s\"", trp, aclk_env->transports[trp]->endpoint); url_t_destroy(&mqtt_url); continue; } @@ -728,10 +744,10 @@ static int aclk_attempt_to_connect(mqtt_wss_client client) mqtt_conn_params.will_msg = aclk_generate_lwt(&mqtt_conn_params.will_msg_len); #ifdef ACLK_DISABLE_CHALLENGE - ret = mqtt_wss_connect(client, base_url.host, base_url.port, &mqtt_conn_params, ACLK_SSL_FLAGS, &proxy_conf); + int mqtt_rc = mqtt_wss_connect(client, base_url.host, base_url.port, &mqtt_conn_params, ACLK_SSL_FLAGS, &proxy_conf); url_t_destroy(&base_url); #else - ret = mqtt_wss_connect(client, mqtt_url.host, mqtt_url.port, &mqtt_conn_params, ACLK_SSL_FLAGS, &proxy_conf, &fallback_ipv4); + int mqtt_rc = mqtt_wss_connect(client, mqtt_url.host, mqtt_url.port, &mqtt_conn_params, ACLK_SSL_FLAGS, &proxy_conf, &fallback_ipv4); url_t_destroy(&mqtt_url); freez((char*)mqtt_conn_params.clientid); @@ -744,20 +760,20 @@ static int aclk_attempt_to_connect(mqtt_wss_client client) freez((char*)proxy_conf.username); freez((char*)proxy_conf.password); - if (!ret) { + if 
(!mqtt_rc) { last_conn_time_mqtt = now_realtime_sec(); - nd_log(NDLS_DAEMON, NDLP_INFO, "ACLK connection successfully established"); - aclk_status = ACLK_STATUS_CONNECTED; + nd_log(NDLS_DAEMON, NDLP_INFO, "ACLK: connection successfully established"); + aclk_status_set(ACLK_STATUS_CONNECTED); nd_log(NDLS_ACCESS, NDLP_INFO, "ACLK CONNECTED"); mqtt_connected_actions(client); fallback_ipv4 = false; return 0; } - error_report("Connect failed"); + error_report("ACLK: connection failed"); } - aclk_status = ACLK_STATUS_DISABLED; + aclk_status_set(ACLK_STATUS_DISABLED); return 1; } @@ -810,11 +826,12 @@ void *aclk_main(void *ptr) worker_register_job_name(WORKER_ACLK_CPT_UNKNOWN, "cpt unknown"); worker_register_job_name(WORKER_ACLK_SEND_FRAGMENT, "send fragment"); worker_register_job_name(WORKER_ACLK_MSG_CALLBACK, "msg callback"); + worker_register_job_name(WORKER_ACLK_WAITING_TO_CONNECT, "conn wait"); ACLK_PROXY_TYPE proxy_type; aclk_get_proxy(&proxy_type); if (proxy_type == PROXY_TYPE_SOCKS5) { - netdata_log_error("SOCKS5 proxy is not supported by ACLK-NG yet."); + netdata_log_error("ACLK: SOCKS5 proxy is not supported by ACLK-NG yet."); static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; return NULL; } @@ -826,7 +843,7 @@ void *aclk_main(void *ptr) goto exit; if (!((mqttwss_client = mqtt_wss_new(msg_callback, puback_callback)))) { - netdata_log_error("Couldn't initialize MQTT_WSS network library"); + netdata_log_error("ACLK: Couldn't initialize MQTT_WSS network library"); goto exit; } @@ -849,7 +866,7 @@ void *aclk_main(void *ptr) // Keep reconnecting and talking until our time has come // and the Grim Reaper (netdata_exit) calls - netdata_log_info("Starting ACLK query event loop"); + netdata_log_info("ACLK: Starting ACLK query event loop"); aclk_query_init(mqttwss_client); do { worker_is_busy(WORKER_ACLK_CONNECT); @@ -929,7 +946,7 @@ void aclk_host_state_update(RRDHOST *host, int cmd, int queryable) int ret = get_node_id(&host->host_id.uuid, &node_id.uuid); if (ret > 0) 
{ // this means we were not able to check if node_id already present - netdata_log_error("Unable to check for node_id. Ignoring the host state update."); + netdata_log_error("ACLK: Unable to check for node_id. Ignoring the host state update."); return; } if (ret < 0) { diff --git a/src/aclk/aclk.h b/src/aclk/aclk.h index cd0c636d5b39e4..bdf1cd70866eb6 100644 --- a/src/aclk/aclk.h +++ b/src/aclk/aclk.h @@ -7,6 +7,8 @@ #include "aclk_util.h" //#include "aclk_rrdhost_state.h" +#include "https_client.h" + // How many MQTT PUBACKs we need to get to consider connection // stable for the purposes of TBEB (truncated binary exponential backoff) #define ACLK_PUBACKS_CONN_STABLE 3 @@ -18,26 +20,35 @@ typedef enum { ACLK_PING_TIMEOUT = 3 } ACLK_DISCONNECT_ACTION; -typedef enum __attribute__((packed)) { +typedef enum { ACLK_STATUS_CONNECTED = 0, - ACLK_STATUS_NONE, + + // ND_SOCK_ERR_XXX is included here + // HTTPS_CLIENT_RESP_XXX is included here + + ACLK_STATUS_OFFLINE = HTTPS_CLIENT_RESP_MAX, ACLK_STATUS_DISABLED, - ACLK_STATUS_NO_CLOUD_URL, - ACLK_STATUS_INVALID_CLOUD_URL, - ACLK_STATUS_NOT_CLAIMED, - ACLK_STATUS_ENV_ENDPOINT_UNREACHABLE, - ACLK_STATUS_ENV_RESPONSE_NOT_200, - ACLK_STATUS_ENV_RESPONSE_EMPTY, - ACLK_STATUS_ENV_RESPONSE_NOT_JSON, - ACLK_STATUS_ENV_FAILED, + ACLK_STATUS_CANT_CONNECT_NO_CLOUD_URL, + ACLK_STATUS_CANT_CONNECT_INVALID_CLOUD_URL, ACLK_STATUS_BLOCKED, ACLK_STATUS_NO_OLD_PROTOCOL, ACLK_STATUS_NO_PROTOCOL_CAPABILITY, ACLK_STATUS_INVALID_ENV_AUTH_URL, ACLK_STATUS_INVALID_ENV_TRANSPORT_IDX, ACLK_STATUS_INVALID_ENV_TRANSPORT_URL, - ACLK_STATUS_INVALID_OTP, ACLK_STATUS_NO_LWT_TOPIC, + + // disconnection reasons + ACLK_STATUS_OFFLINE_CLOUD_REQUESTED_DISCONNECT, + ACLK_STATUS_OFFLINE_PING_TIMEOUT, + ACLK_STATUS_OFFLINE_RELOADING_CONFIG, + ACLK_STATUS_OFFLINE_POLL_ERROR, + ACLK_STATUS_OFFLINE_CLOSED_BY_REMOTE, + ACLK_STATUS_OFFLINE_SOCKET_ERROR, + ACLK_STATUS_OFFLINE_MQTT_PROTOCOL_ERROR, + ACLK_STATUS_OFFLINE_WS_PROTOCOL_ERROR, + 
ACLK_STATUS_OFFLINE_MESSAGE_TOO_BIG, + } ACLK_STATUS; extern ACLK_STATUS aclk_status; @@ -95,5 +106,6 @@ void add_aclk_host_labels(void); void aclk_queue_node_info(RRDHOST *host, bool immediate); struct mqtt_wss_stats aclk_statistics(void); +void aclk_status_set(ACLK_STATUS status); #endif /* ACLK_H */ diff --git a/src/aclk/aclk_otp.c b/src/aclk/aclk_otp.c index 3e4f7835ad786f..0d1e69a2f73292 100644 --- a/src/aclk/aclk_otp.c +++ b/src/aclk/aclk_otp.c @@ -4,8 +4,8 @@ #include "aclk_util.h" #include "aclk.h" -static int aclk_https_request(https_req_t *request, https_req_response_t *response, bool *fallback_ipv4) { - int rc; +static https_client_resp_t aclk_https_request(https_req_t *request, https_req_response_t *response, bool *fallback_ipv4) { + https_client_resp_t rc; // wrapper for ACLK only which loads ACLK specific proxy settings // then only calls https_request struct mqtt_wss_proxy proxy_conf = { .host = NULL, .port = 0, .username = NULL, .password = NULL, .type = MQTT_WSS_DIRECT }; @@ -33,7 +33,7 @@ struct auth_data { #define PARSE_ENV_JSON_CHK_TYPE(it, type, name) \ if (json_object_get_type(json_object_iter_peek_value(it)) != type) { \ - netdata_log_error("value of key \"%s\" should be %s", name, #type); \ + netdata_log_error("ACLK: value of key \"%s\" should be %s", name, #type); \ goto exit; \ } @@ -50,7 +50,7 @@ static int parse_passwd_response(const char *json_str, struct auth_data *auth) { json = json_tokener_parse(json_str); if (!json) { - netdata_log_error("JSON-C failed to parse the payload of http response of /env endpoint"); + netdata_log_error("ACLK: JSON-C failed to parse the payload of http response of /env endpoint"); return 1; } @@ -83,26 +83,26 @@ static int parse_passwd_response(const char *json_str, struct auth_data *auth) { PARSE_ENV_JSON_CHK_TYPE(&it, json_type_array, JSON_KEY_TOPICS) if (aclk_generate_topic_cache(json_object_iter_peek_value(&it))) { - netdata_log_error("Failed to generate topic cache!"); + netdata_log_error("ACLK: 
Failed to generate topic cache!"); goto exit; } json_object_iter_next(&it); continue; } - netdata_log_error("Unknown key \"%s\" in passwd response payload. Ignoring", json_object_iter_peek_name(&it)); + netdata_log_error("ACLK: Unknown key \"%s\" in passwd response payload. Ignoring", json_object_iter_peek_name(&it)); json_object_iter_next(&it); } if (!auth->client_id) { - netdata_log_error(JSON_KEY_CLIENTID " is compulsory key in /password response"); + netdata_log_error("ACLK: " JSON_KEY_CLIENTID " is compulsory key in /password response"); goto exit; } if (!auth->passwd) { - netdata_log_error(JSON_KEY_PASS " is compulsory in /password response"); + netdata_log_error("ACLK: " JSON_KEY_PASS " is compulsory in /password response"); goto exit; } if (!auth->username) { - netdata_log_error(JSON_KEY_USER " is compulsory in /password response"); + netdata_log_error("ACLK: " JSON_KEY_USER " is compulsory in /password response"); goto exit; } @@ -121,11 +121,11 @@ static int parse_passwd_response(const char *json_str, struct auth_data *auth) { static const char *get_json_str_by_path(json_object *json, const char *path) { json_object *ptr; if (json_pointer_get(json, path, &ptr)) { - netdata_log_error("Missing compulsory key \"%s\" in error response", path); + netdata_log_error("ACLK: Missing compulsory key \"%s\" in error response", path); return NULL; } if (json_object_get_type(ptr) != json_type_string) { - netdata_log_error("Value of Key \"%s\" in error response should be string", path); + netdata_log_error("ACLK: Value of Key \"%s\" in error response should be string", path); return NULL; } return json_object_get_string(ptr); @@ -142,7 +142,7 @@ static int aclk_parse_otp_error(const char *json_str) { json = json_tokener_parse(json_str); if (!json) { - netdata_log_error("JSON-C failed to parse the payload of http response of /env endpoint"); + netdata_log_error("ACLK: JSON-C failed to parse the payload of http response of /env endpoint"); return 1; } @@ -158,7 +158,7 @@ 
static int aclk_parse_otp_error(const char *json_str) { // optional field if (!json_pointer_get(json, "/" JSON_KEY_ERTRY, &ptr)) { if (json_object_get_type(ptr) != json_type_boolean) { - netdata_log_error("Error response Key " "/" JSON_KEY_ERTRY " should be of boolean type"); + netdata_log_error("ACLK: Error response Key " "/" JSON_KEY_ERTRY " should be of boolean type"); goto exit; } block_retry = json_object_get_boolean(ptr); @@ -167,7 +167,7 @@ static int aclk_parse_otp_error(const char *json_str) { // optional field if (!json_pointer_get(json, "/" JSON_KEY_EDELAY, &ptr)) { if (json_object_get_type(ptr) != json_type_int) { - netdata_log_error("Error response Key " "/" JSON_KEY_EDELAY " should be of integer type"); + netdata_log_error("ACLK: Error response Key " "/" JSON_KEY_EDELAY " should be of integer type"); goto exit; } backoff = json_object_get_int(ptr); @@ -179,7 +179,7 @@ static int aclk_parse_otp_error(const char *json_str) { if (backoff > 0) aclk_block_until = now_monotonic_sec() + backoff; - netdata_log_error("Cloud returned EC=\"%s\", Msg-Key:\"%s\", Msg:\"%s\", BlockRetry:%s, Backoff:%ds (-1 unset by cloud)", ec, ek, emsg, block_retry > 0 ? "true" : "false", backoff); + netdata_log_error("ACLK: Cloud returned EC=\"%s\", Msg-Key:\"%s\", Msg:\"%s\", BlockRetry:%s, Backoff:%ds (-1 unset by cloud)", ec, ek, emsg, block_retry > 0 ? 
"true" : "false", backoff); rc = 0; exit: json_object_put(json); @@ -200,7 +200,7 @@ static int aclk_parse_otp_error(const char *json_str) { json = json_tokener_parse(json_str); if (!json) { - netdata_log_error("JSON-C failed to parse the payload of http response of /env endpoint"); + netdata_log_error("ACLK: JSON-C failed to parse the payload of http response of /env endpoint"); return 1; } @@ -231,7 +231,7 @@ static int aclk_parse_otp_error(const char *json_str) { } if (!strcmp(json_object_iter_peek_name(&it), JSON_KEY_EDELAY)) { if (json_object_get_type(json_object_iter_peek_value(&it)) != json_type_int) { - netdata_log_error("value of key " JSON_KEY_EDELAY " should be integer"); + netdata_log_error("ACLK: value of key " JSON_KEY_EDELAY " should be integer"); goto exit; } @@ -241,7 +241,7 @@ static int aclk_parse_otp_error(const char *json_str) { } if (!strcmp(json_object_iter_peek_name(&it), JSON_KEY_ERTRY)) { if (json_object_get_type(json_object_iter_peek_value(&it)) != json_type_boolean) { - netdata_log_error("value of key " JSON_KEY_ERTRY " should be integer"); + netdata_log_error("ACLK: value of key " JSON_KEY_ERTRY " should be integer"); goto exit; } @@ -249,7 +249,7 @@ static int aclk_parse_otp_error(const char *json_str) { json_object_iter_next(&it); continue; } - netdata_log_error("Unknown key \"%s\" in error response payload. Ignoring", json_object_iter_peek_name(&it)); + netdata_log_error("ACLK: Unknown key \"%s\" in error response payload. Ignoring", json_object_iter_peek_name(&it)); json_object_iter_next(&it); } @@ -259,7 +259,7 @@ static int aclk_parse_otp_error(const char *json_str) { if (backoff > 0) aclk_block_until = now_monotonic_sec() + backoff; - netdata_log_error("Cloud returned EC=\"%s\", Msg-Key:\"%s\", Msg:\"%s\", BlockRetry:%s, Backoff:%ds (-1 unset by cloud)", ec, ek, emsg, block_retry > 0 ? 
"true" : "false", backoff); + netdata_log_error("ACLK: Cloud returned EC=\"%s\", Msg-Key:\"%s\", Msg:\"%s\", BlockRetry:%s, Backoff:%ds (-1 unset by cloud)", ec, ek, emsg, block_retry > 0 ? "true" : "false", backoff); rc = 0; exit: json_object_put(json); @@ -271,9 +271,9 @@ static int aclk_parse_otp_error(const char *json_str) { #define CHALLENGE_LEN_BASE64 344 #define OTP_URL_PREFIX "/api/v1/auth/node/" -int aclk_get_otp_challenge(url_t *target, const char *agent_id, unsigned char **challenge, int *challenge_bytes, bool *fallback_ipv4) +static https_client_resp_t aclk_get_otp_challenge(url_t *target, const char *agent_id, unsigned char **challenge, int *challenge_bytes, bool *fallback_ipv4) { - int rc = 1; + https_client_resp_t rc; https_req_t req = HTTPS_REQ_T_INITIALIZER; https_req_response_t resp = HTTPS_REQ_RESPONSE_T_INITIALIZER; @@ -284,13 +284,15 @@ int aclk_get_otp_challenge(url_t *target, const char *agent_id, unsigned char ** buffer_sprintf(url, "%s/node/%s/challenge", target->path, agent_id); req.url = (char *)buffer_tostring(url); - if (aclk_https_request(&req, &resp, fallback_ipv4)) { - netdata_log_error("ACLK_OTP Challenge failed"); + rc = aclk_https_request(&req, &resp, fallback_ipv4); + if (rc != HTTPS_CLIENT_RESP_OK) { + netdata_log_error("ACLK: OTP Challenge failed"); buffer_free(url); - return 1; + return rc; } if (resp.http_code != 200) { - netdata_log_error("ACLK_OTP Challenge HTTP code not 200 OK (got %d)", resp.http_code); + rc = HTTPS_CLIENT_RESP_OTP_CHALLENGE_NOT_200; + netdata_log_error("ACLK: OTP Challenge HTTP code not 200 OK (got %d)", resp.http_code); buffer_free(url); if (resp.payload_size) aclk_parse_otp_error(resp.payload); @@ -298,29 +300,32 @@ int aclk_get_otp_challenge(url_t *target, const char *agent_id, unsigned char ** } buffer_free(url); - netdata_log_info("ACLK_OTP Got Challenge from Cloud"); - json_object *json = json_tokener_parse(resp.payload); if (!json) { - netdata_log_error("Couldn't parse HTTP GET challenge 
payload"); + rc = HTTPS_CLIENT_RESP_OTP_CHALLENGE_INVALID; + netdata_log_error("ACLK: couldn't parse HTTP GET challenge payload"); goto cleanup_resp; } json_object *challenge_json; if (!json_object_object_get_ex(json, "challenge", &challenge_json)) { - netdata_log_error("No key named \"challenge\" in the returned JSON"); + rc = HTTPS_CLIENT_RESP_OTP_CHALLENGE_INVALID; + netdata_log_error("ACLK: No key named \"challenge\" in the returned JSON"); goto cleanup_json; } if (!json_object_is_type(challenge_json, json_type_string)) { - netdata_log_error("\"challenge\" is not a string JSON type"); + rc = HTTPS_CLIENT_RESP_OTP_CHALLENGE_INVALID; + netdata_log_error("ACLK: \"challenge\" is not a string JSON type"); goto cleanup_json; } const char *challenge_base64; if (!((challenge_base64 = json_object_get_string(challenge_json)))) { - netdata_log_error("Failed to extract challenge from JSON object"); + rc = HTTPS_CLIENT_RESP_OTP_CHALLENGE_INVALID; + netdata_log_error("ACLK: Failed to extract challenge from JSON object"); goto cleanup_json; } if (strlen(challenge_base64) != CHALLENGE_LEN_BASE64) { - netdata_log_error("Received Challenge has unexpected length of %zu (expected %d)", strlen(challenge_base64), CHALLENGE_LEN_BASE64); + rc = HTTPS_CLIENT_RESP_OTP_CHALLENGE_INVALID; + netdata_log_error("ACLK: Received Challenge has unexpected length of %zu (expected %d)", strlen(challenge_base64), CHALLENGE_LEN_BASE64); goto cleanup_json; } @@ -328,12 +333,14 @@ int aclk_get_otp_challenge(url_t *target, const char *agent_id, unsigned char ** *challenge_bytes = netdata_base64_decode(*challenge, (const unsigned char *) challenge_base64, CHALLENGE_LEN_BASE64); if (*challenge_bytes != CHALLENGE_LEN) { - netdata_log_error("Unexpected challenge length of %d instead of %d", *challenge_bytes, CHALLENGE_LEN); + rc = HTTPS_CLIENT_RESP_OTP_CHALLENGE_INVALID; + netdata_log_error("ACLK: Unexpected challenge length of %d instead of %d", *challenge_bytes, CHALLENGE_LEN); freez(*challenge); 
*challenge = NULL; goto cleanup_json; } - rc = 0; + + rc = HTTPS_CLIENT_RESP_OK; cleanup_json: json_object_put(json); @@ -342,9 +349,9 @@ int aclk_get_otp_challenge(url_t *target, const char *agent_id, unsigned char ** return rc; } -int aclk_send_otp_response(const char *agent_id, const unsigned char *response, int response_bytes, url_t *target, struct auth_data *mqtt_auth, bool *fallback_ipv4) +static https_client_resp_t aclk_send_otp_response(const char *agent_id, const unsigned char *response, int response_bytes, url_t *target, struct auth_data *mqtt_auth, bool *fallback_ipv4) { - int rc = 1; + https_client_resp_t rc; https_req_t req = HTTPS_REQ_T_INITIALIZER; https_req_response_t resp = HTTPS_REQ_RESPONSE_T_INITIALIZER; @@ -367,28 +374,31 @@ int aclk_send_otp_response(const char *agent_id, const unsigned char *response, req.payload = (char *)buffer_tostring(resp_json); req.payload_size = strlen(req.payload); - if (aclk_https_request(&req, &resp, fallback_ipv4)) { - netdata_log_error("ACLK_OTP Password error trying to post result to password"); + rc = aclk_https_request(&req, &resp, fallback_ipv4); + if (rc != HTTPS_CLIENT_RESP_OK) { + netdata_log_error("ACLK: OTP Password error trying to post result to password"); goto cleanup_buffers; } if (resp.http_code != 201) { - netdata_log_error("ACLK_OTP Password HTTP code not 201 Created (got %d)", resp.http_code); + rc = HTTPS_CLIENT_RESP_OTP_PASSWORD_NOT_201; + netdata_log_error("ACLK: OTP Password HTTP code not 201 Created (got %d)", resp.http_code); if (resp.payload_size) aclk_parse_otp_error(resp.payload); goto cleanup_response; } if (resp.payload_size == 0 || resp.payload == NULL) { - netdata_log_error("ACLK_OTP Password response payload is empty despite returning 201 Created!"); + rc = HTTPS_CLIENT_RESP_OTP_PASSWORD_EMPTY; + netdata_log_error("ACLK: OTP Password response payload is empty despite returning 201 Created!"); goto cleanup_response; } - netdata_log_info("ACLK_OTP Got Password from Cloud"); if 
(parse_passwd_response(resp.payload, mqtt_auth)){ - netdata_log_error("Error parsing response of password endpoint"); + rc = HTTPS_CLIENT_RESP_OTP_PASSWORD_NOT_JSON; + netdata_log_error("ACLK: Error parsing response of password endpoint"); goto cleanup_response; } - rc = 0; + rc = HTTPS_CLIENT_RESP_OK; cleanup_response: https_req_response_free(&resp); @@ -437,49 +447,52 @@ static int private_decrypt(RSA *p_key, unsigned char * enc_data, int data_len, u { char err[512]; ERR_error_string_n(ERR_get_error(), err, sizeof(err)); - netdata_log_error("Decryption of the challenge failed: %s", err); + netdata_log_error("ACLK: Decryption of the challenge failed: %s", err); } return result; } #if OPENSSL_VERSION_NUMBER >= OPENSSL_VERSION_300 -int aclk_get_mqtt_otp(EVP_PKEY *p_key, char **mqtt_id, char **mqtt_usr, char **mqtt_pass, url_t *target, bool *fallback_ipv4) +https_client_resp_t aclk_get_mqtt_otp(EVP_PKEY *p_key, char **mqtt_id, char **mqtt_usr, char **mqtt_pass, url_t *target, bool *fallback_ipv4) #else -int aclk_get_mqtt_otp(RSA *p_key, char **mqtt_id, char **mqtt_usr, char **mqtt_pass, url_t *target, bool *fallback_ipv4) +https_client_resp_t aclk_get_mqtt_otp(RSA *p_key, char **mqtt_id, char **mqtt_usr, char **mqtt_pass, url_t *target, bool *fallback_ipv4) #endif { unsigned char *challenge = NULL; int challenge_bytes; + https_client_resp_t rc; CLAIM_ID claim_id = claim_id_get(); if (!claim_id_is_set(claim_id)) { - netdata_log_error("Agent was not claimed - cannot perform challenge/response"); - return 1; + netdata_log_error("ACLK: Agent was not claimed - cannot perform challenge/response"); + return HTTPS_CLIENT_RESP_OTP_AGENT_NOT_CLAIMED; } // Get Challenge - if (aclk_get_otp_challenge(target, claim_id.str, &challenge, &challenge_bytes, fallback_ipv4)) { - netdata_log_error("Error getting challenge"); - return 1; + rc = aclk_get_otp_challenge(target, claim_id.str, &challenge, &challenge_bytes, fallback_ipv4); + if (rc != HTTPS_CLIENT_RESP_OK) { + 
netdata_log_error("ACLK: error getting challenge"); + return rc; } // Decrypt Challenge / Get response unsigned char *response_plaintext = NULL; int response_plaintext_bytes = private_decrypt(p_key, challenge, challenge_bytes, &response_plaintext); if (response_plaintext_bytes < 0) { - netdata_log_error("Couldn't decrypt the challenge received"); + netdata_log_error("ACLK: Couldn't decrypt the challenge received"); freez(response_plaintext); freez(challenge); - return 1; + return HTTPS_CLIENT_RESP_OTP_CHALLENGE_DECRYPTION_FAILED; } freez(challenge); // Encode and Send Challenge struct auth_data data = { .client_id = NULL, .passwd = NULL, .username = NULL }; - if (aclk_send_otp_response(claim_id.str, response_plaintext, response_plaintext_bytes, target, &data, fallback_ipv4)) { - netdata_log_error("Error getting response"); + rc = aclk_send_otp_response(claim_id.str, response_plaintext, response_plaintext_bytes, target, &data, fallback_ipv4); + if (rc != HTTPS_CLIENT_RESP_OK) { + netdata_log_error("ACLK: Error getting response"); freez(response_plaintext); - return 1; + return rc; } *mqtt_pass = data.passwd; @@ -487,7 +500,7 @@ int aclk_get_mqtt_otp(RSA *p_key, char **mqtt_id, char **mqtt_usr, char **mqtt_p *mqtt_id = data.client_id; freez(response_plaintext); - return 0; + return HTTPS_CLIENT_RESP_OK; } #define JSON_KEY_ENC "encoding" @@ -512,12 +525,12 @@ static int parse_json_env_transport(json_object *json, aclk_transport_desc_t *tr if (!strcmp(json_object_iter_peek_name(&it), JSON_KEY_TRP_TYPE)) { PARSE_ENV_JSON_CHK_TYPE(&it, json_type_string, JSON_KEY_TRP_TYPE) if (trp->type != ACLK_TRP_UNKNOWN) { - netdata_log_error(JSON_KEY_TRP_TYPE " set already"); + netdata_log_error("ACLK: " JSON_KEY_TRP_TYPE " set already"); goto exit; } trp->type = aclk_transport_type_t_from_str(json_object_get_string(json_object_iter_peek_value(&it))); if (trp->type == ACLK_TRP_UNKNOWN) { - netdata_log_error(JSON_KEY_TRP_TYPE " unknown type \"%s\"", 
json_object_get_string(json_object_iter_peek_value(&it))); + netdata_log_error("ACLK: " JSON_KEY_TRP_TYPE " unknown type \"%s\"", json_object_get_string(json_object_iter_peek_value(&it))); goto exit; } json_object_iter_next(&it); @@ -527,7 +540,7 @@ static int parse_json_env_transport(json_object *json, aclk_transport_desc_t *tr if (!strcmp(json_object_iter_peek_name(&it), JSON_KEY_TRP_ENDPOINT)) { PARSE_ENV_JSON_CHK_TYPE(&it, json_type_string, JSON_KEY_TRP_ENDPOINT) if (trp->endpoint) { - netdata_log_error(JSON_KEY_TRP_ENDPOINT " set already"); + netdata_log_error("ACLK: " JSON_KEY_TRP_ENDPOINT " set already"); goto exit; } trp->endpoint = strdupz(json_object_get_string(json_object_iter_peek_value(&it))); @@ -535,17 +548,17 @@ static int parse_json_env_transport(json_object *json, aclk_transport_desc_t *tr continue; } - netdata_log_error("unknown JSON key in dictionary (\"%s\")", json_object_iter_peek_name(&it)); + netdata_log_error("ACLK: unknown JSON key in dictionary (\"%s\")", json_object_iter_peek_name(&it)); json_object_iter_next(&it); } if (!trp->endpoint) { - netdata_log_error(JSON_KEY_TRP_ENDPOINT " is missing from JSON dictionary"); + netdata_log_error("ACLK: " JSON_KEY_TRP_ENDPOINT " is missing from JSON dictionary"); goto exit; } if (trp->type == ACLK_TRP_UNKNOWN) { - netdata_log_error("transport type not set"); + netdata_log_error("ACLK: transport type not set"); goto exit; } @@ -561,7 +574,7 @@ static int parse_json_env_transports(json_object *json_array, aclk_env_t *env) { json_object *obj; if (env->transports) { - netdata_log_error("transports have been set already"); + netdata_log_error("ACLK: transports have been set already"); return 1; } @@ -573,7 +586,7 @@ static int parse_json_env_transports(json_object *json_array, aclk_env_t *env) { trp = callocz(1, sizeof(aclk_transport_desc_t)); obj = json_object_array_get_idx(json_array, i); if (parse_json_env_transport(obj, trp)) { - netdata_log_error("error parsing transport idx %d", (int)i); + 
netdata_log_error("ACLK: error parsing transport idx %d", (int)i); freez(trp); return 1; } @@ -589,14 +602,14 @@ static int parse_json_env_transports(json_object *json_array, aclk_env_t *env) { static int parse_json_backoff_int(struct json_object_iterator *it, int *out, const char* name, int min, int max) { if (!strcmp(json_object_iter_peek_name(it), name)) { if (json_object_get_type(json_object_iter_peek_value(it)) != json_type_int) { - netdata_log_error("Could not parse \"%s\". Not an integer as expected.", name); + netdata_log_error("ACLK: Could not parse \"%s\". Not an integer as expected.", name); return MATCHED_ERROR; } *out = json_object_get_int(json_object_iter_peek_value(it)); if (*out < min || *out > max) { - netdata_log_error("Value of \"%s\"=%d out of range (%d-%d).", name, *out, min, max); + netdata_log_error("ACLK: Value of \"%s\"=%d out of range (%d-%d).", name, *out, min, max); return MATCHED_ERROR; } @@ -638,7 +651,7 @@ static int parse_json_backoff(json_object *json, aclk_backoff_t *backoff) { continue; } - netdata_log_error("unknown JSON key in dictionary (\"%s\")", json_object_iter_peek_name(&it)); + netdata_log_error("ACLK: unknown JSON key in dictionary (\"%s\")", json_object_iter_peek_name(&it)); json_object_iter_next(&it); } @@ -650,7 +663,7 @@ static int parse_json_env_caps(json_object *json, aclk_env_t *env) { const char *str; if (env->capabilities) { - netdata_log_error("transports have been set already"); + netdata_log_error("ACLK: transports have been set already"); return 1; } @@ -665,12 +678,12 @@ static int parse_json_env_caps(json_object *json, aclk_env_t *env) { for (size_t i = 0; i < env->capability_count; i++) { obj = json_object_array_get_idx(json, i); if (json_object_get_type(obj) != json_type_string) { - netdata_log_error("Capability at index %d not a string!", (int)i); + netdata_log_error("ACLK: Capability at index %d not a string!", (int)i); return 1; } str = json_object_get_string(obj); if (!str) { - 
netdata_log_error("Error parsing capabilities"); + netdata_log_error("ACLK: Error parsing capabilities"); return 1; } env->capabilities[i] = strdupz(str); @@ -686,7 +699,7 @@ static int parse_json_env(const char *json_str, aclk_env_t *env) { json = json_tokener_parse(json_str); if (!json) { - netdata_log_error("JSON-C failed to parse the payload of http response of /env endpoint"); + netdata_log_error("ACLK: JSON-C failed to parse the payload of http response of /env endpoint"); return 1; } @@ -697,7 +710,7 @@ static int parse_json_env(const char *json_str, aclk_env_t *env) { if (!strcmp(json_object_iter_peek_name(&it), JSON_KEY_AUTH_ENDPOINT)) { PARSE_ENV_JSON_CHK_TYPE(&it, json_type_string, JSON_KEY_AUTH_ENDPOINT) if (env->auth_endpoint) { - netdata_log_error("authEndpoint set already"); + netdata_log_error("ACLK: authEndpoint set already"); goto exit; } env->auth_endpoint = strdupz(json_object_get_string(json_object_iter_peek_value(&it))); @@ -708,7 +721,7 @@ static int parse_json_env(const char *json_str, aclk_env_t *env) { if (!strcmp(json_object_iter_peek_name(&it), JSON_KEY_ENC)) { PARSE_ENV_JSON_CHK_TYPE(&it, json_type_string, JSON_KEY_ENC) if (env->encoding != ACLK_ENC_UNKNOWN) { - netdata_log_error(JSON_KEY_ENC " set already"); + netdata_log_error("ACLK: " JSON_KEY_ENC " set already"); goto exit; } env->encoding = aclk_encoding_type_t_from_str(json_object_get_string(json_object_iter_peek_value(&it))); @@ -731,7 +744,7 @@ static int parse_json_env(const char *json_str, aclk_env_t *env) { if (parse_json_backoff(json_object_iter_peek_value(&it), &env->backoff)) { env->backoff.base = 0; - netdata_log_error("Error parsing Backoff parameters in env"); + netdata_log_error("ACLK: Error parsing Backoff parameters in env"); goto exit; } @@ -743,7 +756,7 @@ static int parse_json_env(const char *json_str, aclk_env_t *env) { PARSE_ENV_JSON_CHK_TYPE(&it, json_type_array, JSON_KEY_CAPS) if (parse_json_env_caps(json_object_iter_peek_value(&it), env)) { - 
netdata_log_error("Error parsing capabilities list"); + netdata_log_error("ACLK: Error parsing capabilities list"); goto exit; } @@ -751,25 +764,25 @@ static int parse_json_env(const char *json_str, aclk_env_t *env) { continue; } - netdata_log_error("unknown JSON key in dictionary (\"%s\")", json_object_iter_peek_name(&it)); + netdata_log_error("ACLK: unknown JSON key in dictionary (\"%s\")", json_object_iter_peek_name(&it)); json_object_iter_next(&it); } // Check all compulsory keys have been set if (env->transport_count < 1) { - netdata_log_error("env has to return at least one transport"); + netdata_log_error("ACLK: env has to return at least one transport"); goto exit; } if (!env->auth_endpoint) { - netdata_log_error(JSON_KEY_AUTH_ENDPOINT " is compulsory"); + netdata_log_error("ACLK: " JSON_KEY_AUTH_ENDPOINT " is compulsory"); goto exit; } if (env->encoding == ACLK_ENC_UNKNOWN) { - netdata_log_error(JSON_KEY_ENC " is compulsory"); + netdata_log_error("ACLK: " JSON_KEY_ENC " is compulsory"); goto exit; } if (!env->backoff.base) { - netdata_log_error(JSON_KEY_BACKOFF " is compulsory"); + netdata_log_error("ACLK: " JSON_KEY_BACKOFF " is compulsory"); goto exit; } @@ -782,9 +795,10 @@ static int parse_json_env(const char *json_str, aclk_env_t *env) { return 1; } -int aclk_get_env(aclk_env_t *env, const char* aclk_hostname, int aclk_port, bool *fallback_ipv4) { +https_client_resp_t aclk_get_env(aclk_env_t *env, const char* aclk_hostname, int aclk_port, bool *fallback_ipv4) { BUFFER *buf = buffer_create(1024, &netdata_buffers_statistics.buffers_aclk); + https_client_resp_t rc; https_req_t req = HTTPS_REQ_T_INITIALIZER; https_req_response_t resp = HTTPS_REQ_RESPONSE_T_INITIALIZER; @@ -792,9 +806,9 @@ int aclk_get_env(aclk_env_t *env, const char* aclk_hostname, int aclk_port, bool CLAIM_ID claim_id = claim_id_get(); if (!claim_id_is_set(claim_id)) { - netdata_log_error("Agent was not claimed - cannot perform challenge/response"); + netdata_log_error("ACLK: failed to 
get ACLK environment (agent is not claimed)"); buffer_free(buf); - return 1; + return HTTPS_CLIENT_RESP_ENV_AGENT_NOT_CLAIMED; } buffer_sprintf(buf, "/api/v1/env?v=%s&cap=proto,ctx&claim_id=%s", &(NETDATA_VERSION[1]) /* skip 'v' at beginning */, claim_id.str); @@ -802,38 +816,37 @@ int aclk_get_env(aclk_env_t *env, const char* aclk_hostname, int aclk_port, bool req.host = (char*)aclk_hostname; req.port = aclk_port; req.url = buf->buffer; - if (aclk_https_request(&req, &resp, fallback_ipv4)) { - netdata_log_error("Error trying to contact env endpoint"); + rc = aclk_https_request(&req, &resp, fallback_ipv4); + if (rc != HTTPS_CLIENT_RESP_OK) { + netdata_log_error("ACLK: failed to get ACLK environment (cannot contact ENV endpoint)"); https_req_response_free(&resp); buffer_free(buf); - return 2; + return rc; } if (resp.http_code != 200) { - netdata_log_error("The HTTP code not 200 OK (Got %d)", resp.http_code); + netdata_log_error("ACLK: failed to get ACLK environment (ENV response code is not 200) (got %d)", resp.http_code); if (resp.payload_size) aclk_parse_otp_error(resp.payload); https_req_response_free(&resp); buffer_free(buf); - return 3; + return HTTPS_CLIENT_RESP_ENV_NOT_200; } if (!resp.payload || !resp.payload_size) { - netdata_log_error("Unexpected empty payload as response to /env call"); + netdata_log_error("ACLK: failed to get ACLK environment (ENV response is empty)"); https_req_response_free(&resp); buffer_free(buf); - return 4; + return HTTPS_CLIENT_RESP_ENV_EMPTY; } if (parse_json_env(resp.payload, env)) { - netdata_log_error("error parsing /env message"); + netdata_log_error("ACLK: failed to get ACLK environment (ENV response is not JSON)"); https_req_response_free(&resp); buffer_free(buf); - return 5; + return HTTPS_CLIENT_RESP_ENV_NOT_JSON; } - netdata_log_info("Getting Cloud /env successful"); - https_req_response_free(&resp); buffer_free(buf); - return 0; + return HTTPS_CLIENT_RESP_OK; } diff --git a/src/aclk/aclk_otp.h b/src/aclk/aclk_otp.h 
index a4a3a60b789392..02911905b61bde 100644 --- a/src/aclk/aclk_otp.h +++ b/src/aclk/aclk_otp.h @@ -9,10 +9,10 @@ #include "aclk_util.h" #if OPENSSL_VERSION_NUMBER >= OPENSSL_VERSION_300 -int aclk_get_mqtt_otp(EVP_PKEY *p_key, char **mqtt_id, char **mqtt_usr, char **mqtt_pass, url_t *target, bool *fallback_ipv4); +https_client_resp_t aclk_get_mqtt_otp(EVP_PKEY *p_key, char **mqtt_id, char **mqtt_usr, char **mqtt_pass, url_t *target, bool *fallback_ipv4); #else -int aclk_get_mqtt_otp(RSA *p_key, char **mqtt_id, char **mqtt_usr, char **mqtt_pass, url_t *target, bool *fallback_ipv4); +https_client_resp_t aclk_get_mqtt_otp(RSA *p_key, char **mqtt_id, char **mqtt_usr, char **mqtt_pass, url_t *target, bool *fallback_ipv4); #endif -int aclk_get_env(aclk_env_t *env, const char *aclk_hostname, int aclk_port, bool *fallback_ipv4); +https_client_resp_t aclk_get_env(aclk_env_t *env, const char *aclk_hostname, int aclk_port, bool *fallback_ipv4); #endif /* ACLK_OTP_H */ diff --git a/src/aclk/https_client.c b/src/aclk/https_client.c index 9703dd7e211942..bcfd0237161dc0 100644 --- a/src/aclk/https_client.c +++ b/src/aclk/https_client.c @@ -8,6 +8,130 @@ #include "daemon/pulse/pulse.h" +ENUM_STR_MAP_DEFINE(https_client_resp_t) = { + { + .id = HTTPS_CLIENT_RESP_OK, + .name = "ok", + }, + { + .id = HTTPS_CLIENT_RESP_UNKNOWN_ERROR, + .name = "unknown error", + }, + { + .id = HTTPS_CLIENT_RESP_NO_MEM, + .name = "not enough memory", + }, + { + .id = HTTPS_CLIENT_RESP_NONBLOCK_FAILED, + .name = "cannot set socket to non-blocking mode", + }, + { + .id = HTTPS_CLIENT_RESP_PROXY_NOT_200, + .name = "proxy did not return http/200", + }, + { + .id = HTTPS_CLIENT_RESP_NO_SSL_CTX, + .name = "cannot create SSL ctx", + }, + { + .id = HTTPS_CLIENT_RESP_NO_SSL_VERIFY_PATHS, + .name = "cannot set SSL verify paths", + }, + { + .id = HTTPS_CLIENT_RESP_NO_SSL_NEW, + .name = "cannot create SSL", + }, + { + .id = HTTPS_CLIENT_RESP_NO_TLS_SNI, + .name = "cannot set TLS SNI", + }, + { + .id = 
HTTPS_CLIENT_RESP_SSL_CONNECT_FAILED, + .name = "SSL_connect() failed", + }, + { + .id = HTTPS_CLIENT_RESP_SSL_START_FAILED, + .name = "cannot start SSL connection", + }, + { + .id = HTTPS_CLIENT_RESP_UNKNOWN_REQUEST_TYPE, + .name = "unknown https client request type", + }, + { + .id = HTTPS_CLIENT_RESP_HEADER_WRITE_FAILED, + .name = "https client failed to write http header", + }, + { + .id = HTTPS_CLIENT_RESP_PAYLOAD_WRITE_FAILED, + .name = "https client failed to write http payload", + }, + { + .id = HTTPS_CLIENT_RESP_POLL_ERROR, + .name = "https client poll() error", + }, + { + .id = HTTPS_CLIENT_RESP_TIMEOUT, + .name = "https client timeout", + }, + { + .id = HTTPS_CLIENT_RESP_READ_ERROR, + .name = "https client read error", + }, + { + .id = HTTPS_CLIENT_RESP_PARSE_ERROR, + .name = "https client parsing of response failed", + }, + { + .id = HTTPS_CLIENT_RESP_ENV_AGENT_NOT_CLAIMED, + .name = "agent is not claimed (during /env)", + }, + { + .id = HTTPS_CLIENT_RESP_ENV_NOT_200, + .name = "/env response code is not 200", + }, + { + .id = HTTPS_CLIENT_RESP_ENV_EMPTY, + .name = "/env response is empty", + }, + { + .id = HTTPS_CLIENT_RESP_ENV_NOT_JSON, + .name = "/env response is not JSON", + }, + { + .id = HTTPS_CLIENT_RESP_OTP_CHALLENGE_NOT_200, + .name = "otp challenge response is not http/200", + }, + { + .id = HTTPS_CLIENT_RESP_OTP_CHALLENGE_INVALID, + .name = "otp challenge response is invalid", + }, + { + .id = HTTPS_CLIENT_RESP_OTP_PASSWORD_NOT_201, + .name = "otp password response is not http/201", + }, + { + .id = HTTPS_CLIENT_RESP_OTP_PASSWORD_EMPTY, + .name = "otp password response is empty", + }, + { + .id = HTTPS_CLIENT_RESP_OTP_PASSWORD_NOT_JSON, + .name = "otp password response is not JSON", + }, + { + .id = HTTPS_CLIENT_RESP_OTP_AGENT_NOT_CLAIMED, + .name = "agent is not claimed (during otp)", + }, + { + .id = HTTPS_CLIENT_RESP_OTP_CHALLENGE_DECRYPTION_FAILED, + .name = "otp challenge decryption failed", + }, + + // terminator + {.name = NULL, .id = 
0} +}; + +ENUM_STR_DEFINE_FUNCTIONS(https_client_resp_t, HTTPS_CLIENT_RESP_UNKNOWN_ERROR, "unknown error"); + static const char *http_req_type_to_str(http_req_type_t req) { switch (req) { case HTTP_REQ_GET: @@ -62,16 +186,16 @@ static int process_http_hdr(http_parse_ctx *parse_ctx, const char *key, const ch // we can skip the rest if (parse_ctx->content_length < 0 && !strcmp("content-length", key)) { if (parse_ctx->content_length == TRANSFER_ENCODING_CHUNKED) { - netdata_log_error("Content-length and transfer-encoding: chunked headers are mutually exclusive"); + netdata_log_error("ACLK: Content-length and transfer-encoding: chunked headers are mutually exclusive"); return 1; } if (parse_ctx->content_length != -1) { - netdata_log_error("Duplicate content-length header"); + netdata_log_error("ACLK: Duplicate content-length header"); return 1; } parse_ctx->content_length = str2u(val); if (parse_ctx->content_length < 0) { - netdata_log_error("Invalid content-length %d", parse_ctx->content_length); + netdata_log_error("ACLK: Invalid content-length %d", parse_ctx->content_length); return 1; } return 0; @@ -79,7 +203,7 @@ static int process_http_hdr(http_parse_ctx *parse_ctx, const char *key, const ch if (!strcmp("transfer-encoding", key)) { if (!strcmp("chunked", val)) { if (parse_ctx->content_length != -1) { - netdata_log_error("Content-length and transfer-encoding: chunked headers are mutually exclusive"); + netdata_log_error("ACLK: Content-length and transfer-encoding: chunked headers are mutually exclusive"); return 1; } parse_ctx->content_length = TRANSFER_ENCODING_CHUNKED; @@ -108,17 +232,17 @@ static int parse_http_hdr(rbuf_t buf, http_parse_ctx *parse_ctx) char *ptr; if (!rbuf_find_bytes(buf, HTTP_LINE_TERM, strlen(HTTP_LINE_TERM), &idx_end)) { - netdata_log_error("CRLF expected"); + netdata_log_error("ACLK: CRLF expected"); return 1; } char *separator = rbuf_find_bytes(buf, HTTP_KEYVAL_SEPARATOR, strlen(HTTP_KEYVAL_SEPARATOR), &idx); if (!separator) { - 
netdata_log_error("Missing Key/Value separator"); + netdata_log_error("ACLK: Missing Key/Value separator"); return 1; } if (idx >= HTTP_HDR_BUFFER_SIZE) { - netdata_log_error("Key name is too long"); + netdata_log_error("ACLK: Key name is too long"); return 1; } @@ -128,7 +252,7 @@ static int parse_http_hdr(rbuf_t buf, http_parse_ctx *parse_ctx) rbuf_bump_tail(buf, strlen(HTTP_KEYVAL_SEPARATOR)); idx_end -= strlen(HTTP_KEYVAL_SEPARATOR) + idx; if (idx_end >= HTTP_HDR_BUFFER_SIZE) { - netdata_log_error("Value of key \"%s\" too long", buf_key); + netdata_log_error("ACLK: Value of key \"%s\" too long", buf_key); return 1; } @@ -173,7 +297,7 @@ static int process_chunked_content(rbuf_t buf, http_parse_ctx *parse_ctx) continue; } if (idx >= HTTP_HDR_BUFFER_SIZE) { - netdata_log_error("Chunk size is too long"); + netdata_log_error("ACLK: Chunk size is too long"); return HTTP_PARSE_ERROR; } char buf_size[HTTP_HDR_BUFFER_SIZE]; @@ -181,13 +305,13 @@ static int process_chunked_content(rbuf_t buf, http_parse_ctx *parse_ctx) buf_size[idx] = 0; long chunk_size = strtol(buf_size, NULL, 16); if (chunk_size < 0 || chunk_size == LONG_MAX) { - netdata_log_error("Chunk size out of range"); + netdata_log_error("ACLK: Chunk size out of range"); return HTTP_PARSE_ERROR; } parse_ctx->chunk_size = chunk_size; if (parse_ctx->chunk_size == 0) { if (errno == EINVAL) { - netdata_log_error("Invalid chunk size"); + netdata_log_error("ACLK: Invalid chunk size"); return HTTP_PARSE_ERROR; } parse_ctx->chunked_content_state = CHUNKED_CONTENT_CHUNK_END_CRLF; @@ -217,12 +341,12 @@ static int process_chunked_content(rbuf_t buf, http_parse_ctx *parse_ctx) char buf_crlf[strlen(HTTP_LINE_TERM)]; rbuf_pop(buf, buf_crlf, strlen(HTTP_LINE_TERM)); if (memcmp(buf_crlf, HTTP_LINE_TERM, strlen(HTTP_LINE_TERM))) { - netdata_log_error("CRLF expected"); + netdata_log_error("ACLK: CRLF expected"); return HTTP_PARSE_ERROR; } if (parse_ctx->chunked_content_state == CHUNKED_CONTENT_FINAL_CRLF) { if 
(parse_ctx->chunked_response_size != parse_ctx->chunked_response_written) - netdata_log_error("Chunked response size mismatch"); + netdata_log_error("ACLK: Chunked response size mismatch"); chunked_response_buffer_grow_by(parse_ctx, 1); parse_ctx->chunked_response[parse_ctx->chunked_response_written] = 0; return HTTP_PARSE_SUCCESS; @@ -252,28 +376,28 @@ http_parse_rc parse_http_response(rbuf_t buf, http_parse_ctx *parse_ctx) if (parse_ctx->state == HTTP_PARSE_PROXY_CONNECT) { if (rbuf_memcmp_n(buf, RESP_PROTO10, strlen(RESP_PROTO10))) { netdata_log_error( - "Expected response to start with \"%s\" or \"%s\"", RESP_PROTO, RESP_PROTO10); + "ACLK: Expected response to start with \"%s\" or \"%s\"", RESP_PROTO, RESP_PROTO10); return HTTP_PARSE_ERROR; } } else { - netdata_log_error("Expected response to start with \"%s\"", RESP_PROTO); + netdata_log_error("ACLK: Expected response to start with \"%s\"", RESP_PROTO); return HTTP_PARSE_ERROR; } } rbuf_bump_tail(buf, strlen(RESP_PROTO)); if (rbuf_pop(buf, rc, 4) != 4) { - netdata_log_error("Expected HTTP status code"); + netdata_log_error("ACLK: Expected HTTP status code"); return HTTP_PARSE_ERROR; } if (rc[3] != ' ') { - netdata_log_error("Expected space after HTTP return code"); + netdata_log_error("ACLK: Expected space after HTTP return code"); return HTTP_PARSE_ERROR; } rc[3] = 0; parse_ctx->http_code = atoi(rc); if (parse_ctx->http_code < 100 || parse_ctx->http_code >= 600) { - netdata_log_error("HTTP code not in range 100 to 599"); + netdata_log_error("ACLK: HTTP code not in range 100 to 599"); return HTTP_PARSE_ERROR; } @@ -332,7 +456,7 @@ typedef struct https_req_ctx { static int https_req_check_timedout(https_req_ctx_t *ctx) { if (now_realtime_sec() > ctx->req_start_time + ctx->request->timeout_s) { - netdata_log_error("request timed out"); + netdata_log_error("ACLK: request timed out"); return 1; } return 0; @@ -366,12 +490,12 @@ static int socket_write_all(https_req_ctx_t *ctx, char *data, size_t data_len) { do { 
int ret = poll(&ctx->poll_fd, 1, POLL_TO_MS); if (ret < 0) { - netdata_log_error("poll error"); + netdata_log_error("ACLK: poll error"); return 1; } if (ret == 0) { if (https_req_check_timedout(ctx)) { - netdata_log_error("Poll timed out"); + netdata_log_error("ACLK: Poll timed out"); return 2; } continue; @@ -381,7 +505,7 @@ static int socket_write_all(https_req_ctx_t *ctx, char *data, size_t data_len) { if (ret > 0) { ctx->written += ret; } else if (errno != EAGAIN && errno != EWOULDBLOCK) { - netdata_log_error("Error writing to socket"); + netdata_log_error("ACLK: Error writing to socket"); return 3; } } while (ctx->written < data_len); @@ -396,12 +520,12 @@ static int ssl_write_all(https_req_ctx_t *ctx, char *data, size_t data_len) { do { int ret = poll(&ctx->poll_fd, 1, POLL_TO_MS); if (ret < 0) { - netdata_log_error("poll error"); + netdata_log_error("ACLK: poll error"); return 1; } if (ret == 0) { if (https_req_check_timedout(ctx)) { - netdata_log_error("Poll timed out"); + netdata_log_error("ACLK: Poll timed out"); return 2; } continue; @@ -421,7 +545,7 @@ static int ssl_write_all(https_req_ctx_t *ctx, char *data, size_t data_len) { ctx->poll_fd.events |= POLLOUT; break; default: - netdata_log_error("SSL_write Err: %s", _ssl_err_tos(ret)); + netdata_log_error("ACLK: SSL_write Err: %s", _ssl_err_tos(ret)); return 3; } } @@ -436,7 +560,7 @@ static inline int https_client_write_all(https_req_ctx_t *ctx, char *data, size_ return socket_write_all(ctx, data, data_len); } -static int read_parse_response(https_req_ctx_t *ctx) { +static https_client_resp_t read_parse_response(https_req_ctx_t *ctx) { int ret; char *ptr; size_t size; @@ -445,13 +569,13 @@ static int read_parse_response(https_req_ctx_t *ctx) { do { ret = poll(&ctx->poll_fd, 1, POLL_TO_MS); if (ret < 0) { - netdata_log_error("poll error"); - return 1; + netdata_log_error("ACLK: poll error"); + return HTTPS_CLIENT_RESP_POLL_ERROR; } if (ret == 0) { if (https_req_check_timedout(ctx)) { - 
netdata_log_error("Poll timed out"); - return 2; + netdata_log_error("ACLK: poll() timed out"); + return HTTPS_CLIENT_RESP_TIMEOUT; } if (!ctx->ssl_ctx) continue; @@ -479,13 +603,14 @@ static int read_parse_response(https_req_ctx_t *ctx) { ctx->poll_fd.events |= POLLOUT; break; default: - netdata_log_error("SSL_read Err: %s", _ssl_err_tos(ret)); - return 3; + netdata_log_error("ACLK: SSL_read() Err: %s", _ssl_err_tos(ret)); + return HTTPS_CLIENT_RESP_READ_ERROR; } - } else { + } + else { if (errno != EAGAIN && errno != EWOULDBLOCK) { - netdata_log_error("write error"); - return 3; + netdata_log_error("ACLK: read error"); + return HTTPS_CLIENT_RESP_READ_ERROR; } ctx->poll_fd.events |= POLLIN; } @@ -494,11 +619,11 @@ static int read_parse_response(https_req_ctx_t *ctx) { } while (!(ret = parse_http_response(ctx->buf_rx, &ctx->parse_ctx))); if (ret != HTTP_PARSE_SUCCESS) { - netdata_log_error("Error parsing HTTP response"); - return 1; + netdata_log_error("ACLK: error parsing HTTP response"); + return HTTPS_CLIENT_RESP_PARSE_ERROR; } - return 0; + return HTTPS_CLIENT_RESP_OK; } static const char *http_methods[] = { @@ -510,15 +635,15 @@ static const char *http_methods[] = { #define TX_BUFFER_SIZE 8192 #define RX_BUFFER_SIZE (TX_BUFFER_SIZE*2) -static int handle_http_request(https_req_ctx_t *ctx) { +static https_client_resp_t handle_http_request(https_req_ctx_t *ctx) { BUFFER *hdr = buffer_create(TX_BUFFER_SIZE, &netdata_buffers_statistics.buffers_aclk); - int rc = 0; + https_client_resp_t rc = HTTPS_CLIENT_RESP_OK; http_req_type_t req_type = ctx->request->request_type; if (req_type >= HTTP_REQ_INVALID) { - netdata_log_error("Unknown HTTPS request type!"); - rc = 1; + netdata_log_error("ACLK: unknown HTTPS request type!"); + rc = HTTPS_CLIENT_RESP_UNKNOWN_REQUEST_TYPE; goto err_exit; } buffer_strcat(hdr, http_methods[req_type]); @@ -565,25 +690,25 @@ static int handle_http_request(https_req_ctx_t *ctx) { // Send the request if (https_client_write_all(ctx, hdr->buffer, 
hdr->len)) { - netdata_log_error("Couldn't write HTTP request header into SSL connection"); - rc = 2; + netdata_log_error("ACLK: couldn't write HTTP request header into SSL connection"); + rc = HTTPS_CLIENT_RESP_HEADER_WRITE_FAILED; goto err_exit; } if (req_type == HTTP_REQ_POST && ctx->request->payload && ctx->request->payload_size) { if (https_client_write_all(ctx, ctx->request->payload, ctx->request->payload_size)) { - netdata_log_error("Couldn't write payload into SSL connection"); - rc = 3; + netdata_log_error("ACLK: couldn't write payload into SSL connection"); + rc = HTTPS_CLIENT_RESP_PAYLOAD_WRITE_FAILED; goto err_exit; } } // Read The Response - if (read_parse_response(ctx)) { - netdata_log_error("Error reading or parsing response from server"); + rc = read_parse_response(ctx); + if (rc != HTTPS_CLIENT_RESP_OK) { + netdata_log_error("ACLK: error reading or parsing response from server"); if (ctx->parse_ctx.chunked_response) freez(ctx->parse_ctx.chunked_response); - rc = 4; } err_exit: @@ -613,16 +738,17 @@ static int cert_verify_callback(int preverify_ok, X509_STORE_CTX *ctx) if (!preverify_ok && err == X509_V_ERR_DEPTH_ZERO_SELF_SIGNED_CERT) { preverify_ok = 1; - netdata_log_error("Self Signed Certificate Accepted as the agent was built with ACLK_SSL_ALLOW_SELF_SIGNED"); + netdata_log_error("ACLK: Self Signed Certificate Accepted as the agent was built with ACLK_SSL_ALLOW_SELF_SIGNED"); } #endif return preverify_ok; } -int https_request(https_req_t *request, https_req_response_t *response, bool *fallback_ipv4) +https_client_resp_t https_request(https_req_t *request, https_req_response_t *response, bool *fallback_ipv4) { - int rc = 1, ret; + https_client_resp_t rc; + int ret; char connect_port_str[PORT_STR_MAX_BYTES]; const char *connect_host = request->proxy_host ? 
request->proxy_host : request->host; @@ -634,7 +760,8 @@ int https_request(https_req_t *request, https_req_response_t *response, bool *fa ctx->buf_rx = rbuf_create(RX_BUFFER_SIZE); if (!ctx->buf_rx) { - netdata_log_error("Couldn't allocate buffer for RX data"); + rc = HTTPS_CLIENT_RESP_NO_MEM; + netdata_log_error("ACLK: couldn't allocate buffer for RX data"); goto exit_req_ctx; } @@ -642,12 +769,14 @@ int https_request(https_req_t *request, https_req_response_t *response, bool *fa ctx->sock = connect_to_this_ip46(IPPROTO_TCP, SOCK_STREAM, connect_host, 0, connect_port_str, &timeout, fallback_ipv4); if (ctx->sock < 0) { - netdata_log_error("Error connecting TCP socket to \"%s\"", connect_host); + rc = -ctx->sock; + netdata_log_error("ACLK: error connecting TCP socket to \"%s\"", connect_host); goto exit_buf_rx; } if (fcntl(ctx->sock, F_SETFL, fcntl(ctx->sock, F_GETFL, 0) | O_NONBLOCK) == -1) { - netdata_log_error("Error setting O_NONBLOCK to TCP socket."); + rc = HTTPS_CLIENT_RESP_NONBLOCK_FAILED; + netdata_log_error("ACLK: error setting O_NONBLOCK to TCP socket."); goto exit_sock; } @@ -664,48 +793,54 @@ int https_request(https_req_t *request, https_req_response_t *response, bool *fa req.proxy_username = request->proxy_username; req.proxy_password = request->proxy_password; ctx->request = &req; - if (handle_http_request(ctx)) { - netdata_log_error("Failed to CONNECT with proxy"); + rc = handle_http_request(ctx); + if (rc != HTTPS_CLIENT_RESP_OK) { + netdata_log_error("ACLK: failed to CONNECT with proxy"); http_parse_ctx_destroy(&ctx->parse_ctx); goto exit_sock; } if (ctx->parse_ctx.http_code != 200) { - netdata_log_error("Proxy didn't return 200 OK (got %d)", ctx->parse_ctx.http_code); + rc = HTTPS_CLIENT_RESP_PROXY_NOT_200; + netdata_log_error("ACLK: proxy didn't return 200 OK (got %d)", ctx->parse_ctx.http_code); http_parse_ctx_destroy(&ctx->parse_ctx); goto exit_sock; } http_parse_ctx_destroy(&ctx->parse_ctx); - netdata_log_info("Proxy accepted CONNECT 
upgrade"); } ctx->request = request; ctx->ssl_ctx = netdata_ssl_create_client_ctx(0); if (ctx->ssl_ctx==NULL) { - netdata_log_error("Cannot allocate SSL context"); + rc = HTTPS_CLIENT_RESP_NO_SSL_CTX; + netdata_log_error("ACLK: cannot allocate SSL context"); goto exit_sock; } if (!SSL_CTX_set_default_verify_paths(ctx->ssl_ctx)) { - netdata_log_error("Error setting default verify paths"); + rc = HTTPS_CLIENT_RESP_NO_SSL_VERIFY_PATHS; + netdata_log_error("ACLK: error setting default verify paths"); goto exit_CTX; } SSL_CTX_set_verify(ctx->ssl_ctx, SSL_VERIFY_PEER | SSL_VERIFY_CLIENT_ONCE, cert_verify_callback); ctx->ssl = SSL_new(ctx->ssl_ctx); if (ctx->ssl==NULL) { - netdata_log_error("Cannot allocate SSL"); + rc = HTTPS_CLIENT_RESP_NO_SSL_NEW; + netdata_log_error("ACLK: cannot allocate SSL"); goto exit_CTX; } if (!SSL_set_tlsext_host_name(ctx->ssl, request->host)) { - netdata_log_error("Error setting TLS SNI host"); + rc = HTTPS_CLIENT_RESP_NO_TLS_SNI; + netdata_log_error("ACLK: error setting TLS SNI host"); goto exit_CTX; } SSL_set_fd(ctx->ssl, ctx->sock); ret = SSL_connect(ctx->ssl); if (ret != -1 && ret != 1) { - netdata_log_error("SSL could not connect"); + rc = HTTPS_CLIENT_RESP_SSL_CONNECT_FAILED; + netdata_log_error("ACLK: SSL failed to connect"); goto exit_SSL; } if (ret == -1) { @@ -713,14 +848,16 @@ int https_request(https_req_t *request, https_req_response_t *response, bool *fa // consult SSL_connect documentation for details int ec = SSL_get_error(ctx->ssl, ret); if (ec != SSL_ERROR_WANT_READ && ec != SSL_ERROR_WANT_WRITE) { - netdata_log_error("Failed to start SSL connection"); + rc = HTTPS_CLIENT_RESP_SSL_START_FAILED; + netdata_log_error("ACLK: failed to start SSL connection"); goto exit_SSL; } } // The actual request here - if (handle_http_request(ctx)) { - netdata_log_error("Couldn't process request"); + rc = handle_http_request(ctx); + if (rc != HTTPS_CLIENT_RESP_OK) { + netdata_log_error("ACLK: couldn't process request"); 
http_parse_ctx_destroy(&ctx->parse_ctx); goto exit_SSL; } @@ -735,7 +872,7 @@ int https_request(https_req_t *request, https_req_response_t *response, bool *fa response->payload = mallocz(response->payload_size + 1); ret = rbuf_pop(ctx->buf_rx, response->payload, response->payload_size); if (ret != (int)response->payload_size) { - netdata_log_error("Payload size doesn't match remaining data on the buffer!"); + netdata_log_error("ACLK: payload size doesn't match remaining data on the buffer!"); response->payload_size = ret; } // normally we take payload as it is and copy it @@ -746,9 +883,9 @@ int https_request(https_req_t *request, https_req_response_t *response, bool *fa // only exact data without affixed 0x00 ((char*)response->payload)[response->payload_size] = 0; // mallocz(response->payload_size + 1); } - netdata_log_info("HTTPS \"%s\" request to \"%s\" finished with HTTP code: %d", http_req_type_to_str(ctx->request->request_type), ctx->request->host, response->http_code); + netdata_log_info("ACLK: HTTPS \"%s\" request to \"%s\" finished with HTTP code: %d", http_req_type_to_str(ctx->request->request_type), ctx->request->host, response->http_code); - rc = 0; + rc = HTTPS_CLIENT_RESP_OK; exit_SSL: SSL_free(ctx->ssl); @@ -776,7 +913,7 @@ static inline char *UNUSED_FUNCTION(min_non_null)(char *a, char *b) { } #define URI_PROTO_SEPARATOR "://" -#define URL_PARSER_LOG_PREFIX "url_parser " +#define URL_PARSER_LOG_PREFIX "ACLK: url_parser " static int parse_host_port(url_t *url) { char *ptr = strrchr(url->host, ':'); diff --git a/src/aclk/https_client.h b/src/aclk/https_client.h index b1445a5b73216d..b7598cd9e67864 100644 --- a/src/aclk/https_client.h +++ b/src/aclk/https_client.h @@ -5,6 +5,46 @@ #include "libnetdata/libnetdata.h" +typedef enum https_client_resp { + HTTPS_CLIENT_RESP_OK = 0, + + // all the ND_SOCK_ERR_XXX are place here + + HTTPS_CLIENT_RESP_UNKNOWN_ERROR = ND_SOCK_ERR_MAX, + HTTPS_CLIENT_RESP_NO_MEM, + HTTPS_CLIENT_RESP_NONBLOCK_FAILED, + 
HTTPS_CLIENT_RESP_PROXY_NOT_200, + HTTPS_CLIENT_RESP_NO_SSL_CTX, + HTTPS_CLIENT_RESP_NO_SSL_VERIFY_PATHS, + HTTPS_CLIENT_RESP_NO_SSL_NEW, + HTTPS_CLIENT_RESP_NO_TLS_SNI, + HTTPS_CLIENT_RESP_SSL_CONNECT_FAILED, + HTTPS_CLIENT_RESP_SSL_START_FAILED, + HTTPS_CLIENT_RESP_UNKNOWN_REQUEST_TYPE, + HTTPS_CLIENT_RESP_HEADER_WRITE_FAILED, + HTTPS_CLIENT_RESP_PAYLOAD_WRITE_FAILED, + HTTPS_CLIENT_RESP_POLL_ERROR, + HTTPS_CLIENT_RESP_TIMEOUT, + HTTPS_CLIENT_RESP_READ_ERROR, + HTTPS_CLIENT_RESP_PARSE_ERROR, + HTTPS_CLIENT_RESP_ENV_AGENT_NOT_CLAIMED, + HTTPS_CLIENT_RESP_ENV_NOT_200, + HTTPS_CLIENT_RESP_ENV_EMPTY, + HTTPS_CLIENT_RESP_ENV_NOT_JSON, + HTTPS_CLIENT_RESP_OTP_CHALLENGE_NOT_200, + HTTPS_CLIENT_RESP_OTP_CHALLENGE_INVALID, + HTTPS_CLIENT_RESP_OTP_PASSWORD_NOT_201, + HTTPS_CLIENT_RESP_OTP_PASSWORD_EMPTY, + HTTPS_CLIENT_RESP_OTP_PASSWORD_NOT_JSON, + HTTPS_CLIENT_RESP_OTP_AGENT_NOT_CLAIMED, + HTTPS_CLIENT_RESP_OTP_CHALLENGE_DECRYPTION_FAILED, + + // terminator + HTTPS_CLIENT_RESP_MAX, +} https_client_resp_t; + +ENUM_STR_DEFINE_FUNCTIONS_EXTERN(https_client_resp_t); + typedef enum http_req_type { HTTP_REQ_GET = 0, HTTP_REQ_POST, @@ -75,7 +115,7 @@ void https_req_response_free(https_req_response_t *res); .proxy_port = 8080 \ } -int https_request(https_req_t *request, https_req_response_t *response, bool *fallback_ipv4); +https_client_resp_t https_request(https_req_t *request, https_req_response_t *response, bool *fallback_ipv4); // we expose previously internal parser as this is usefull also from // other parts of the code diff --git a/src/aclk/mqtt_websockets/aclk_mqtt_workers.h b/src/aclk/mqtt_websockets/aclk_mqtt_workers.h index 4bb1aff2b25d02..0b6192f9b26041 100644 --- a/src/aclk/mqtt_websockets/aclk_mqtt_workers.h +++ b/src/aclk/mqtt_websockets/aclk_mqtt_workers.h @@ -37,5 +37,6 @@ #define WORKER_ACLK_CPT_UNKNOWN 31 #define WORKER_ACLK_SEND_FRAGMENT 32 #define WORKER_ACLK_MSG_CALLBACK 33 +#define WORKER_ACLK_WAITING_TO_CONNECT 34 #endif //NETDATA_ACLK_MQTT_WORKERS_H diff 
--git a/src/aclk/mqtt_websockets/mqtt_wss_client.c b/src/aclk/mqtt_websockets/mqtt_wss_client.c index d3eb0152697ad1..1b56b7371f5316 100644 --- a/src/aclk/mqtt_websockets/mqtt_wss_client.c +++ b/src/aclk/mqtt_websockets/mqtt_wss_client.c @@ -816,10 +816,10 @@ int mqtt_wss_service(mqtt_wss_client client, int timeout_ms) if (errno == EINTR) { nd_log(NDLS_DAEMON, NDLP_WARNING, "poll interrupted by EINTR"); - return 0; + return MQTT_WSS_OK; } nd_log(NDLS_DAEMON, NDLP_ERR, "poll error \"%s\"", strerror(errno)); - return -2; + return MQTT_WSS_ERR_POLL_FAILED; } worker_is_busy(WORKER_ACLK_POLL_OK); @@ -842,7 +842,7 @@ int mqtt_wss_service(mqtt_wss_client client, int timeout_ms) } // if poll timed out and user requested timeout was being used // return here let user do his work and he will call us back soon - return 0; + return MQTT_WSS_OK; } } @@ -865,12 +865,20 @@ int mqtt_wss_service(mqtt_wss_client client, int timeout_ms) int errnobkp = errno; ret = SSL_get_error(client->ssl, ret); set_socket_pollfds(client, ret); + if (ret != SSL_ERROR_WANT_READ && ret != SSL_ERROR_WANT_WRITE) { worker_is_busy(WORKER_ACLK_RX_ERROR); nd_log(NDLS_DAEMON, NDLP_ERR, "SSL_read error: %d %s", ret, util_openssl_ret_err(ret)); + + if (ret == SSL_ERROR_ZERO_RETURN) { + nd_log(NDLS_DAEMON, NDLP_ERR, "SSL_read connection closed by remote end"); + return MQTT_WSS_ERR_REMOTE_CLOSED; + } + if (ret == SSL_ERROR_SYSCALL) nd_log(NDLS_DAEMON, NDLP_ERR, "SSL_read SYSCALL errno: %d %s", errnobkp, strerror(errnobkp)); + return MQTT_WSS_ERR_CONN_DROP; } } @@ -935,8 +943,15 @@ int mqtt_wss_service(mqtt_wss_client client, int timeout_ms) ret != SSL_ERROR_WANT_WRITE) { worker_is_busy(WORKER_ACLK_TX_ERROR); nd_log(NDLS_DAEMON, NDLP_ERR, "SSL_write error: %d %s", ret, util_openssl_ret_err(ret)); + + if (ret == SSL_ERROR_ZERO_RETURN) { + nd_log(NDLS_DAEMON, NDLP_ERR, "SSL_write connection closed by remote end"); + return MQTT_WSS_ERR_REMOTE_CLOSED; + } + if (ret == SSL_ERROR_SYSCALL) nd_log(NDLS_DAEMON, 
NDLP_ERR, "SSL_write SYSCALL errno: %d %s", errnobkp, strerror(errnobkp)); + return MQTT_WSS_ERR_CONN_DROP; } } diff --git a/src/aclk/mqtt_websockets/mqtt_wss_client.h b/src/aclk/mqtt_websockets/mqtt_wss_client.h index 0e5612bc581073..dd8698589a3c3c 100644 --- a/src/aclk/mqtt_websockets/mqtt_wss_client.h +++ b/src/aclk/mqtt_websockets/mqtt_wss_client.h @@ -15,6 +15,8 @@ #define MQTT_WSS_ERR_MSG_TOO_BIG -6 // Message size too big for server #define MQTT_WSS_ERR_CANT_DO -8 // if client was initialized with MQTT 3 but MQTT 5 feature // was requested by user of library +#define MQTT_WSS_ERR_POLL_FAILED -9 +#define MQTT_WSS_ERR_REMOTE_CLOSED -10 typedef struct mqtt_wss_client_struct *mqtt_wss_client; diff --git a/src/claim/cloud-status.c b/src/claim/cloud-status.c index 52f238128165df..1a3815675a526a 100644 --- a/src/claim/cloud-status.c +++ b/src/claim/cloud-status.c @@ -124,6 +124,7 @@ CLOUD_STATUS buffer_json_cloud_status(BUFFER *wb, time_t now_s) { CLAIM_ID claim_id = rrdhost_claim_id_get(localhost); buffer_json_member_add_string(wb, "claim_id", claim_id.str); buffer_json_member_add_string(wb, "url", cloud_config_url_get()); + buffer_json_member_add_string(wb, "reason", cloud_status_aclk_offline_reason()); break; } } diff --git a/src/collectors/systemd-journal.plugin/systemd-journal-annotations.c b/src/collectors/systemd-journal.plugin/systemd-journal-annotations.c index 3ca10ab74a0e83..ef9204db30a271 100644 --- a/src/collectors/systemd-journal.plugin/systemd-journal-annotations.c +++ b/src/collectors/systemd-journal.plugin/systemd-journal-annotations.c @@ -617,6 +617,7 @@ static void netdata_systemd_journal_message_ids_init(void) { msgid_into_dict("23e93dfccbf64e11aac858b9410d8a82", "Netdata fatal message"); msgid_into_dict("8ddaf5ba33a74078b609250db1e951f3", "Sensor state transition"); msgid_into_dict("ec87a56120d5431bace51e2fb8bba243", "Netdata log flood protection"); + msgid_into_dict("acb33cb95778476baac702eb7e4e151d", "Netdata Cloud connection"); } void 
netdata_systemd_journal_transform_message_id(FACETS *facets __maybe_unused, BUFFER *wb, FACETS_TRANSFORMATION_SCOPE scope __maybe_unused, void *data __maybe_unused) { diff --git a/src/daemon/pulse/pulse-parents.c b/src/daemon/pulse/pulse-parents.c new file mode 100644 index 00000000000000..19932644e32cb3 --- /dev/null +++ b/src/daemon/pulse/pulse-parents.c @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#define PULSE_INTERNALS +#include "pulse.h" + +DEFINE_JUDYL_TYPED(PHOST, PULSE_HOST_STATUS); + +// -------------------------------------------------------------------------------------------------------------------- +// parents + +struct by_reason { + size_t counters[STREAM_HANDSHAKE_NEGATIVE_MAX + 3]; + RRDSET *st; + RRDDIM *rd[STREAM_HANDSHAKE_NEGATIVE_MAX + 3]; +}; + +#define STREAM_HANDSHAKE_STREAM_INFO (STREAM_HANDSHAKE_NEGATIVE_MAX) +#define STREAM_HANDSHAKE_CONNECT (STREAM_HANDSHAKE_NEGATIVE_MAX + 1) +#define STREAM_HANDSHAKE_OTHER (STREAM_HANDSHAKE_NEGATIVE_MAX + 2) + +struct { + SPINLOCK spinlock; + PHOST_JudyLSet index; + + struct { + // counters + struct by_reason events_by_reason; + struct by_reason disconnects_by_reason; + + // gauges + ssize_t nodes_local; + ssize_t nodes_virtual; + ssize_t nodes_loading; + ssize_t nodes_archived; + ssize_t nodes_offline; + ssize_t nodes_waiting; + ssize_t nodes_replicating; + ssize_t nodes_running; + } parent; + + struct { + // counters + struct by_reason stream_info_failed_by_reason; + struct by_reason events_by_reason; + struct by_reason disconnects_by_reason; + + // gauges + ssize_t nodes_offline; + ssize_t nodes_connecting; + ssize_t nodes_pending; + ssize_t nodes_waiting; + ssize_t nodes_replicating; + ssize_t nodes_running; + ssize_t nodes_no_dst; + } sender; + +} p = { 0 }; + +static PULSE_HOST_STATUS pulse_host_detect_receiver_status(RRDHOST *host) { + RRDHOST_STATUS status = { 0 }; + rrdhost_status(host, now_realtime_sec(), &status); + + PULSE_HOST_STATUS rc = 0; + + if(status.db.status == 
RRDHOST_DB_STATUS_INITIALIZING || status.ingest.status == RRDHOST_INGEST_STATUS_INITIALIZING) + rc = PULSE_HOST_STATUS_LOADING; + + else if(status.ingest.type == RRDHOST_INGEST_TYPE_LOCALHOST) + rc = PULSE_HOST_STATUS_LOCAL; + + else if(status.ingest.type == RRDHOST_INGEST_TYPE_VIRTUAL) + rc = PULSE_HOST_STATUS_VIRTUAL; + + else if(status.ingest.status == RRDHOST_INGEST_STATUS_ARCHIVED) + rc = PULSE_HOST_STATUS_ARCHIVED; + + else if(status.ingest.status == RRDHOST_INGEST_STATUS_REPLICATING) + rc = PULSE_HOST_STATUS_RCV_REPLICATING; + + else if(status.ingest.status == RRDHOST_INGEST_STATUS_OFFLINE) + rc = PULSE_HOST_STATUS_RCV_OFFLINE; + + else if(status.ingest.status == RRDHOST_INGEST_STATUS_ONLINE) + rc = PULSE_HOST_STATUS_RCV_RUNNING; + + return rc; +} + +static void update_reason(struct by_reason *b, STREAM_HANDSHAKE reason) { + int r = reason; + + if(r >= 0) + r = 0; + else if(r > -STREAM_HANDSHAKE_NEGATIVE_MAX) + r = -reason; + else + r = STREAM_HANDSHAKE_NEGATIVE_MAX; + + __atomic_add_fetch(&b->counters[r], 1, __ATOMIC_RELAXED); +} + +static void pulse_host_add_sub_status(PULSE_HOST_STATUS status, ssize_t val, STREAM_HANDSHAKE reason) { + while(status) { + PULSE_HOST_STATUS s = 1 << (__builtin_ffs(status) - 1); + status &= ~s; + + bool do_parent_reason = false, do_sender_reason = false; + + switch(s) { + default: + break; + + case PULSE_HOST_STATUS_LOCAL: + __atomic_add_fetch(&p.parent.nodes_local, val, __ATOMIC_RELAXED); + break; + + case PULSE_HOST_STATUS_VIRTUAL: + __atomic_add_fetch(&p.parent.nodes_virtual, val, __ATOMIC_RELAXED); + break; + + case PULSE_HOST_STATUS_LOADING: + __atomic_add_fetch(&p.parent.nodes_loading, val, __ATOMIC_RELAXED); + break; + + case PULSE_HOST_STATUS_ARCHIVED: + __atomic_add_fetch(&p.parent.nodes_archived, val, __ATOMIC_RELAXED); + break; + + case PULSE_HOST_STATUS_RCV_OFFLINE: + __atomic_add_fetch(&p.parent.nodes_offline, val, __ATOMIC_RELAXED); + do_parent_reason = true; + break; + + case PULSE_HOST_STATUS_RCV_WAITING: + 
__atomic_add_fetch(&p.parent.nodes_waiting, val, __ATOMIC_RELAXED); + do_parent_reason = true; + reason = 0; + break; + + case PULSE_HOST_STATUS_RCV_REPLICATING: + __atomic_add_fetch(&p.parent.nodes_replicating, val, __ATOMIC_RELAXED); + break; + + case PULSE_HOST_STATUS_RCV_RUNNING: + __atomic_add_fetch(&p.parent.nodes_running, val, __ATOMIC_RELAXED); + break; + + case PULSE_HOST_STATUS_SND_OFFLINE: + __atomic_add_fetch(&p.sender.nodes_offline, val, __ATOMIC_RELAXED); + do_sender_reason = true; + break; + + case PULSE_HOST_STATUS_SND_PENDING: + __atomic_add_fetch(&p.sender.nodes_pending, val, __ATOMIC_RELAXED); + break; + + case PULSE_HOST_STATUS_SND_CONNECTING: + __atomic_add_fetch(&p.sender.nodes_connecting, val, __ATOMIC_RELAXED); + __atomic_add_fetch(&p.sender.events_by_reason.counters[STREAM_HANDSHAKE_CONNECT], 1, __ATOMIC_RELAXED); + break; + + case PULSE_HOST_STATUS_SND_WAITING: + __atomic_add_fetch(&p.sender.nodes_waiting, val, __ATOMIC_RELAXED); + break; + + case PULSE_HOST_STATUS_SND_REPLICATING: + __atomic_add_fetch(&p.sender.nodes_replicating, val, __ATOMIC_RELAXED); + break; + + case PULSE_HOST_STATUS_SND_RUNNING: + __atomic_add_fetch(&p.sender.nodes_running, val, __ATOMIC_RELAXED); + break; + + case PULSE_HOST_STATUS_SND_NO_DST: + __atomic_add_fetch(&p.sender.nodes_no_dst, val, __ATOMIC_RELAXED); + break; + } + + if(do_parent_reason && val > 0) + update_reason(&p.parent.disconnects_by_reason, reason); + + if(do_sender_reason && val > 0) + update_reason(&p.sender.disconnects_by_reason, reason); + } +} + +void pulse_host_status(RRDHOST *host, PULSE_HOST_STATUS status, STREAM_HANDSHAKE reason) { + PULSE_HOST_STATUS remove = 0; + + if(!status) + status = pulse_host_detect_receiver_status(host); + + PULSE_HOST_STATUS basic = PULSE_HOST_STATUS_LOCAL|PULSE_HOST_STATUS_VIRTUAL| PULSE_HOST_STATUS_LOADING |PULSE_HOST_STATUS_ARCHIVED|PULSE_HOST_STATUS_DELETED; + PULSE_HOST_STATUS rcv = 
PULSE_HOST_STATUS_RCV_OFFLINE|PULSE_HOST_STATUS_RCV_WAITING|PULSE_HOST_STATUS_RCV_REPLICATING|PULSE_HOST_STATUS_RCV_RUNNING; + PULSE_HOST_STATUS snd = PULSE_HOST_STATUS_SND_OFFLINE|PULSE_HOST_STATUS_SND_PENDING|PULSE_HOST_STATUS_SND_CONNECTING|PULSE_HOST_STATUS_SND_WAITING|PULSE_HOST_STATUS_SND_REPLICATING|PULSE_HOST_STATUS_SND_RUNNING|PULSE_HOST_STATUS_SND_NO_DST; + + if(status & basic) + remove = ~0; + else if(status & rcv) + remove = basic | rcv; + else if(status & snd) + remove = snd; + + spinlock_lock(&p.spinlock); + PULSE_HOST_STATUS old = PHOST_GET(&p.index, (uintptr_t)host); + if(status == PULSE_HOST_STATUS_DELETED) + PHOST_DEL(&p.index, (uintptr_t)host); + else + PHOST_SET(&p.index, (uintptr_t)host, (old & ~remove) | status); + spinlock_unlock(&p.spinlock); + + remove &= old; + pulse_host_add_sub_status(remove, -1, 0); + + if(status != PULSE_HOST_STATUS_DELETED) + pulse_host_add_sub_status(status, 1, reason); +} + +void pulse_parent_stream_info_received_request(void) { + __atomic_add_fetch(&p.parent.events_by_reason.counters[STREAM_HANDSHAKE_STREAM_INFO], 1, __ATOMIC_RELAXED); +} + +void pulse_parent_receiver_request(void) { + __atomic_add_fetch(&p.parent.events_by_reason.counters[STREAM_HANDSHAKE_CONNECT], 1, __ATOMIC_RELAXED); +} + +void pulse_parent_receiver_rejected(STREAM_HANDSHAKE reason) { + update_reason(&p.parent.events_by_reason, reason); +} + +// -------------------------------------------------------------------------------------------------------------------- +// children / senders + +void pulse_stream_info_sent_request(void) { + __atomic_add_fetch(&p.sender.events_by_reason.counters[STREAM_HANDSHAKE_STREAM_INFO], 1, __ATOMIC_RELAXED); +} + +void pulse_sender_stream_info_failed(const char *destination __maybe_unused, STREAM_HANDSHAKE reason) { + update_reason(&p.sender.stream_info_failed_by_reason, reason); +} + +void pulse_sender_connection_failed(const char *destination __maybe_unused, STREAM_HANDSHAKE reason) { + 
update_reason(&p.sender.events_by_reason, reason); +} + +// -------------------------------------------------------------------------------------------------------------------- + +static void chart_by_reason(struct by_reason *b, const char *id, const char *context, const char *title, int priority) { + if(!b->st) { + b->st = rrdset_create_localhost( + "netdata" + , id + , NULL + , "Streaming" + , context + , title + , "events" + , "netdata" + , "pulse" + , priority + , localhost->rrd_update_every + , RRDSET_TYPE_LINE + ); + + for(size_t i = 0; i < STREAM_HANDSHAKE_NEGATIVE_MAX ;i++) { + char buf[1024]; + if(!i) + strncpyz(buf, "connected", sizeof(buf) - 1); + else + strncpyz(buf, stream_handshake_error_to_string(-i), sizeof(buf) - 1); + for(int c = 0; buf[c] ;c++) + buf[c] = (char)tolower(buf[c]); + + b->rd[i] = rrddim_add(b->st, buf, NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + } + b->rd[STREAM_HANDSHAKE_STREAM_INFO] = rrddim_add(b->st, "info", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + b->rd[STREAM_HANDSHAKE_CONNECT] = rrddim_add(b->st, "connect", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + b->rd[STREAM_HANDSHAKE_OTHER] = rrddim_add(b->st, "other", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + } + + for(size_t i = 0; i <= STREAM_HANDSHAKE_OTHER ;i++) + rrddim_set_by_pointer(b->st, b->rd[i], (collected_number)__atomic_load_n(&b->counters[i], __ATOMIC_RELAXED)); + + rrdset_done(b->st); +} + +void pulse_parents_do(bool extended) { + if(netdata_conf_is_parent()) { + { + static RRDSET *st_nodes = NULL; + static RRDDIM *rd_loading = NULL; + static RRDDIM *rd_local = NULL; + static RRDDIM *rd_virtual = NULL; + static RRDDIM *rd_archived = NULL; + static RRDDIM *rd_offline = NULL; + static RRDDIM *rd_waiting = NULL; + static RRDDIM *rd_replicating = NULL; + static RRDDIM *rd_running = NULL; + + if (unlikely(!st_nodes)) { + st_nodes = rrdset_create_localhost( + "netdata" + , "streaming_inbound" + , NULL + , "Streaming" + , "netdata.streaming_inbound" + , "Inbound Nodes" + , "nodes" 
+ , "netdata" + , "pulse" + , 130150 + , localhost->rrd_update_every + , RRDSET_TYPE_LINE + ); + + rd_local = rrddim_add(st_nodes, "local", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_virtual = rrddim_add(st_nodes, "virtual", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_loading = rrddim_add(st_nodes, "loading", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_archived = rrddim_add(st_nodes, "stale", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_offline = rrddim_add(st_nodes, "offline", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_waiting = rrddim_add(st_nodes, "waiting", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_replicating = rrddim_add(st_nodes, "replicating", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_running = rrddim_add(st_nodes, "running", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + } + + rrddim_set_by_pointer(st_nodes, rd_local, (collected_number)__atomic_load_n(&p.parent.nodes_local, __ATOMIC_RELAXED)); + rrddim_set_by_pointer(st_nodes, rd_virtual, (collected_number)__atomic_load_n(&p.parent.nodes_virtual, __ATOMIC_RELAXED)); + rrddim_set_by_pointer(st_nodes,rd_loading, (collected_number)__atomic_load_n(&p.parent.nodes_loading, __ATOMIC_RELAXED)); + rrddim_set_by_pointer(st_nodes, rd_archived, (collected_number)__atomic_load_n(&p.parent.nodes_archived, __ATOMIC_RELAXED)); + rrddim_set_by_pointer(st_nodes, rd_offline, (collected_number)__atomic_load_n(&p.parent.nodes_offline, __ATOMIC_RELAXED)); + rrddim_set_by_pointer(st_nodes, rd_waiting, (collected_number)__atomic_load_n(&p.parent.nodes_waiting, __ATOMIC_RELAXED)); + rrddim_set_by_pointer(st_nodes, rd_replicating, (collected_number)__atomic_load_n(&p.parent.nodes_replicating, __ATOMIC_RELAXED)); + rrddim_set_by_pointer(st_nodes, rd_running, (collected_number)__atomic_load_n(&p.parent.nodes_running, __ATOMIC_RELAXED)); + + rrdset_done(st_nodes); + } + + if(extended) { + chart_by_reason( + &p.parent.events_by_reason, + "streaming_rejections_inbound", + "netdata.streaming_events_inbound", + "Inbound Streaming Events", + 130151); + 
chart_by_reason( + &p.parent.disconnects_by_reason, + "streaming_disconnects_inbound", + "netdata.streaming_events_inbound", + "Inbound Streaming Events", + 130151); + } + } + + if(stream_conf_is_child()) { + { + static RRDSET *st_nodes = NULL; + static RRDDIM *rd_pending = NULL; + static RRDDIM *rd_connecting = NULL; + static RRDDIM *rd_offline = NULL; + static RRDDIM *rd_waiting = NULL; + static RRDDIM *rd_replicating = NULL; + static RRDDIM *rd_running = NULL; + static RRDDIM *rd_no_dst = NULL; + + if (unlikely(!st_nodes)) { + st_nodes = rrdset_create_localhost( + "netdata" + , "streaming_outbound" + , NULL + , "Streaming" + , "netdata.streaming_outbound" + , "Outbound Nodes" + , "nodes" + , "netdata" + , "pulse" + , 130151 + , localhost->rrd_update_every + , RRDSET_TYPE_LINE + ); + + rd_connecting = rrddim_add(st_nodes, "connecting", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_pending = rrddim_add(st_nodes, "pending", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_offline = rrddim_add(st_nodes, "offline", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_waiting = rrddim_add(st_nodes, "waiting", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_replicating = rrddim_add(st_nodes, "replicating", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_running = rrddim_add(st_nodes, "running", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_no_dst = rrddim_add(st_nodes, "no dst", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + } + + rrddim_set_by_pointer(st_nodes, rd_connecting, (collected_number)__atomic_load_n(&p.sender.nodes_connecting, __ATOMIC_RELAXED)); + rrddim_set_by_pointer(st_nodes, rd_pending, (collected_number)__atomic_load_n(&p.sender.nodes_pending, __ATOMIC_RELAXED)); + rrddim_set_by_pointer(st_nodes, rd_offline, (collected_number)__atomic_load_n(&p.sender.nodes_offline, __ATOMIC_RELAXED)); + rrddim_set_by_pointer(st_nodes, rd_waiting, (collected_number)__atomic_load_n(&p.sender.nodes_waiting, __ATOMIC_RELAXED)); + rrddim_set_by_pointer(st_nodes, rd_replicating, 
(collected_number)__atomic_load_n(&p.sender.nodes_replicating, __ATOMIC_RELAXED)); + rrddim_set_by_pointer(st_nodes, rd_running, (collected_number)__atomic_load_n(&p.sender.nodes_running, __ATOMIC_RELAXED)); + rrddim_set_by_pointer(st_nodes, rd_no_dst, (collected_number)__atomic_load_n(&p.sender.nodes_no_dst, __ATOMIC_RELAXED)); + + rrdset_done(st_nodes); + } + + if(extended) { + chart_by_reason( + &p.sender.stream_info_failed_by_reason, + "streaming_info_failed_outbound", + "netdata.streaming_events_outbound", + "Outbound Streaming Events", + 130152); + chart_by_reason( + &p.sender.events_by_reason, + "streaming_rejections_outbound", + "netdata.streaming_events_outbound", + "Outbound Streaming Events", + 130152); + chart_by_reason( + &p.sender.disconnects_by_reason, + "streaming_disconnects_outbound", + "netdata.streaming_events_outbound", + "Outbound Streaming Events", + 130152); + } + } +} diff --git a/src/daemon/pulse/pulse-parents.h b/src/daemon/pulse/pulse-parents.h new file mode 100644 index 00000000000000..ae1e5216e83140 --- /dev/null +++ b/src/daemon/pulse/pulse-parents.h @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_PULSE_PARENTS_H +#define NETDATA_PULSE_PARENTS_H + +#include "libnetdata/libnetdata.h" +#include "streaming/stream-handshake.h" + +typedef enum { + PULSE_HOST_STATUS_NONE = 0, + PULSE_HOST_STATUS_LOCAL = (1 << 0), + PULSE_HOST_STATUS_VIRTUAL = (1 << 1), + PULSE_HOST_STATUS_LOADING = (1 << 2), + PULSE_HOST_STATUS_ARCHIVED = (1 << 3), + PULSE_HOST_STATUS_RCV_OFFLINE = (1 << 4), + PULSE_HOST_STATUS_RCV_WAITING = (1 << 5), + PULSE_HOST_STATUS_RCV_REPLICATING = (1 << 6), + PULSE_HOST_STATUS_RCV_RUNNING = (1 << 7), + PULSE_HOST_STATUS_SND_OFFLINE = (1 << 8), + PULSE_HOST_STATUS_SND_PENDING = (1 << 9), + PULSE_HOST_STATUS_SND_CONNECTING = (1 << 10), + PULSE_HOST_STATUS_SND_NO_DST = (1 << 11), + PULSE_HOST_STATUS_SND_WAITING = (1 << 12), + PULSE_HOST_STATUS_SND_REPLICATING = (1 << 13), + 
PULSE_HOST_STATUS_SND_RUNNING = (1 << 14), + PULSE_HOST_STATUS_DELETED = (1 << 15), +} PULSE_HOST_STATUS; + +void pulse_host_status(RRDHOST *host, PULSE_HOST_STATUS status, STREAM_HANDSHAKE reason); + +// receiver events + +void pulse_parent_stream_info_received_request(void); +void pulse_parent_receiver_request(void); +void pulse_parent_receiver_rejected(STREAM_HANDSHAKE reason); + +// sender +void pulse_stream_info_sent_request(void); +void pulse_sender_stream_info_failed(const char *destination __maybe_unused, STREAM_HANDSHAKE reason); +void pulse_sender_connection_failed(const char *destination __maybe_unused, STREAM_HANDSHAKE reason); + +void pulse_parents_do(bool extended); + +#endif //NETDATA_PULSE_PARENTS_H diff --git a/src/daemon/pulse/pulse.c b/src/daemon/pulse/pulse.c index c5f8c5e22665d5..970e023681e280 100644 --- a/src/daemon/pulse/pulse.c +++ b/src/daemon/pulse/pulse.c @@ -19,8 +19,9 @@ #define WORKER_JOB_REGISTRY 13 #define WORKER_JOB_ARAL 14 #define WORKER_JOB_NETWORK 15 +#define WORKER_JOB_PARENTS 16 -#if WORKER_UTILIZATION_MAX_JOB_TYPES < 16 +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 17 #error "WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 14" #endif @@ -46,6 +47,7 @@ static void pulse_register_workers(void) { worker_register_job_name(WORKER_JOB_REGISTRY, "registry"); worker_register_job_name(WORKER_JOB_ARAL, "aral"); worker_register_job_name(WORKER_JOB_NETWORK, "network"); + worker_register_job_name(WORKER_JOB_PARENTS, "parents"); } static void pulse_cleanup(void *pptr) @@ -144,6 +146,9 @@ void *pulse_thread_main(void *ptr) { worker_is_busy(WORKER_JOB_ARAL); pulse_aral_do(pulse_extended_enabled); + worker_is_busy(WORKER_JOB_PARENTS); + pulse_parents_do(pulse_extended_enabled); + // keep this last to have access to the memory counters // exposed by everyone else worker_is_busy(WORKER_JOB_DAEMON); diff --git a/src/daemon/pulse/pulse.h b/src/daemon/pulse/pulse.h index b5245af7e5e483..dc9369750ce2c2 100644 --- a/src/daemon/pulse/pulse.h +++ 
b/src/daemon/pulse/pulse.h @@ -26,6 +26,7 @@ extern bool pulse_extended_enabled; #include "pulse-trace-allocations.h" #include "pulse-aral.h" #include "pulse-network.h" +#include "pulse-parents.h" void *pulse_thread_main(void *ptr); void *pulse_thread_sqlite3_main(void *ptr); diff --git a/src/database/rrd.c b/src/database/rrd.c index 2c952d6bcb5fd7..cf306dff00110d 100644 --- a/src/database/rrd.c +++ b/src/database/rrd.c @@ -147,6 +147,7 @@ int rrd_init(const char *hostname, struct rrdhost_system_info *system_info, bool rrdhost_flag_set(localhost, RRDHOST_FLAG_COLLECTOR_ONLINE); object_state_activate(&localhost->state_id); + pulse_host_status(localhost, 0, 0); // this will detect the receiver status ml_host_start(localhost); dyncfg_host_init(localhost); diff --git a/src/database/rrdhost.c b/src/database/rrdhost.c index 8cd9b53ab4dc08..e257d5c5666ac3 100644 --- a/src/database/rrdhost.c +++ b/src/database/rrdhost.c @@ -810,7 +810,7 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) { stream_sender_structures_free(host); if (netdata_exit || force) - stream_receiver_signal_to_stop_and_wait(host, STREAM_HANDSHAKE_DISCONNECT_HOST_CLEANUP); + stream_receiver_signal_to_stop_and_wait(host, STREAM_HANDSHAKE_SND_DISCONNECT_HOST_CLEANUP); // ------------------------------------------------------------------------ @@ -867,6 +867,7 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) { string_freez(host->hostname); __atomic_sub_fetch(&netdata_buffers_statistics.rrdhost_allocations_size, sizeof(RRDHOST), __ATOMIC_RELAXED); + pulse_host_status(host, PULSE_HOST_STATUS_DELETED, 0); freez(host); } diff --git a/src/database/sqlite/sqlite_aclk.c b/src/database/sqlite/sqlite_aclk.c index 0bd4a0c02791c1..53589e3413d682 100644 --- a/src/database/sqlite/sqlite_aclk.c +++ b/src/database/sqlite/sqlite_aclk.c @@ -204,6 +204,8 @@ static int create_host_callback(void *data, int argc, char **argv, char **column host->rrdlabels = 
sql_load_host_labels((nd_uuid_t *)argv[IDX_HOST_ID]); host->stream.snd.status.last_connected = last_connected; + + pulse_host_status(host, 0, 0); // this will detect the receiver status } (*number_of_chidren)++; diff --git a/src/database/sqlite/sqlite_metadata.c b/src/database/sqlite/sqlite_metadata.c index f97c695d42822b..60c141cc2337ad 100644 --- a/src/database/sqlite/sqlite_metadata.c +++ b/src/database/sqlite/sqlite_metadata.c @@ -1918,6 +1918,7 @@ static void restore_host_context(void *arg) nd_log_daemon(NDLP_DEBUG, "Contexts for host %s loaded in %s", rrdhost_hostname(host), load_duration); rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_CONTEXT_LOAD); + pulse_host_status(host, 0, 0); // this will detect the receiver status aclk_queue_node_info(host, false); diff --git a/src/libnetdata/socket/nd-sock.c b/src/libnetdata/socket/nd-sock.c index d8b5ebabf6f649..58250b0d3ab94d 100644 --- a/src/libnetdata/socket/nd-sock.c +++ b/src/libnetdata/socket/nd-sock.c @@ -3,19 +3,19 @@ #include "libnetdata/libnetdata.h" ENUM_STR_MAP_DEFINE(ND_SOCK_ERROR) = { - { .id = ND_SOCK_ERR_NONE, .name = "NONE", }, - { .id = ND_SOCK_ERR_CONNECTION_REFUSED, .name = "CONNECTION REFUSED", }, - { .id = ND_SOCK_ERR_CANNOT_RESOLVE_HOSTNAME, .name = "CANNOT RESOLVE HOSTNAME", }, - { .id = ND_SOCK_ERR_FAILED_TO_CREATE_SOCKET, .name = "FAILED TO CREATE SOCKET", }, - { .id = ND_SOCK_ERR_NO_HOST_IN_DEFINITION, .name = "NO HOST IN DEFINITION", }, - { .id = ND_SOCK_ERR_POLL_ERROR, .name = "POLL ERROR", }, - { .id = ND_SOCK_ERR_TIMEOUT, .name = "TIMEOUT", }, - { .id = ND_SOCK_ERR_SSL_CANT_ESTABLISH_SSL_CONNECTION, .name = "CANT ESTABLISH SSL CONNECTION", }, - { .id = ND_SOCK_ERR_SSL_INVALID_CERTIFICATE, .name = "INVALID SSL CERTIFICATE", }, - { .id = ND_SOCK_ERR_SSL_FAILED_TO_OPEN, .name = "FAILED TO OPEN SSL", }, - { .id = ND_SOCK_ERR_THREAD_CANCELLED, .name = "THREAD CANCELLED", }, - { .id = ND_SOCK_ERR_NO_DESTINATION_AVAILABLE, .name = "NO PARENT AVAILABLE", }, - { .id = ND_SOCK_ERR_UNKNOWN_ERROR, 
.name = "UNKNOWN ERROR", }, + { .id = ND_SOCK_ERR_NONE, .name = "no socket error", }, + { .id = ND_SOCK_ERR_CONNECTION_REFUSED, .name = "connection refused", }, + { .id = ND_SOCK_ERR_CANNOT_RESOLVE_HOSTNAME, .name = "cannot resolve hostname", }, + { .id = ND_SOCK_ERR_FAILED_TO_CREATE_SOCKET, .name = "cannot create socket", }, + { .id = ND_SOCK_ERR_NO_HOST_IN_DEFINITION, .name = "no host in definition", }, + { .id = ND_SOCK_ERR_POLL_ERROR, .name = "socket poll() error", }, + { .id = ND_SOCK_ERR_TIMEOUT, .name = "timeout", }, + { .id = ND_SOCK_ERR_SSL_CANT_ESTABLISH_SSL_CONNECTION, .name = "cannot establish SSL connection", }, + { .id = ND_SOCK_ERR_SSL_INVALID_CERTIFICATE, .name = "invalid SSL certification", }, + { .id = ND_SOCK_ERR_SSL_FAILED_TO_OPEN, .name = "failed to open SSL", }, + { .id = ND_SOCK_ERR_THREAD_CANCELLED, .name = "thread cancelled", }, + { .id = ND_SOCK_ERR_NO_DESTINATION_AVAILABLE, .name = "no destination available", }, + { .id = ND_SOCK_ERR_UNKNOWN_ERROR, .name = "unknown error", }, // terminator { .name = NULL, .id = 0 } diff --git a/src/libnetdata/socket/nd-sock.h b/src/libnetdata/socket/nd-sock.h index 995f9e00c1792d..75caf0cd5cb881 100644 --- a/src/libnetdata/socket/nd-sock.h +++ b/src/libnetdata/socket/nd-sock.h @@ -19,6 +19,9 @@ typedef enum __attribute__((packed)) { ND_SOCK_ERR_THREAD_CANCELLED, ND_SOCK_ERR_NO_DESTINATION_AVAILABLE, ND_SOCK_ERR_UNKNOWN_ERROR, + + // terminator + ND_SOCK_ERR_MAX, } ND_SOCK_ERROR; ENUM_STR_DEFINE_FUNCTIONS_EXTERN(ND_SOCK_ERROR); diff --git a/src/libnetdata/uuid/uuid.h b/src/libnetdata/uuid/uuid.h index 7437f9331bb41b..c0cee1c0d9305f 100644 --- a/src/libnetdata/uuid/uuid.h +++ b/src/libnetdata/uuid/uuid.h @@ -35,6 +35,7 @@ ND_UUID_DEFINE(health_alert_notification_msgid, 0x6d, 0xb0, 0x01, 0x8e, 0x83, 0x ND_UUID_DEFINE(sensors_state_transition_msgid, 0x8d, 0xda, 0xf5, 0xba, 0x33, 0xa7, 0x40, 0x78, 0xb6, 0x09, 0x25, 0x0d, 0xb1, 0xe9, 0x51, 0xf3); ND_UUID_DEFINE(log_flood_protection_msgid, 0xec, 0x87, 0xa5, 
0x61, 0x20, 0xd5, 0x43, 0x1b, 0xac, 0xe5, 0x1e, 0x2f, 0xb8, 0xbb, 0xa2, 0x43); ND_UUID_DEFINE(netdata_startup_msgid, 0x1e, 0x60, 0x61, 0xa9, 0xfb, 0xd4, 0x45, 0x01, 0xb3, 0xcc, 0xc3, 0x68, 0x11, 0x9f, 0x2b, 0x69); +ND_UUID_DEFINE(aclk_connection_msgid, 0xac, 0xb3, 0x3c, 0xb9, 0x57, 0x78, 0x47, 0x6b, 0xaa, 0xc7, 0x02, 0xeb, 0x7e, 0x4e, 0x15, 0x1d); ND_UUID UUID_generate_from_hash(const void *payload, size_t payload_len); diff --git a/src/plugins.d/pluginsd_parser.c b/src/plugins.d/pluginsd_parser.c index 71a55b171c9638..c028d1be027bd9 100644 --- a/src/plugins.d/pluginsd_parser.c +++ b/src/plugins.d/pluginsd_parser.c @@ -207,6 +207,7 @@ static inline PARSER_RC pluginsd_host_define_end(char **words __maybe_unused, si object_state_activate(&host->state_id); ml_host_start(host); dyncfg_host_init(host); + pulse_host_status(host, 0, 0); // this will detect the receiver status if(host->rrdlabels) { rrdlabels_migrate_to_these(host->rrdlabels, parser->user.host_define.rrdlabels); diff --git a/src/plugins.d/pluginsd_parser.h b/src/plugins.d/pluginsd_parser.h index 26053010d21840..e165275bd6baaf 100644 --- a/src/plugins.d/pluginsd_parser.h +++ b/src/plugins.d/pluginsd_parser.h @@ -9,10 +9,10 @@ #include "streaming/stream-receiver-internals.h" #endif -#define WORKER_PARSER_FIRST_JOB 36 +#define WORKER_PARSER_FIRST_JOB 35 // this has to be in-sync with the same at stream-thread.c -#define WORKER_RECEIVER_JOB_REPLICATION_COMPLETION 25 +#define WORKER_RECEIVER_JOB_REPLICATION_COMPLETION 24 // this controls the max response size of a function #define PLUGINSD_MAX_DEFERRED_SIZE (100 * 1024 * 1024) diff --git a/src/plugins.d/pluginsd_replication.c b/src/plugins.d/pluginsd_replication.c index ef16348d7fcfac..5d93180cebee4f 100644 --- a/src/plugins.d/pluginsd_replication.c +++ b/src/plugins.d/pluginsd_replication.c @@ -57,7 +57,9 @@ PARSER_RC pluginsd_chart_definition_end(char **words, size_t num_words, PARSER * st, RRDSET_FLAG_RECEIVER_REPLICATION_IN_PROGRESS, 
RRDSET_FLAG_RECEIVER_REPLICATION_FINISHED); if(!(old & RRDSET_FLAG_RECEIVER_REPLICATION_IN_PROGRESS)) { - rrdhost_receiver_replicating_charts_plus_one(st->rrdhost); + if(rrdhost_receiver_replicating_charts_plus_one(st->rrdhost) == 1) + pulse_host_status(host, PULSE_HOST_STATUS_RCV_REPLICATING, 0); + __atomic_add_fetch(&host->stream.rcv.status.replication.counter_in, 1, __ATOMIC_RELAXED); #ifdef REPLICATION_TRACKING @@ -454,9 +456,10 @@ PARSER_RC pluginsd_replay_end(char **words, size_t num_words, PARSER *parser) { st, RRDSET_FLAG_RECEIVER_REPLICATION_FINISHED, RRDSET_FLAG_RECEIVER_REPLICATION_IN_PROGRESS | RRDSET_FLAG_SYNC_CLOCK); - if(!(old & RRDSET_FLAG_RECEIVER_REPLICATION_FINISHED)) - rrdhost_receiver_replicating_charts_minus_one(st->rrdhost); - + if(!(old & RRDSET_FLAG_RECEIVER_REPLICATION_FINISHED)) { + if(rrdhost_receiver_replicating_charts_minus_one(st->rrdhost) == 0) + pulse_host_status(host, PULSE_HOST_STATUS_RCV_RUNNING, 0); + } else nd_log(NDLS_DAEMON, NDLP_WARNING, "PLUGINSD REPLAY ERROR: 'host:%s/chart:%s' got a " PLUGINSD_KEYWORD_REPLAY_END " " diff --git a/src/streaming/protocol/command-chart-definition.c b/src/streaming/protocol/command-chart-definition.c index 99dc9ec77707d1..03e9bf6027fe42 100644 --- a/src/streaming/protocol/command-chart-definition.c +++ b/src/streaming/protocol/command-chart-definition.c @@ -117,8 +117,10 @@ bool stream_sender_send_rrdset_definition(BUFFER *wb, RRDSET *st) { (unsigned long long)now); RRDSET_FLAGS old = rrdset_flag_set_and_clear(st, RRDSET_FLAG_SENDER_REPLICATION_IN_PROGRESS, RRDSET_FLAG_SENDER_REPLICATION_FINISHED); - if(!(old & RRDSET_FLAG_SENDER_REPLICATION_IN_PROGRESS)) - rrdhost_sender_replicating_charts_plus_one(st->rrdhost); + if(!(old & RRDSET_FLAG_SENDER_REPLICATION_IN_PROGRESS)) { + if(rrdhost_sender_replicating_charts_plus_one(st->rrdhost) == 1) + pulse_host_status(st->rrdhost, PULSE_HOST_STATUS_SND_REPLICATING, 0); + } replication_progress = true; diff --git a/src/streaming/stream-connector.c 
b/src/streaming/stream-connector.c index 01f4399d8cbc10..77f67c7aa4d529 100644 --- a/src/streaming/stream-connector.c +++ b/src/streaming/stream-connector.c @@ -4,7 +4,6 @@ static struct { const char *response; - const char *status; size_t length; int32_t version; bool dynamic; @@ -16,7 +15,6 @@ static struct { { .response = START_STREAMING_PROMPT_VN, .length = sizeof(START_STREAMING_PROMPT_VN) - 1, - .status = STREAM_STATUS_CONNECTED, .version = STREAM_HANDSHAKE_OK_V3, // and above .dynamic = true, // dynamic = we will parse the version / capabilities .error = NULL, @@ -27,7 +25,6 @@ static struct { { .response = START_STREAMING_PROMPT_V2, .length = sizeof(START_STREAMING_PROMPT_V2) - 1, - .status = STREAM_STATUS_CONNECTED, .version = STREAM_HANDSHAKE_OK_V2, .dynamic = false, .error = NULL, @@ -38,7 +35,6 @@ static struct { { .response = START_STREAMING_PROMPT_V1, .length = sizeof(START_STREAMING_PROMPT_V1) - 1, - .status = STREAM_STATUS_CONNECTED, .version = STREAM_HANDSHAKE_OK_V1, .dynamic = false, .error = NULL, @@ -49,8 +45,7 @@ static struct { { .response = START_STREAMING_ERROR_SAME_LOCALHOST, .length = sizeof(START_STREAMING_ERROR_SAME_LOCALHOST) - 1, - .status = STREAM_STATUS_LOCALHOST, - .version = STREAM_HANDSHAKE_ERROR_LOCALHOST, + .version = STREAM_HANDSHAKE_PARENT_IS_LOCALHOST, .dynamic = false, .error = "remote server rejected this stream, the host we are trying to stream is its localhost", .worker_job_id = WORKER_SENDER_CONNECTOR_JOB_DISCONNECT_BAD_HANDSHAKE, @@ -60,8 +55,7 @@ static struct { { .response = START_STREAMING_ERROR_ALREADY_STREAMING, .length = sizeof(START_STREAMING_ERROR_ALREADY_STREAMING) - 1, - .status = STREAM_STATUS_ALREADY_CONNECTED, - .version = STREAM_HANDSHAKE_ERROR_ALREADY_CONNECTED, + .version = STREAM_HANDSHAKE_PARENT_NODE_ALREADY_CONNECTED, .dynamic = false, .error = "remote server rejected this stream, the host we are trying to stream is already streamed to it", .worker_job_id = 
WORKER_SENDER_CONNECTOR_JOB_DISCONNECT_BAD_HANDSHAKE, @@ -71,8 +65,7 @@ static struct { { .response = START_STREAMING_ERROR_NOT_PERMITTED, .length = sizeof(START_STREAMING_ERROR_NOT_PERMITTED) - 1, - .status = STREAM_STATUS_PERMISSION_DENIED, - .version = STREAM_HANDSHAKE_ERROR_DENIED, + .version = STREAM_HANDSHAKE_PARENT_DENIED_ACCESS, .dynamic = false, .error = "remote server denied access, probably we don't have the right API key?", .worker_job_id = WORKER_SENDER_CONNECTOR_JOB_DISCONNECT_BAD_HANDSHAKE, @@ -82,8 +75,7 @@ static struct { { .response = START_STREAMING_ERROR_BUSY_TRY_LATER, .length = sizeof(START_STREAMING_ERROR_BUSY_TRY_LATER) - 1, - .status = STREAM_STATUS_RATE_LIMIT, - .version = STREAM_HANDSHAKE_BUSY_TRY_LATER, + .version = STREAM_HANDSHAKE_PARENT_BUSY_TRY_LATER, .dynamic = false, .error = "remote server is currently busy, we should try later", .worker_job_id = WORKER_SENDER_CONNECTOR_JOB_DISCONNECT_BAD_HANDSHAKE, @@ -93,8 +85,7 @@ static struct { { .response = START_STREAMING_ERROR_INTERNAL_ERROR, .length = sizeof(START_STREAMING_ERROR_INTERNAL_ERROR) - 1, - .status = STREAM_STATUS_INTERNAL_SERVER_ERROR, - .version = STREAM_HANDSHAKE_INTERNAL_ERROR, + .version = STREAM_HANDSHAKE_PARENT_INTERNAL_ERROR, .dynamic = false, .error = "remote server is encountered an internal error, we should try later", .worker_job_id = WORKER_SENDER_CONNECTOR_JOB_DISCONNECT_BAD_HANDSHAKE, @@ -104,8 +95,7 @@ static struct { { .response = START_STREAMING_ERROR_INITIALIZATION, .length = sizeof(START_STREAMING_ERROR_INITIALIZATION) - 1, - .status = STREAM_STATUS_INITIALIZATION_IN_PROGRESS, - .version = STREAM_HANDSHAKE_INITIALIZATION, + .version = STREAM_HANDSHAKE_PARENT_IS_INITIALIZING, .dynamic = false, .error = "remote server is initializing, we should try later", .worker_job_id = WORKER_SENDER_CONNECTOR_JOB_DISCONNECT_BAD_HANDSHAKE, @@ -117,8 +107,7 @@ static struct { { .response = NULL, .length = 0, - .status = STREAM_STATUS_BAD_HANDSHAKE, - .version = 
STREAM_HANDSHAKE_ERROR_BAD_HANDSHAKE, + .version = STREAM_HANDSHAKE_CONNECT_HANDSHAKE_FAILED, .dynamic = false, .error = "remote node response is not understood, is it Netdata?", .worker_job_id = WORKER_SENDER_CONNECTOR_JOB_DISCONNECT_BAD_HANDSHAKE, @@ -216,7 +205,7 @@ static int stream_connect_upgrade_prelude(RRDHOST *host __maybe_unused, struct s static bool stream_connect_validate_first_response(RRDHOST *host, struct sender_state *s, char *http, size_t http_length) { - int32_t version = STREAM_HANDSHAKE_ERROR_BAD_HANDSHAKE; + int32_t version = STREAM_HANDSHAKE_CONNECT_HANDSHAKE_FAILED; int i; for(i = 0; stream_responses[i].response ; i++) { @@ -235,7 +224,7 @@ stream_connect_validate_first_response(RRDHOST *host, struct sender_state *s, ch } if(version >= STREAM_HANDSHAKE_OK_V1) { - stream_parent_set_reconnect_delay(host->stream.snd.parents.current, STREAM_HANDSHAKE_CONNECTED, + stream_parent_set_reconnect_delay(host->stream.snd.parents.current, STREAM_HANDSHAKE_SP_CONNECTED, stream_send.parents.reconnect_delay_s); s->capabilities = convert_stream_version_to_capabilities(version, host, true); return true; @@ -243,7 +232,6 @@ stream_connect_validate_first_response(RRDHOST *host, struct sender_state *s, ch ND_LOG_FIELD_PRIORITY priority = stream_responses[i].priority; const char *error = stream_responses[i].error; - const char *status = stream_responses[i].status; int worker_job_id = stream_responses[i].worker_job_id; int delay = stream_responses[i].postpone_reconnect_seconds; @@ -251,7 +239,7 @@ stream_connect_validate_first_response(RRDHOST *host, struct sender_state *s, ch stream_parent_set_reconnect_delay(host->stream.snd.parents.current, version, delay); ND_LOG_STACK lgs[] = { - ND_LOG_FIELD_TXT(NDF_RESPONSE_CODE, status), + ND_LOG_FIELD_I64(NDF_RESPONSE_CODE, stream_handshake_error_to_response_code(version)), ND_LOG_FIELD_END(), }; ND_LOG_STACK_PUSH(lgs); @@ -280,14 +268,19 @@ bool stream_connect(struct sender_state *s, uint16_t default_port, time_t timeou 
s->sock.verify_certificate = netdata_ssl_validate_certificate_sender; s->sock.ctx = netdata_ssl_streaming_sender_ctx; + pulse_host_status(s->host, PULSE_HOST_STATUS_SND_PENDING, 0); if(!stream_parent_connect_to_one( &s->sock, host, default_port, timeout, s->remote_ip, sizeof(s->remote_ip) - 1, &host->stream.snd.parents.current)) { - if(s->sock.error != ND_SOCK_ERR_NO_DESTINATION_AVAILABLE) + if(s->sock.error != ND_SOCK_ERR_NO_DESTINATION_AVAILABLE) { + pulse_host_status(s->host, PULSE_HOST_STATUS_SND_OFFLINE, STREAM_HANDSHAKE_CONNECTION_FAILED); nd_log(NDLS_DAEMON, NDLP_WARNING, "can't connect to a parent, last error: %s", ND_SOCK_ERROR_2str(s->sock.error)); + } + else + pulse_host_status(s->host, PULSE_HOST_STATUS_SND_NO_DST, 0); nd_sock_close(&s->sock); return false; @@ -319,16 +312,10 @@ bool stream_connect(struct sender_state *s, uint16_t default_port, time_t timeou buffer_strcat(wb, "Accept: */*" HTTP_HDR_END); if (s->parent_using_h2o && stream_connect_upgrade_prelude(host, s)) { - ND_LOG_STACK lgs[] = { - ND_LOG_FIELD_TXT(NDF_RESPONSE_CODE, STREAM_STATUS_CANT_UPGRADE_CONNECTION), - ND_LOG_FIELD_END(), - }; - ND_LOG_STACK_PUSH(lgs); - worker_is_busy(WORKER_SENDER_CONNECTOR_JOB_DISCONNECT_CANT_UPGRADE_CONNECTION); nd_sock_close(&s->sock); stream_parent_set_reconnect_delay( - host->stream.snd.parents.current, STREAM_HANDSHAKE_ERROR_HTTP_UPGRADE, 60); + host->stream.snd.parents.current, STREAM_HANDSHAKE_SND_DISCONNECT_HTTP_UPGRADE_FAILED, 60); return false; } @@ -336,7 +323,7 @@ bool stream_connect(struct sender_state *s, uint16_t default_port, time_t timeou ssize_t bytes = nd_sock_send_timeout(&s->sock, (void *)buffer_tostring(wb), len, 0, timeout); if(bytes <= 0) { // timeout is 0 ND_LOG_STACK lgs[] = { - ND_LOG_FIELD_TXT(NDF_RESPONSE_CODE, STREAM_STATUS_TIMEOUT), + ND_LOG_FIELD_I64(NDF_RESPONSE_CODE, stream_handshake_error_to_response_code(STREAM_HANDSHAKE_CONNECT_SEND_TIMEOUT)), ND_LOG_FIELD_END(), }; ND_LOG_STACK_PUSH(lgs); @@ -349,7 +336,7 @@ bool 
stream_connect(struct sender_state *s, uint16_t default_port, time_t timeou rrdhost_hostname(host), s->remote_ip); stream_parent_set_reconnect_delay( - host->stream.snd.parents.current, STREAM_HANDSHAKE_ERROR_SEND_TIMEOUT, 60); + host->stream.snd.parents.current, STREAM_HANDSHAKE_CONNECT_SEND_TIMEOUT, 60); return false; } @@ -359,7 +346,7 @@ bool stream_connect(struct sender_state *s, uint16_t default_port, time_t timeou nd_sock_close(&s->sock); ND_LOG_STACK lgs[] = { - ND_LOG_FIELD_TXT(NDF_RESPONSE_CODE, STREAM_STATUS_TIMEOUT), + ND_LOG_FIELD_I64(NDF_RESPONSE_CODE, stream_handshake_error_to_response_code(STREAM_HANDSHAKE_CONNECT_RECEIVE_TIMEOUT)), ND_LOG_FIELD_END(), }; ND_LOG_STACK_PUSH(lgs); @@ -371,7 +358,7 @@ bool stream_connect(struct sender_state *s, uint16_t default_port, time_t timeou rrdhost_hostname(host), s->remote_ip); stream_parent_set_reconnect_delay( - host->stream.snd.parents.current, STREAM_HANDSHAKE_ERROR_RECEIVE_TIMEOUT, 30); + host->stream.snd.parents.current, STREAM_HANDSHAKE_CONNECT_RECEIVE_TIMEOUT, 30); return false; } @@ -387,7 +374,7 @@ bool stream_connect(struct sender_state *s, uint16_t default_port, time_t timeou log_sender_capabilities(s); ND_LOG_STACK lgs[] = { - ND_LOG_FIELD_TXT(NDF_RESPONSE_CODE, STREAM_STATUS_CONNECTED), + ND_LOG_FIELD_I64(NDF_RESPONSE_CODE, HTTP_RESP_OK), ND_LOG_FIELD_END(), }; ND_LOG_STACK_PUSH(lgs); @@ -466,6 +453,8 @@ void stream_connector_requeue(struct sender_state *s) { SENDERS_SET(&sc->queue.senders, (Word_t)s, s); spinlock_unlock(&sc->queue.spinlock); + pulse_host_status(s->host, PULSE_HOST_STATUS_SND_PENDING, 0); + // signal the connector to catch the job completion_mark_complete_a_job(&sc->completion); } @@ -504,7 +493,9 @@ static void stream_connector_remove(struct sender_state *s) { "STREAM CNT '%s' [to %s]: streaming connector removed host: %s (signaled to stop)", rrdhost_hostname(s->host), s->remote_ip, stream_handshake_error_to_string(s->exit.reason)); - stream_sender_remove(s); + STREAM_HANDSHAKE 
reason = s->exit.reason ? s->exit.reason : STREAM_HANDSHAKE_DISCONNECT_SIGNALED_TO_STOP; + pulse_host_status(s->host, PULSE_HOST_STATUS_SND_OFFLINE, reason); + stream_sender_remove(s, reason); } static void *stream_connector_thread(void *ptr) { diff --git a/src/streaming/stream-handshake.c b/src/streaming/stream-handshake.c index 5d4ef3f567e150..2cf293c978883e 100644 --- a/src/streaming/stream-handshake.c +++ b/src/streaming/stream-handshake.c @@ -5,54 +5,58 @@ static struct { STREAM_HANDSHAKE err; const char *str; + int response_code; } handshake_errors[] = { - {STREAM_HANDSHAKE_OK_V3, "CONNECTED"}, - {STREAM_HANDSHAKE_OK_V2, "CONNECTED"}, - {STREAM_HANDSHAKE_OK_V1, "CONNECTED"}, - {STREAM_HANDSHAKE_NEVER, ""}, - {STREAM_HANDSHAKE_ERROR_BAD_HANDSHAKE, "BAD HANDSHAKE"}, - {STREAM_HANDSHAKE_ERROR_LOCALHOST, "LOCALHOST"}, - {STREAM_HANDSHAKE_ERROR_ALREADY_CONNECTED, "ALREADY CONNECTED"}, - {STREAM_HANDSHAKE_ERROR_DENIED, "DENIED"}, - {STREAM_HANDSHAKE_ERROR_SEND_TIMEOUT, "SEND TIMEOUT"}, - {STREAM_HANDSHAKE_ERROR_RECEIVE_TIMEOUT, "RECEIVE TIMEOUT"}, - {STREAM_HANDSHAKE_ERROR_INVALID_CERTIFICATE, "INVALID CERTIFICATE"}, - {STREAM_HANDSHAKE_ERROR_SSL_ERROR, "SSL ERROR"}, - {STREAM_HANDSHAKE_ERROR_CANT_CONNECT, "CANT CONNECT"}, - {STREAM_HANDSHAKE_BUSY_TRY_LATER, "BUSY TRY LATER"}, - {STREAM_HANDSHAKE_INTERNAL_ERROR, "INTERNAL ERROR"}, - {STREAM_HANDSHAKE_INITIALIZATION, "REMOTE IS INITIALIZING"}, - {STREAM_HANDSHAKE_DISCONNECT_HOST_CLEANUP, "DISCONNECTED HOST CLEANUP"}, - {STREAM_HANDSHAKE_DISCONNECT_STALE_RECEIVER, "DISCONNECTED STALE RECEIVER"}, - {STREAM_HANDSHAKE_DISCONNECT_SHUTDOWN, "DISCONNECTED SHUTDOWN REQUESTED"}, - {STREAM_HANDSHAKE_DISCONNECT_NETDATA_EXIT, "DISCONNECTED NETDATA EXIT"}, - {STREAM_HANDSHAKE_DISCONNECT_PARSER_EXIT, "DISCONNECTED PARSE ENDED"}, - {STREAM_HANDSHAKE_DISCONNECT_UNKNOWN_SOCKET_READ_ERROR, "DISCONNECTED UNKNOWN SOCKET READ ERROR"}, - {STREAM_HANDSHAKE_DISCONNECT_PARSER_FAILED, "DISCONNECTED PARSE ERROR"}, - 
{STREAM_HANDSHAKE_DISCONNECT_RECEIVER_LEFT, "DISCONNECTED RECEIVER LEFT"}, - {STREAM_HANDSHAKE_DISCONNECT_ORPHAN_HOST, "DISCONNECTED ORPHAN HOST"}, - {STREAM_HANDSHAKE_NON_STREAMABLE_HOST, "NON STREAMABLE HOST"}, - {STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_READ_BUFFER, "DISCONNECTED NOT SUFFICIENT READ BUFFER"}, - {STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_SENDER_COMPRESSION_FAILED, "DISCONNECTED SND COMPRESSION FAILED"}, - {STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_SEND_BUFFER, "DISCONNECTED NOT SUFFICIENT SEND BUFFER"}, - {STREAM_HANDSHAKE_DISCONNECT_SOCKET_EOF, "DISCONNECTED SOCKET EOF"}, - {STREAM_HANDSHAKE_DISCONNECT_SOCKET_READ_FAILED, "DISCONNECTED SOCKET READ FAILED"}, - {STREAM_HANDSHAKE_DISCONNECT_SOCKET_TIMEOUT, "DISCONNECTED SOCKET TIMEOUT"}, - {STREAM_HANDSHAKE_DISCONNECT_SOCKET_ERROR, "DISCONNECT SOCKET ERROR"}, - {STREAM_HANDSHAKE_DISCONNECT_SOCKET_WRITE_FAILED, "DISCONNECTED SOCKET WRITE FAILED"}, - {STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE_END, "DISCONNECTED SOCKET CLOSED BY REMOTE END"}, - {STREAM_HANDSHAKE_ERROR_HTTP_UPGRADE, "HTTP UPGRADE ERROR"}, - {STREAM_HANDSHAKE_NO_HOST_IN_DESTINATION, "NO HOST IN DESTINATION - CONFIG ERROR"}, - {STREAM_HANDSHAKE_CONNECT_TIMEOUT, "CONNECT TIMEOUT"}, - {STREAM_HANDSHAKE_CONNECTION_REFUSED, "CONNECTION REFUSED"}, - {STREAM_HANDSHAKE_CANT_RESOLVE_HOSTNAME, "CANT RESOLVE HOSTNAME"}, - {STREAM_HANDSHAKE_PREPARING, "PREPARING"}, - {STREAM_HANDSHAKE_CONNECTING, "CONNECTING"}, - {STREAM_HANDSHAKE_CONNECTED, "CONNECTED"}, - {STREAM_HANDSHAKE_EXITING, "EXITING"}, - {STREAM_HANDSHAKE_NO_STREAM_INFO, "NO STREAM INFO"}, - {STREAM_HANDSHAKE_REPLICATION_STALLED, "REPLICATION STALLED"}, - { 0, NULL }, + {STREAM_HANDSHAKE_OK_V3, "CONNECTED", 200}, + {STREAM_HANDSHAKE_OK_V2, "CONNECTED", 200}, + {STREAM_HANDSHAKE_OK_V1, "CONNECTED", 200}, + {STREAM_HANDSHAKE_NEVER, "", 204}, // No Content + {STREAM_HANDSHAKE_CONNECT_HANDSHAKE_FAILED, "BAD HANDSHAKE", 400}, // Bad Request + 
{STREAM_HANDSHAKE_PARENT_IS_LOCALHOST, "LOCALHOST", 101}, // Switching Protocols + {STREAM_HANDSHAKE_PARENT_NODE_ALREADY_CONNECTED, "ALREADY CONNECTED", 409}, // Conflict + {STREAM_HANDSHAKE_PARENT_DENIED_ACCESS, "DENIED", 403}, // Forbidden + {STREAM_HANDSHAKE_CONNECT_SEND_TIMEOUT, "SEND TIMEOUT", 408}, // Request Timeout + {STREAM_HANDSHAKE_CONNECT_RECEIVE_TIMEOUT, "RECEIVE TIMEOUT", 504}, // Gateway Timeout + {STREAM_HANDSHAKE_CONNECT_INVALID_CERTIFICATE, "INVALID CERTIFICATE", 495}, // Custom: SSL Certificate Error + {STREAM_HANDSHAKE_CONNECT_SSL_ERROR, "SSL ERROR", 525}, // SSL Handshake Failure + {STREAM_HANDSHAKE_CONNECTION_FAILED, "CANT CONNECT", 502}, // Bad Gateway + {STREAM_HANDSHAKE_PARENT_BUSY_TRY_LATER, "BUSY TRY LATER", 503}, // Service Unavailable + {STREAM_HANDSHAKE_PARENT_INTERNAL_ERROR, "INTERNAL ERROR", 500}, // Internal Server Error + {STREAM_HANDSHAKE_PARENT_IS_INITIALIZING, "REMOTE IS INITIALIZING", 102}, // Processing (WebDAV) + + // receiver only codes + {STREAM_HANDSHAKE_RCV_DISCONNECT_PARSER_FAILED, "DISCONNECTED PARSE ERROR", 400}, // Bad Request + {STREAM_HANDSHAKE_RCV_DISCONNECT_STALE_RECEIVER, "DISCONNECTED STALE RECEIVER", 410}, // Gone + {STREAM_HANDSHAKE_RCV_DECOMPRESSION_FAILED, "DISCONNECTED DECOMPRESSION FAILED", 415}, // Unsupported Media Type + + // sender only codes + {STREAM_HANDSHAKE_SND_DISCONNECT_HOST_CLEANUP, "DISCONNECTED HOST CLEANUP", 202}, // Accepted + {STREAM_HANDSHAKE_SND_DISCONNECT_COMPRESSION_FAILED, "DISCONNECTED SND COMPRESSION FAILED", 415}, // Unsupported Media Type + {STREAM_HANDSHAKE_SND_DISCONNECT_HTTP_UPGRADE_FAILED, "HTTP UPGRADE ERROR", 426}, // Upgrade Required + {STREAM_HANDSHAKE_SND_DISCONNECT_RECEIVER_LEFT, "RECEIVER LEFT", 498}, + + // receiver and sender codes + {STREAM_HANDSHAKE_DISCONNECT_SIGNALED_TO_STOP, "DISCONNECTED SIGNALED TO STOP", 499}, // Client Closed Request + {STREAM_HANDSHAKE_DISCONNECT_SHUTDOWN, "DISCONNECTED SHUTDOWN REQUESTED", 503}, // Service Unavailable + 
{STREAM_HANDSHAKE_DISCONNECT_SOCKET_READ_FAILED, "DISCONNECTED SOCKET READ FAILED", 502}, // Bad Gateway + {STREAM_HANDSHAKE_DISCONNECT_SOCKET_WRITE_FAILED, "DISCONNECTED SOCKET WRITE FAILED", 502}, // Bad Gateway + {STREAM_HANDSHAKE_DISCONNECT_SOCKET_ERROR, "DISCONNECT SOCKET ERROR", 500}, // Internal Server Error + {STREAM_HANDSHAKE_DISCONNECT_TIMEOUT, "DISCONNECTED TIMEOUT", 504}, // Gateway Timeout + {STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE, "DISCONNECTED SOCKET CLOSED BY REMOTE END", 499}, // Client Closed Request + {STREAM_HANDSHAKE_DISCONNECT_BUFFER_OVERFLOW, "DISCONNECTED NOT SUFFICIENT SEND BUFFER", 413}, // Payload Too Large + {STREAM_HANDSHAKE_DISCONNECT_REPLICATION_STALLED, "REPLICATION STALLED", 507}, // Insufficient Storage + + // sender (stream parents - SP) failures to connect + {STREAM_HANDSHAKE_SP_PREPARING, "PREPARING", 102}, // Processing (WebDAV) + {STREAM_HANDSHAKE_SP_NO_HOST_IN_DESTINATION, "NO HOST IN DESTINATION - CONFIG ERROR", 404}, // Not Found + {STREAM_HANDSHAKE_SP_CONNECT_TIMEOUT, "CONNECT TIMEOUT", 408}, // Request Timeout + {STREAM_HANDSHAKE_SP_CONNECTION_REFUSED, "CONNECTION REFUSED", 403}, // Forbidden + {STREAM_HANDSHAKE_SP_CANT_RESOLVE_HOSTNAME, "CANT RESOLVE HOSTNAME", 400}, // Bad Request + {STREAM_HANDSHAKE_SP_CONNECTING, "CONNECTING", 102}, // Processing (WebDAV) + {STREAM_HANDSHAKE_SP_CONNECTED, "CONNECTED", 200}, // OK + {STREAM_HANDSHAKE_SP_NO_STREAM_INFO, "NO STREAM INFO", 404}, // Not Found + + { 0, NULL, 0 }, }; const char *stream_handshake_error_to_string(STREAM_HANDSHAKE reason) { @@ -67,3 +71,16 @@ const char *stream_handshake_error_to_string(STREAM_HANDSHAKE reason) { return "UNKNOWN"; } + +int stream_handshake_error_to_response_code(STREAM_HANDSHAKE reason) { + if(reason >= STREAM_HANDSHAKE_OK_V1) + // handshake_error is the whole version / capabilities number + return 200; + + for(size_t i = 0; handshake_errors[i].str ; i++) { + if(reason == handshake_errors[i].err) + return 
handshake_errors[i].response_code; + } + + return 404; +} diff --git a/src/streaming/stream-handshake.h b/src/streaming/stream-handshake.h index e7d502ae434a09..696417be546b3c 100644 --- a/src/streaming/stream-handshake.h +++ b/src/streaming/stream-handshake.h @@ -17,81 +17,65 @@ #define START_STREAMING_ERROR_INTERNAL_ERROR "The server encountered an internal error. Try later." #define START_STREAMING_ERROR_INITIALIZATION "The server is initializing. Try later." -#define STREAM_STATUS_CONNECTED "CONNECTED" -#define STREAM_STATUS_ALREADY_CONNECTED "ALREADY CONNECTED" -#define STREAM_STATUS_DISCONNECTED "DISCONNECTED" -#define STREAM_STATUS_RATE_LIMIT "RATE LIMIT TRY LATER" -#define STREAM_STATUS_INITIALIZATION_IN_PROGRESS "INITIALIZATION IN PROGRESS RETRY LATER" -#define STREAM_STATUS_INTERNAL_SERVER_ERROR "INTERNAL SERVER ERROR DROPPING CONNECTION" -#define STREAM_STATUS_DUPLICATE_RECEIVER "DUPLICATE RECEIVER DROPPING CONNECTION" -#define STREAM_STATUS_CANT_REPLY "CANT REPLY DROPPING CONNECTION" -#define STREAM_STATUS_NO_HOSTNAME "NO HOSTNAME PERMISSION DENIED" -#define STREAM_STATUS_NO_API_KEY "NO API KEY PERMISSION DENIED" -#define STREAM_STATUS_INVALID_API_KEY "INVALID API KEY PERMISSION DENIED" -#define STREAM_STATUS_NO_MACHINE_GUID "NO MACHINE GUID PERMISSION DENIED" -#define STREAM_STATUS_MACHINE_GUID_DISABLED "MACHINE GUID DISABLED PERMISSION DENIED" -#define STREAM_STATUS_INVALID_MACHINE_GUID "INVALID MACHINE GUID PERMISSION DENIED" -#define STREAM_STATUS_API_KEY_DISABLED "API KEY DISABLED PERMISSION DENIED" -#define STREAM_STATUS_NOT_ALLOWED_IP "NOT ALLOWED IP PERMISSION DENIED" -#define STREAM_STATUS_LOCALHOST "LOCALHOST PERMISSION DENIED" -#define STREAM_STATUS_PERMISSION_DENIED "PERMISSION DENIED" -#define STREAM_STATUS_BAD_HANDSHAKE "BAD HANDSHAKE" -#define STREAM_STATUS_TIMEOUT "TIMEOUT" -#define STREAM_STATUS_CANT_UPGRADE_CONNECTION "CANT UPGRADE CONNECTION" - typedef enum { - STREAM_HANDSHAKE_OK_V3 = 3, // v3+ - STREAM_HANDSHAKE_OK_V2 = 2, // v2 - 
STREAM_HANDSHAKE_OK_V1 = 1, // v1 - STREAM_HANDSHAKE_NEVER = 0, // never tried to connect - STREAM_HANDSHAKE_ERROR_BAD_HANDSHAKE = -1, - STREAM_HANDSHAKE_ERROR_LOCALHOST = -2, - STREAM_HANDSHAKE_ERROR_ALREADY_CONNECTED = -3, - STREAM_HANDSHAKE_ERROR_DENIED = -4, - STREAM_HANDSHAKE_ERROR_SEND_TIMEOUT = -5, - STREAM_HANDSHAKE_ERROR_RECEIVE_TIMEOUT = -6, - STREAM_HANDSHAKE_ERROR_INVALID_CERTIFICATE = -7, - STREAM_HANDSHAKE_ERROR_SSL_ERROR = -8, - STREAM_HANDSHAKE_ERROR_CANT_CONNECT = -9, - STREAM_HANDSHAKE_BUSY_TRY_LATER = -10, - STREAM_HANDSHAKE_INTERNAL_ERROR = -11, - STREAM_HANDSHAKE_INITIALIZATION = -12, - STREAM_HANDSHAKE_DISCONNECT_HOST_CLEANUP = -13, - STREAM_HANDSHAKE_DISCONNECT_STALE_RECEIVER = -14, - STREAM_HANDSHAKE_DISCONNECT_SHUTDOWN = -15, - STREAM_HANDSHAKE_DISCONNECT_NETDATA_EXIT = -16, - STREAM_HANDSHAKE_DISCONNECT_PARSER_EXIT = -17, + // negotiated version values + // IMPORTANT: maintain compatibility across netdata version - do not change these + STREAM_HANDSHAKE_OK_V3 = 3, // v3+ + STREAM_HANDSHAKE_OK_V2 = 2, // v2 + STREAM_HANDSHAKE_OK_V1 = 1, // v1 + STREAM_HANDSHAKE_NEVER = 0, // never tried to connect + STREAM_HANDSHAKE_CONNECT_HANDSHAKE_FAILED = -1, // sent by parent - DO NOT CHANGE + STREAM_HANDSHAKE_PARENT_IS_LOCALHOST = -2, // sent by parent - DO NOT CHANGE + STREAM_HANDSHAKE_PARENT_NODE_ALREADY_CONNECTED = -3, // sent by parent - DO NOT CHANGE + STREAM_HANDSHAKE_PARENT_DENIED_ACCESS = -4, // sent by parent - DO NOT CHANGE + STREAM_HANDSHAKE_CONNECT_SEND_TIMEOUT = -5, // generated by child + STREAM_HANDSHAKE_CONNECT_RECEIVE_TIMEOUT = -6, // generated by child + STREAM_HANDSHAKE_CONNECT_INVALID_CERTIFICATE = -7, // generated by child + STREAM_HANDSHAKE_CONNECT_SSL_ERROR = -8, // generated by child + STREAM_HANDSHAKE_CONNECTION_FAILED = -9, // generated by child + STREAM_HANDSHAKE_PARENT_BUSY_TRY_LATER = -10, // sent by parent - DO NOT CHANGE + STREAM_HANDSHAKE_PARENT_INTERNAL_ERROR = -11, // sent by parent - DO NOT CHANGE + 
STREAM_HANDSHAKE_PARENT_IS_INITIALIZING = -12, // sent by parent - DO NOT CHANGE + + // --- internal handshake reasons --- + + // receiver only codes + STREAM_HANDSHAKE_RCV_DISCONNECT_PARSER_FAILED = -13, + STREAM_HANDSHAKE_RCV_DISCONNECT_STALE_RECEIVER = -14, + STREAM_HANDSHAKE_RCV_DECOMPRESSION_FAILED = -15, - STREAM_HANDSHAKE_DISCONNECT_UNKNOWN_SOCKET_READ_ERROR = -18, - STREAM_HANDSHAKE_DISCONNECT_PARSER_FAILED = -19, - STREAM_HANDSHAKE_DISCONNECT_RECEIVER_LEFT = -20, - STREAM_HANDSHAKE_DISCONNECT_ORPHAN_HOST = -21, - STREAM_HANDSHAKE_NON_STREAMABLE_HOST = -22, + // sender only codes + STREAM_HANDSHAKE_SND_DISCONNECT_HOST_CLEANUP = -16, + STREAM_HANDSHAKE_SND_DISCONNECT_COMPRESSION_FAILED = -17, + STREAM_HANDSHAKE_SND_DISCONNECT_HTTP_UPGRADE_FAILED = -18, + STREAM_HANDSHAKE_SND_DISCONNECT_RECEIVER_LEFT = -19, // used only in pulse - STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_READ_BUFFER = -23, - STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_SEND_BUFFER = -25, + // receiver and sender codes + STREAM_HANDSHAKE_DISCONNECT_SIGNALED_TO_STOP = -20, // a fallback when (s|rpt)->exit.reason is not set + STREAM_HANDSHAKE_DISCONNECT_SHUTDOWN = -21, + STREAM_HANDSHAKE_DISCONNECT_SOCKET_READ_FAILED = -22, + STREAM_HANDSHAKE_DISCONNECT_SOCKET_WRITE_FAILED = -23, + STREAM_HANDSHAKE_DISCONNECT_SOCKET_ERROR = -24, + STREAM_HANDSHAKE_DISCONNECT_TIMEOUT = -25, + STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE = -26, + STREAM_HANDSHAKE_DISCONNECT_BUFFER_OVERFLOW = -27, + STREAM_HANDSHAKE_DISCONNECT_REPLICATION_STALLED = -28, - STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_SENDER_COMPRESSION_FAILED = -24, - STREAM_HANDSHAKE_DISCONNECT_SOCKET_EOF = -26, - STREAM_HANDSHAKE_DISCONNECT_SOCKET_READ_FAILED = -27, - STREAM_HANDSHAKE_DISCONNECT_SOCKET_TIMEOUT = -28, - STREAM_HANDSHAKE_DISCONNECT_SOCKET_ERROR = -29, - STREAM_HANDSHAKE_DISCONNECT_SOCKET_WRITE_FAILED = -30, - STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE_END = -31, - STREAM_HANDSHAKE_ERROR_HTTP_UPGRADE = -32, - 
STREAM_HANDSHAKE_NO_HOST_IN_DESTINATION = -33, - STREAM_HANDSHAKE_CONNECT_TIMEOUT = -34, - STREAM_HANDSHAKE_CONNECTION_REFUSED = -35, - STREAM_HANDSHAKE_CANT_RESOLVE_HOSTNAME = -36, - STREAM_HANDSHAKE_PREPARING = -37, - STREAM_HANDSHAKE_CONNECTING = -38, - STREAM_HANDSHAKE_CONNECTED = -39, - STREAM_HANDSHAKE_EXITING = -40, - STREAM_HANDSHAKE_NO_STREAM_INFO = -41, - STREAM_HANDSHAKE_REPLICATION_STALLED = -42, + // sender (stream parents - SP) failures to connect + STREAM_HANDSHAKE_SP_PREPARING = -29, + STREAM_HANDSHAKE_SP_NO_HOST_IN_DESTINATION = -30, + STREAM_HANDSHAKE_SP_CONNECT_TIMEOUT = -31, + STREAM_HANDSHAKE_SP_CONNECTION_REFUSED = -32, + STREAM_HANDSHAKE_SP_CANT_RESOLVE_HOSTNAME = -33, + STREAM_HANDSHAKE_SP_CONNECTING = -34, + STREAM_HANDSHAKE_SP_CONNECTED = -35, + STREAM_HANDSHAKE_SP_NO_STREAM_INFO = -36, + // terminator - keep this positive, bigger than all negative values + STREAM_HANDSHAKE_NEGATIVE_MAX = 37, } STREAM_HANDSHAKE; const char *stream_handshake_error_to_string(STREAM_HANDSHAKE reason); +int stream_handshake_error_to_response_code(STREAM_HANDSHAKE reason); #endif //NETDATA_STREAM_HANDSHAKE_H diff --git a/src/streaming/stream-parents.c b/src/streaming/stream-parents.c index 46752837525b4c..7c7a1d725e4c88 100644 --- a/src/streaming/stream-parents.c +++ b/src/streaming/stream-parents.c @@ -84,7 +84,7 @@ static bool is_a_blocked_parent(STREAM_PARENT *d) { // -------------------------------------------------------------------------------------------------------------------- STREAM_HANDSHAKE stream_parent_get_disconnect_reason(STREAM_PARENT *d) { - if(!d) return STREAM_HANDSHAKE_INTERNAL_ERROR; + if(!d) return STREAM_HANDSHAKE_PARENT_INTERNAL_ERROR; return d->reason; } @@ -105,7 +105,7 @@ static inline usec_t randomize_wait_ut(time_t min, time_t max) { } void rrdhost_stream_parents_reset(RRDHOST *host, STREAM_HANDSHAKE reason) { - usec_t until_ut = randomize_wait_ut(5, stream_send.parents.reconnect_delay_s); + usec_t until_ut = 
randomize_wait_ut(stream_send.parents.reconnect_delay_s / 2, stream_send.parents.reconnect_delay_s + 5); rw_spinlock_write_lock(&host->stream.snd.parents.spinlock); for (STREAM_PARENT *d = host->stream.snd.parents.all; d; d = d->next) { d->postpone_until_ut = until_ut; @@ -233,39 +233,39 @@ void rrdhost_stream_parent_ssl_init(struct sender_state *s) { static void stream_parent_nd_sock_error_to_reason(STREAM_PARENT *d, ND_SOCK *sock) { switch (sock->error) { case ND_SOCK_ERR_CONNECTION_REFUSED: - d->reason = STREAM_HANDSHAKE_CONNECTION_REFUSED; + d->reason = STREAM_HANDSHAKE_SP_CONNECTION_REFUSED; d->postpone_until_ut = randomize_wait_ut(30, 60); block_parent_for_all_nodes(d, 30); break; case ND_SOCK_ERR_CANNOT_RESOLVE_HOSTNAME: - d->reason = STREAM_HANDSHAKE_CANT_RESOLVE_HOSTNAME; + d->reason = STREAM_HANDSHAKE_SP_CANT_RESOLVE_HOSTNAME; d->postpone_until_ut = randomize_wait_ut(30, 60); block_parent_for_all_nodes(d, 30); break; case ND_SOCK_ERR_NO_HOST_IN_DEFINITION: - d->reason = STREAM_HANDSHAKE_NO_HOST_IN_DESTINATION; + d->reason = STREAM_HANDSHAKE_SP_NO_HOST_IN_DESTINATION; d->banned_for_this_session = true; d->postpone_until_ut = randomize_wait_ut(30, 60); block_parent_for_all_nodes(d, 30); break; case ND_SOCK_ERR_TIMEOUT: - d->reason = STREAM_HANDSHAKE_CONNECT_TIMEOUT; + d->reason = STREAM_HANDSHAKE_SP_CONNECT_TIMEOUT; d->postpone_until_ut = randomize_wait_ut(300, d->remote.nodes < 10 ? 
600 : 900); block_parent_for_all_nodes(d, 300); break; case ND_SOCK_ERR_SSL_INVALID_CERTIFICATE: - d->reason = STREAM_HANDSHAKE_ERROR_INVALID_CERTIFICATE; + d->reason = STREAM_HANDSHAKE_CONNECT_INVALID_CERTIFICATE; d->postpone_until_ut = randomize_wait_ut(300, 600); block_parent_for_all_nodes(d, 300); break; case ND_SOCK_ERR_SSL_CANT_ESTABLISH_SSL_CONNECTION: case ND_SOCK_ERR_SSL_FAILED_TO_OPEN: - d->reason = STREAM_HANDSHAKE_ERROR_SSL_ERROR; + d->reason = STREAM_HANDSHAKE_CONNECT_SSL_ERROR; d->postpone_until_ut = randomize_wait_ut(60, 180); block_parent_for_all_nodes(d, 60); break; @@ -274,19 +274,21 @@ static void stream_parent_nd_sock_error_to_reason(STREAM_PARENT *d, ND_SOCK *soc case ND_SOCK_ERR_POLL_ERROR: case ND_SOCK_ERR_FAILED_TO_CREATE_SOCKET: case ND_SOCK_ERR_UNKNOWN_ERROR: - d->reason = STREAM_HANDSHAKE_INTERNAL_ERROR; + d->reason = STREAM_HANDSHAKE_PARENT_INTERNAL_ERROR; d->postpone_until_ut = randomize_wait_ut(30, 60); break; case ND_SOCK_ERR_THREAD_CANCELLED: case ND_SOCK_ERR_NO_DESTINATION_AVAILABLE: - d->reason = STREAM_HANDSHAKE_INTERNAL_ERROR; + d->reason = STREAM_HANDSHAKE_PARENT_INTERNAL_ERROR; d->postpone_until_ut = randomize_wait_ut(30, 60); break; } } int stream_info_to_json_v1(BUFFER *wb, const char *machine_guid) { + pulse_parent_stream_info_received_request(); + buffer_reset(wb); buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_DEFAULT); @@ -386,8 +388,10 @@ static bool stream_info_fetch(STREAM_PARENT *d, const char *uuid, int default_po "STREAM PARENTS '%s': fetching stream info from '%s'...", hostname, string2str(d->destination)); + pulse_stream_info_sent_request(); + // Establish connection - d->reason = STREAM_HANDSHAKE_CONNECTING; + d->reason = STREAM_HANDSHAKE_SP_CONNECTING; if (!nd_sock_connect_to_this(&sock, string2str(d->destination), default_port, 5, ssl)) { d->selection.info = false; stream_parent_nd_sock_error_to_reason(d, &sock); @@ -424,7 +428,7 @@ static bool stream_info_fetch(STREAM_PARENT *d, const char 
*uuid, int default_po "STREAM PARENTS '%s': stream info receive buffer is full while receiving response from '%s'", hostname, string2str(d->destination)); d->selection.info = false; - d->reason = STREAM_HANDSHAKE_INTERNAL_ERROR; + d->reason = STREAM_HANDSHAKE_PARENT_INTERNAL_ERROR; return false; } @@ -465,7 +469,7 @@ static bool stream_info_fetch(STREAM_PARENT *d, const char *uuid, int default_po hostname, string2str(d->destination)); d->selection.info = false; - d->reason = STREAM_HANDSHAKE_INTERNAL_ERROR; + d->reason = STREAM_HANDSHAKE_PARENT_INTERNAL_ERROR; return false; } content_length = strtoul(content_length_ptr + strlen("Content-Length: "), NULL, 10); @@ -475,7 +479,7 @@ static bool stream_info_fetch(STREAM_PARENT *d, const char *uuid, int default_po hostname, string2str(d->destination)); d->selection.info = false; - d->reason = STREAM_HANDSHAKE_INTERNAL_ERROR; + d->reason = STREAM_HANDSHAKE_PARENT_INTERNAL_ERROR; return false; } } @@ -485,7 +489,7 @@ static bool stream_info_fetch(STREAM_PARENT *d, const char *uuid, int default_po CLEAN_JSON_OBJECT *jobj = json_tokener_parse(payload_start); if (!jobj) { d->selection.info = false; - d->reason = STREAM_HANDSHAKE_NO_STREAM_INFO; + d->reason = STREAM_HANDSHAKE_SP_NO_STREAM_INFO; nd_log(NDLS_DAEMON, NDLP_WARNING, "STREAM PARENTS '%s': failed to parse stream info response from '%s', JSON data: %s", hostname, string2str(d->destination), payload_start); @@ -496,7 +500,7 @@ static bool stream_info_fetch(STREAM_PARENT *d, const char *uuid, int default_po if(!stream_info_json_parse_v1(jobj, "", d, error)) { d->selection.info = false; - d->reason = STREAM_HANDSHAKE_NO_STREAM_INFO; + d->reason = STREAM_HANDSHAKE_SP_NO_STREAM_INFO; nd_log(NDLS_DAEMON, NDLP_WARNING, "STREAM PARENTS '%s': failed to extract fields from JSON stream info response from '%s': %s" " - JSON data: %s", @@ -605,7 +609,7 @@ bool stream_parent_connect_to_one_unsafe( switch(d->remote.ingest_type) { case RRDHOST_INGEST_TYPE_VIRTUAL: case 
RRDHOST_INGEST_TYPE_LOCALHOST: - d->reason = STREAM_HANDSHAKE_ERROR_LOCALHOST; + d->reason = STREAM_HANDSHAKE_PARENT_IS_LOCALHOST; d->since_ut = now_ut; d->postpone_until_ut = randomize_wait_ut(3600, 7200); @@ -619,8 +623,10 @@ bool stream_parent_connect_to_one_unsafe( rrdhost_hostname(host), string2str(d->destination)); continue; } - else + else { + pulse_sender_stream_info_failed(string2str(d->destination), d->reason); skip = true; + } break; default: @@ -631,16 +637,17 @@ bool stream_parent_connect_to_one_unsafe( switch(d->remote.ingest_status) { case RRDHOST_INGEST_STATUS_INITIALIZING: - d->reason = STREAM_HANDSHAKE_INITIALIZATION; + d->reason = STREAM_HANDSHAKE_PARENT_IS_INITIALIZING; d->since_ut = now_ut; d->postpone_until_ut = randomize_wait_ut(30, 60); + pulse_sender_stream_info_failed(string2str(d->destination), d->reason); skip = true; break; case RRDHOST_INGEST_STATUS_REPLICATING: case RRDHOST_INGEST_STATUS_ONLINE: - d->reason = STREAM_HANDSHAKE_ERROR_ALREADY_CONNECTED; if(rrdhost_is_host_in_stream_path_before_us(host, d->remote.host_id, host->sender->hops)) { + d->reason = STREAM_HANDSHAKE_PARENT_NODE_ALREADY_CONNECTED; d->since_ut = now_ut; d->postpone_until_ut = randomize_wait_ut(3600, 7200); d->banned_for_this_session = true; @@ -648,8 +655,17 @@ bool stream_parent_connect_to_one_unsafe( nd_log(NDLS_DAEMON, NDLP_INFO, "STREAM PARENTS '%s': destination '%s' is banned for this session, because it is in our path before us.", rrdhost_hostname(host), string2str(d->destination)); + pulse_sender_stream_info_failed(string2str(d->destination), d->reason); continue; } +// else { +// skip = true; +// if(!netdata_conf_is_parent()) { +// nd_log(NDLS_DAEMON, NDLP_INFO, +// "STREAM PARENTS '%s': destination '%s' reports I am already connected.", +// rrdhost_hostname(host), string2str(d->destination)); +// } +// } break; default: @@ -657,6 +673,8 @@ bool stream_parent_connect_to_one_unsafe( break; } } + else + 
pulse_sender_stream_info_failed(string2str(d->destination), d->reason); if(skip) { skipped_but_useful++; @@ -786,6 +804,7 @@ bool stream_parent_connect_to_one_unsafe( d->since_ut = now_ut; d->attempts++; + pulse_host_status(host, PULSE_HOST_STATUS_SND_CONNECTING, 0); if (nd_sock_connect_to_this(sender_sock, string2str(d->destination), default_port, timeout, stream_parent_is_ssl(d))) { @@ -810,6 +829,7 @@ bool stream_parent_connect_to_one_unsafe( } else { stream_parent_nd_sock_error_to_reason(d, sender_sock); + pulse_sender_connection_failed(string2str(d->destination), d->reason); nd_log(NDLS_DAEMON, NDLP_DEBUG, "STREAM PARENTS '%s': stream connection to '%s' failed (default port: %d): %s", rrdhost_hostname(host), diff --git a/src/streaming/stream-receiver-connection.c b/src/streaming/stream-receiver-connection.c index e036831d50bfb9..43c08aea7849e8 100644 --- a/src/streaming/stream-receiver-connection.c +++ b/src/streaming/stream-receiver-connection.c @@ -27,32 +27,37 @@ static void stream_receiver_connected_msg(RRDHOST *host, char *dst, size_t len) } } -void stream_receiver_log_status(struct receiver_state *rpt, const char *msg, const char *status, ND_LOG_FIELD_PRIORITY priority) { +void stream_receiver_log_status(struct receiver_state *rpt, const char *msg, STREAM_HANDSHAKE reason, ND_LOG_FIELD_PRIORITY priority) { // this function may be called BEFORE we spawn the receiver thread // so, we need to add the fields again (it does not harm) ND_LOG_STACK lgs[] = { ND_LOG_FIELD_TXT(NDF_SRC_IP, rpt->remote_ip), ND_LOG_FIELD_TXT(NDF_SRC_PORT, rpt->remote_port), ND_LOG_FIELD_TXT(NDF_NIDL_NODE, (rpt->hostname && *rpt->hostname) ? 
rpt->hostname : ""), - ND_LOG_FIELD_TXT(NDF_RESPONSE_CODE, status), + ND_LOG_FIELD_I64(NDF_RESPONSE_CODE, stream_handshake_error_to_response_code(reason)), ND_LOG_FIELD_UUID(NDF_MESSAGE_ID, &streaming_from_child_msgid), ND_LOG_FIELD_END(), }; ND_LOG_STACK_PUSH(lgs); - nd_log(NDLS_ACCESS, priority, "api_key:'%s' machine_guid:'%s' msg:'%s'" + nd_log(NDLS_ACCESS, priority, "api_key:'%s' machine_guid:'%s' node:'%s' msg:'%s' reason:'%s'" , (rpt->key && *rpt->key)? rpt->key : "" , (rpt->machine_guid && *rpt->machine_guid) ? rpt->machine_guid : "" - , msg); + , (rpt->hostname && *rpt->hostname) ? rpt->hostname : "" + , msg + , stream_handshake_error_to_string(reason)); nd_log(NDLS_DAEMON, priority, "STREAM RCV '%s' [from [%s]:%s]: %s %s%s%s" , (rpt->hostname && *rpt->hostname) ? rpt->hostname : "" , rpt->remote_ip, rpt->remote_port , msg - , rpt->exit.reason != STREAM_HANDSHAKE_NEVER?" (":"" - , stream_handshake_error_to_string(rpt->exit.reason) - , rpt->exit.reason != STREAM_HANDSHAKE_NEVER?")":"" + , reason != STREAM_HANDSHAKE_NEVER?" 
(":"" + , stream_handshake_error_to_string(reason) + , reason != STREAM_HANDSHAKE_NEVER?")":"" ); + + if(reason < 0) + pulse_parent_receiver_rejected(reason); } // -------------------------------------------------------------------------------------------------------------------- @@ -175,7 +180,7 @@ static bool stream_receiver_send_first_response(struct receiver_state *rpt) { stream_receiver_log_status( rpt, "rejecting streaming connection; failed to find or create the required host structure", - STREAM_STATUS_INTERNAL_SERVER_ERROR, NDLP_ERR); + STREAM_HANDSHAKE_PARENT_INTERNAL_ERROR, NDLP_ERR); stream_send_error_on_taken_over_connection(rpt, START_STREAMING_ERROR_INTERNAL_ERROR); return false; @@ -185,7 +190,7 @@ static bool stream_receiver_send_first_response(struct receiver_state *rpt) { stream_receiver_log_status( rpt, "rejecting streaming connection; host is initializing, retry later", - STREAM_STATUS_INITIALIZATION_IN_PROGRESS, NDLP_NOTICE); + STREAM_HANDSHAKE_PARENT_IS_INITIALIZING, NDLP_NOTICE); stream_send_error_on_taken_over_connection(rpt, START_STREAMING_ERROR_INITIALIZATION); return false; @@ -196,7 +201,7 @@ static bool stream_receiver_send_first_response(struct receiver_state *rpt) { // stream_receiver_log_status( // rpt, // "rejecting streaming connection; the system is backfilling higher tiers with high-resolution data, retry later", -// STREAM_STATUS_INITIALIZATION_IN_PROGRESS, NDLP_NOTICE); +// STREAM_HANDSHAKE_PARENT_IS_INITIALIZING, NDLP_NOTICE); // // stream_send_error_on_taken_over_connection(rpt, START_STREAMING_ERROR_INITIALIZATION); // return false; @@ -206,7 +211,7 @@ static bool stream_receiver_send_first_response(struct receiver_state *rpt) { stream_receiver_log_status( rpt, "rejecting streaming connection; host is already served by another receiver", - STREAM_STATUS_DUPLICATE_RECEIVER, NDLP_INFO); + STREAM_HANDSHAKE_PARENT_NODE_ALREADY_CONNECTED, NDLP_INFO); stream_send_error_on_taken_over_connection(rpt, 
START_STREAMING_ERROR_ALREADY_STREAMING); return false; @@ -231,7 +236,7 @@ static bool stream_receiver_send_first_response(struct receiver_state *rpt) { stream_select_receiver_compression_algorithm(rpt); { - // netdata_log_info("STREAM %s [receive from [%s]:%s]: initializing communication...", rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port); + // netdata_log_info("STREAM RCV %s [from [%s]:%s]: initializing communication...", rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port); char initial_response[HTTP_HEADER_SIZE]; if (stream_has_capability(rpt, STREAM_CAP_VCAPS)) { log_receiver_capabilities(rpt); @@ -285,8 +290,8 @@ static bool stream_receiver_send_first_response(struct receiver_state *rpt) { stream_receiver_log_status( rpt, "cannot reply back, dropping connection", - STREAM_STATUS_CANT_REPLY, NDLP_ERR); - rrdhost_clear_receiver(rpt); + STREAM_HANDSHAKE_CONNECT_SEND_TIMEOUT, NDLP_ERR); + rrdhost_clear_receiver(rpt, STREAM_HANDSHAKE_DISCONNECT_SOCKET_WRITE_FAILED); return false; } #ifdef ENABLE_H2O @@ -298,6 +303,7 @@ static bool stream_receiver_send_first_response(struct receiver_state *rpt) { } int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_string, void *h2o_ctx __maybe_unused) { + pulse_parent_receiver_request(); if(!service_running(ABILITY_STREAMING_CONNECTIONS)) return stream_receiver_response_too_busy_now(w); @@ -434,7 +440,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_ stream_receiver_log_status( rpt, "rejecting streaming connection; request without an API key", - STREAM_STATUS_NO_API_KEY, NDLP_WARNING); + STREAM_HANDSHAKE_PARENT_DENIED_ACCESS, NDLP_WARNING); stream_receiver_free(rpt); return stream_receiver_response_permission_denied(w); @@ -444,7 +450,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_ stream_receiver_log_status( rpt, "rejecting streaming connection; request without a hostname", - STREAM_STATUS_NO_HOSTNAME, 
NDLP_WARNING); + STREAM_HANDSHAKE_PARENT_DENIED_ACCESS, NDLP_WARNING); stream_receiver_free(rpt); return stream_receiver_response_permission_denied(w); @@ -457,7 +463,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_ stream_receiver_log_status( rpt, "rejecting streaming connection; request without a machine UUID", - STREAM_STATUS_NO_MACHINE_GUID, NDLP_WARNING); + STREAM_HANDSHAKE_PARENT_DENIED_ACCESS, NDLP_WARNING); stream_receiver_free(rpt); return stream_receiver_response_permission_denied(w); @@ -470,7 +476,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_ stream_receiver_log_status( rpt, "rejecting streaming connection; API key is not a valid UUID (use the command uuidgen to generate one)", - STREAM_STATUS_INVALID_API_KEY, NDLP_WARNING); + STREAM_HANDSHAKE_PARENT_DENIED_ACCESS, NDLP_WARNING); stream_receiver_free(rpt); return stream_receiver_response_permission_denied(w); @@ -480,7 +486,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_ stream_receiver_log_status( rpt, "rejecting streaming connection; machine UUID is not a valid UUID", - STREAM_STATUS_INVALID_MACHINE_GUID, NDLP_WARNING); + STREAM_HANDSHAKE_PARENT_DENIED_ACCESS, NDLP_WARNING); stream_receiver_free(rpt); return stream_receiver_response_permission_denied(w); @@ -491,7 +497,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_ stream_receiver_log_status( rpt, "rejecting streaming connection; API key provided is a machine UUID (did you mix them up?)", - STREAM_STATUS_INVALID_API_KEY, NDLP_WARNING); + STREAM_HANDSHAKE_PARENT_DENIED_ACCESS, NDLP_WARNING); stream_receiver_free(rpt); return stream_receiver_response_permission_denied(w); @@ -503,7 +509,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_ stream_receiver_log_status( rpt, "rejecting streaming connection; API key is not enabled in stream.conf", - 
STREAM_STATUS_API_KEY_DISABLED, NDLP_WARNING); + STREAM_HANDSHAKE_PARENT_DENIED_ACCESS, NDLP_WARNING); stream_receiver_free(rpt); return stream_receiver_response_permission_denied(w); @@ -513,7 +519,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_ stream_receiver_log_status( rpt, "rejecting streaming connection; API key is not allowed from this IP", - STREAM_STATUS_NOT_ALLOWED_IP, NDLP_WARNING); + STREAM_HANDSHAKE_PARENT_DENIED_ACCESS, NDLP_WARNING); stream_receiver_free(rpt); return stream_receiver_response_permission_denied(w); @@ -523,7 +529,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_ stream_receiver_log_status( rpt, "rejecting streaming connection; machine UUID is an API key (did you mix them up?)", - STREAM_STATUS_INVALID_MACHINE_GUID, NDLP_WARNING); + STREAM_HANDSHAKE_PARENT_DENIED_ACCESS, NDLP_WARNING); stream_receiver_free(rpt); return stream_receiver_response_permission_denied(w); @@ -535,7 +541,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_ stream_receiver_log_status( rpt, "rejecting streaming connection; machine UUID is not enabled in stream.conf", - STREAM_STATUS_MACHINE_GUID_DISABLED, NDLP_WARNING); + STREAM_HANDSHAKE_PARENT_DENIED_ACCESS, NDLP_WARNING); stream_receiver_free(rpt); return stream_receiver_response_permission_denied(w); @@ -545,7 +551,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_ stream_receiver_log_status( rpt, "rejecting streaming connection; machine UUID is not allowed from this IP", - STREAM_STATUS_NOT_ALLOWED_IP, NDLP_WARNING); + STREAM_HANDSHAKE_PARENT_DENIED_ACCESS, NDLP_WARNING); stream_receiver_free(rpt); return stream_receiver_response_permission_denied(w); @@ -557,7 +563,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_ stream_receiver_log_status( rpt, "rejecting streaming connection; machine UUID is my own", - STREAM_STATUS_LOCALHOST, 
NDLP_DEBUG); + STREAM_HANDSHAKE_PARENT_IS_LOCALHOST, NDLP_DEBUG); char initial_response[HTTP_HEADER_SIZE + 1]; snprintfz(initial_response, HTTP_HEADER_SIZE, "%s", START_STREAMING_ERROR_SAME_LOCALHOST); @@ -591,7 +597,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_ "rejecting streaming connection; rate limit, will accept new connection in %ld secs", (long)(web_client_streaming_rate_t - (now - last_stream_accepted_t))); - stream_receiver_log_status(rpt, msg, STREAM_STATUS_RATE_LIMIT, NDLP_NOTICE); + stream_receiver_log_status(rpt, msg, STREAM_HANDSHAKE_PARENT_BUSY_TRY_LATER, NDLP_NOTICE); stream_receiver_free(rpt); return stream_receiver_response_too_busy_now(w); @@ -634,13 +640,23 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_ } rrd_rdunlock(); + if (receiver_stale && string_strcmp(host->hostname, rpt->hostname) != 0) { + stream_receiver_log_status( + rpt, + "rejecting streaming connection; machine GUID is connected with a different hostname", + STREAM_HANDSHAKE_PARENT_DENIED_ACCESS, NDLP_WARNING); + + stream_receiver_free(rpt); + return stream_receiver_response_permission_denied(w); + } + if (receiver_stale && - stream_receiver_signal_to_stop_and_wait(host, STREAM_HANDSHAKE_DISCONNECT_STALE_RECEIVER)) { + stream_receiver_signal_to_stop_and_wait(host, STREAM_HANDSHAKE_RCV_DISCONNECT_STALE_RECEIVER)) { // we stopped the receiver // we can proceed with this connection receiver_stale = false; - nd_log_daemon(NDLP_NOTICE, "STREAM '%s' [receive from [%s]:%s]: " + nd_log_daemon(NDLP_NOTICE, "STREAM RCV '%s' [from [%s]:%s]: " "stopped previous stale receiver to accept this one." 
, rpt->hostname , rpt->remote_ip, rpt->remote_port); @@ -652,11 +668,11 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_ char msg[200 + 1]; snprintfz(msg, sizeof(msg) - 1, - "rejecting streaming connection; multiple connections for same host, " + "rejecting streaming connection; multiple connections for the same host, " "old connection was last used %ld secs ago%s", age, receiver_stale ? " (signaled old receiver to stop)" : " (new connection not accepted)"); - stream_receiver_log_status(rpt, msg, STREAM_STATUS_ALREADY_CONNECTED, NDLP_DEBUG); + stream_receiver_log_status(rpt, msg, STREAM_HANDSHAKE_PARENT_NODE_ALREADY_CONNECTED, NDLP_WARNING); // Have not set WEB_CLIENT_FLAG_DONT_CLOSE_SOCKET - caller should clean up buffer_flush(w->response.data); @@ -679,9 +695,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_ char msg[256]; stream_receiver_connected_msg(rpt->host, msg, sizeof(msg)); - stream_receiver_log_status( - rpt, msg, - STREAM_STATUS_CONNECTED, NDLP_INFO); + stream_receiver_log_status(rpt, msg, 0, NDLP_INFO); // in case we have cloud connection we inform cloud a new child connected schedule_node_state_update(rpt->host, 300); @@ -691,7 +705,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_ rrdhost_option_set(rpt->host, RRDHOST_OPTION_EPHEMERAL_HOST); // let it reconnect to parents asap - rrdhost_stream_parents_reset(rpt->host, STREAM_HANDSHAKE_PREPARING); + rrdhost_stream_parents_reset(rpt->host, STREAM_HANDSHAKE_SP_PREPARING); // add it to a stream thread queue stream_receiver_add_to_queue(rpt); diff --git a/src/streaming/stream-receiver-internals.h b/src/streaming/stream-receiver-internals.h index c50b4a8bcc54b3..f07406eb604119 100644 --- a/src/streaming/stream-receiver-internals.h +++ b/src/streaming/stream-receiver-internals.h @@ -112,8 +112,8 @@ struct receiver_state { #endif bool rrdhost_set_receiver(RRDHOST *host, struct receiver_state *rpt); 
-void rrdhost_clear_receiver(struct receiver_state *rpt); -void stream_receiver_log_status(struct receiver_state *rpt, const char *msg, const char *status, ND_LOG_FIELD_PRIORITY priority); +void rrdhost_clear_receiver(struct receiver_state *rpt, STREAM_HANDSHAKE reason); +void stream_receiver_log_status(struct receiver_state *rpt, const char *msg, STREAM_HANDSHAKE reason, ND_LOG_FIELD_PRIORITY priority); void stream_receiver_free(struct receiver_state *rpt); bool stream_receiver_signal_to_stop_and_wait(RRDHOST *host, STREAM_HANDSHAKE reason); diff --git a/src/streaming/stream-receiver.c b/src/streaming/stream-receiver.c index fb47ba3da3f37c..886b2d779df20f 100644 --- a/src/streaming/stream-receiver.c +++ b/src/streaming/stream-receiver.c @@ -65,7 +65,7 @@ void stream_receiver_log_payload(struct receiver_state *rpt, const char *payload } #endif -static void stream_receiver_remove(struct stream_thread *sth, struct receiver_state *rpt, const char *why); +static void stream_receiver_remove(struct stream_thread *sth, struct receiver_state *rpt, STREAM_HANDSHAKE reason); // When a child disconnects this is the maximum we will wait // before we update the cloud that the child is offline @@ -297,8 +297,8 @@ static void receiver_set_exit_reason(struct receiver_state *rpt, STREAM_HANDSHAK } static inline bool receiver_should_stop(struct receiver_state *rpt) { - if(unlikely(__atomic_load_n(&rpt->exit.shutdown, __ATOMIC_RELAXED))) { - receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_SHUTDOWN, false); + if(unlikely(__atomic_load_n(&rpt->exit.shutdown, __ATOMIC_ACQUIRE))) { + receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_SIGNALED_TO_STOP, false); return true; } @@ -331,7 +331,7 @@ void stream_receiver_handle_op(struct stream_thread *sth, struct receiver_state sth->id, rrdhost_hostname(rpt->host), rpt->remote_ip, rpt->remote_port, stats.bytes_size, stats.bytes_max_size, stats.bytes_outstanding, stats.bytes_available); - stream_receiver_remove(sth, rpt, 
"receiver send buffer overflow"); + stream_receiver_remove(sth, rpt, STREAM_HANDSHAKE_DISCONNECT_BUFFER_OVERFLOW); return; } @@ -350,19 +350,23 @@ static ssize_t send_to_child(const char *txt, void *data, STREAM_TRAFFIC_TYPE ty bool was_empty = stats->bytes_outstanding == 0; struct stream_opcode msg = rpt->thread.send_to_child.msg; msg.opcode = STREAM_OPCODE_NONE; + msg.reason = 0; size_t size = strlen(txt); ssize_t rc = (ssize_t)size; if(!stream_circular_buffer_add_unsafe(scb, txt, size, size, type, true)) { // should never happen, because of autoscaling msg.opcode = STREAM_OPCODE_RECEIVER_BUFFER_OVERFLOW; + msg.reason = STREAM_HANDSHAKE_DISCONNECT_BUFFER_OVERFLOW; rc = -1; } else { stream_receiver_log_payload(rpt, txt, type, false); - if(was_empty) + if(was_empty) { msg.opcode = STREAM_OPCODE_RECEIVER_POLLOUT; + msg.reason = 0; + } } spinlock_unlock(&rpt->thread.send_to_child.spinlock); @@ -487,6 +491,8 @@ void stream_receiver_move_to_running_unsafe(struct stream_thread *sth, struct re parser->h2o_ctx = rpt->h2o_ctx; #endif + pulse_host_status(rpt->host, PULSE_HOST_STATUS_RCV_RUNNING, 0); + // keep this last - it needs everything ready since to sends data to the child stream_receiver_send_node_and_claim_id_to_child(rpt->host); } @@ -504,7 +510,7 @@ void stream_receiver_move_entire_queue_to_running_unsafe(struct stream_thread *s } } -static void stream_receiver_remove(struct stream_thread *sth, struct receiver_state *rpt, const char *why) { +static void stream_receiver_remove(struct stream_thread *sth, struct receiver_state *rpt, STREAM_HANDSHAKE reason) { internal_fatal(sth->tid != gettid_cached(), "Function %s() should only be used by the dispatcher thread", __FUNCTION__ ); ND_LOG_STACK lgs[] = { @@ -530,7 +536,7 @@ static void stream_receiver_remove(struct stream_thread *sth, struct receiver_st , rpt->remote_ip ? rpt->remote_ip : "-" , rpt->remote_port ? rpt->remote_port : "-" , count - , why ? 
why : ""); + , stream_handshake_error_to_string(reason)); internal_fatal(META_GET(&sth->run.meta, (Word_t)&rpt->thread.meta) == NULL, "Receiver to be removed is not found in the list of receivers"); @@ -555,16 +561,17 @@ static void stream_receiver_remove(struct stream_thread *sth, struct receiver_st } stream_thread_node_removed(rpt->host); + pulse_host_status(rpt->host, PULSE_HOST_STATUS_RCV_OFFLINE, reason); // set a default exit reason, if not set - receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_PARSER_EXIT, false); + receiver_set_exit_reason(rpt, reason, false); // in case we are connected to netdata cloud, // we inform cloud that a child got disconnected uint64_t total_reboot = rrdhost_stream_path_total_reboot_time_ms(rpt->host); schedule_node_state_update(rpt->host, MIN((total_reboot * MAX_CHILD_DISC_TOLERANCE), MAX_CHILD_DISC_DELAY)); - rrdhost_clear_receiver(rpt); + rrdhost_clear_receiver(rpt, reason); rrdhost_set_is_parent_label(); stream_receiver_free(rpt); @@ -617,8 +624,8 @@ stream_receive_and_process(struct stream_thread *sth, struct receiver_state *rpt while (buffered_reader_next_line(&rpt->thread.uncompressed, rpt->thread.line_buffer)) { if (unlikely(parser_action(parser, rpt->thread.line_buffer->buffer))) { - receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_PARSER_FAILED, false); - stream_receiver_remove(sth, rpt, "parser action failed"); + receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_RCV_DISCONNECT_PARSER_FAILED, false); + stream_receiver_remove(sth, rpt, STREAM_HANDSHAKE_RCV_DISCONNECT_PARSER_FAILED); *removed = true; return -1; } @@ -631,8 +638,8 @@ stream_receive_and_process(struct stream_thread *sth, struct receiver_state *rpt break; else { - receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_PARSER_FAILED, false); - stream_receiver_remove(sth, rpt, "receiver decompressor failed"); + receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_RCV_DISCONNECT_PARSER_FAILED, false); + stream_receiver_remove(sth, rpt, 
STREAM_HANDSHAKE_RCV_DECOMPRESSION_FAILED); *removed = true; return -1; } @@ -641,16 +648,17 @@ stream_receive_and_process(struct stream_thread *sth, struct receiver_state *rpt else if (feed_rc == DECOMPRESS_NEED_MORE_DATA) break; else { - receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_PARSER_FAILED, false); - stream_receiver_remove(sth, rpt, "receiver compressed data invalid"); + receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_RCV_DISCONNECT_PARSER_FAILED, false); + stream_receiver_remove(sth, rpt, STREAM_HANDSHAKE_RCV_DECOMPRESSION_FAILED); *removed = true; return -1; } } if(receiver_should_stop(rpt)) { - receiver_set_exit_reason(rpt, rpt->exit.reason, false); - stream_receiver_remove(sth, rpt, "received stop signal"); + STREAM_HANDSHAKE reason = rpt->exit.reason ? rpt->exit.reason : STREAM_HANDSHAKE_DISCONNECT_SIGNALED_TO_STOP; + receiver_set_exit_reason(rpt, reason, false); + stream_receiver_remove(sth, rpt, reason); *removed = true; return -1; } @@ -662,8 +670,8 @@ stream_receive_and_process(struct stream_thread *sth, struct receiver_state *rpt while(buffered_reader_next_line(&rpt->thread.uncompressed, rpt->thread.line_buffer)) { if(unlikely(parser_action(parser, rpt->thread.line_buffer->buffer))) { - receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_PARSER_FAILED, false); - stream_receiver_remove(sth, rpt, "parser action failed"); + receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_RCV_DISCONNECT_PARSER_FAILED, false); + stream_receiver_remove(sth, rpt, STREAM_HANDSHAKE_RCV_DISCONNECT_PARSER_FAILED); *removed = true; return -1; } @@ -727,32 +735,29 @@ bool stream_receiver_send_data(struct stream_thread *sth, struct receiver_state spinlock_unlock(&rpt->thread.send_to_child.spinlock); if (status == EVLOOP_STATUS_SOCKET_ERROR || status == EVLOOP_STATUS_SOCKET_CLOSED) { - const char *disconnect_reason; STREAM_HANDSHAKE reason; if(status == EVLOOP_STATUS_SOCKET_ERROR) { worker_is_busy(WORKER_STREAM_JOB_DISCONNECT_SEND_ERROR); - disconnect_reason = 
"socket reports error while writing"; reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_WRITE_FAILED; } else /* if(status == EVLOOP_STATUS_SOCKET_CLOSED) */ { worker_is_busy(WORKER_STREAM_JOB_DISCONNECT_REMOTE_CLOSED); - disconnect_reason = "socket reports EOF (closed by child)"; - reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE_END; + reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE; } nd_log(NDLS_DAEMON, NDLP_ERR, "STREAM RCV[%zu] '%s' [from [%s]:%s]: %s (%zd, on fd %d) - closing receiver connection - " "we have sent %zu bytes in %zu operations.", sth->id, rrdhost_hostname(rpt->host), rpt->remote_ip, rpt->remote_port, - disconnect_reason, rc, rpt->sock.fd, stats->bytes_sent, stats->sends); + stream_handshake_error_to_string(reason), rc, rpt->sock.fd, stats->bytes_sent, stats->sends); receiver_set_exit_reason(rpt, reason, false); if(process_opcodes_and_enable_removal) { // this is not executed from the opcode handling mechanism // so we can safely remove the receiver. 
- stream_receiver_remove(sth, rpt, disconnect_reason); + stream_receiver_remove(sth, rpt, reason); } else { // protection against this case: @@ -816,26 +821,24 @@ bool stream_receiver_receive_data(struct stream_thread *sth, struct receiver_sta } if(status == EVLOOP_STATUS_SOCKET_ERROR || status == EVLOOP_STATUS_SOCKET_CLOSED) { - const char *disconnect_reason; STREAM_HANDSHAKE reason; if(status == EVLOOP_STATUS_SOCKET_ERROR) { worker_is_busy(WORKER_STREAM_JOB_DISCONNECT_RECEIVE_ERROR); reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_READ_FAILED; - disconnect_reason = "error during receive"; } else /* if(status == EVLOOP_STATUS_SOCKET_CLOSED) */ { worker_is_busy(WORKER_STREAM_JOB_DISCONNECT_REMOTE_CLOSED); - reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE_END; - disconnect_reason = "socket reports EOF (closed by child)"; + reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE; } nd_log(NDLS_DAEMON, NDLP_ERR, "STREAM RCV[%zu] '%s' [from [%s]:%s]: %s (fd %d) - closing receiver connection.", - sth->id, rrdhost_hostname(rpt->host), rpt->remote_ip, rpt->remote_port, disconnect_reason, rpt->sock.fd); + sth->id, rrdhost_hostname(rpt->host), rpt->remote_ip, rpt->remote_port, + stream_handshake_error_to_string(reason), rpt->sock.fd); receiver_set_exit_reason(rpt, reason, false); - stream_receiver_remove(sth, rpt, disconnect_reason); + stream_receiver_remove(sth, rpt, reason); } else if(status == EVLOOP_STATUS_CONTINUE && process_opcodes && stream_thread_process_opcodes(sth, &rpt->thread.meta)) status = EVLOOP_STATUS_OPCODE_ON_ME; @@ -860,33 +863,26 @@ bool stream_receive_process_poll_events(struct stream_thread *sth, struct receiv ND_LOG_STACK_PUSH(lgs); if (receiver_should_stop(rpt)) { - receiver_set_exit_reason(rpt, rpt->exit.reason, false); - stream_receiver_remove(sth, rpt, "received stop signal"); + STREAM_HANDSHAKE reason = rpt->exit.reason ? 
rpt->exit.reason : STREAM_HANDSHAKE_DISCONNECT_SIGNALED_TO_STOP; + receiver_set_exit_reason(rpt, reason, false); + stream_receiver_remove(sth, rpt, reason); return false; } if (unlikely(events & (ND_POLL_ERROR | ND_POLL_HUP | ND_POLL_INVALID))) { // we have errors on this socket - worker_is_busy(WORKER_STREAM_JOB_SOCKET_ERROR); + worker_is_busy(WORKER_STREAM_JOB_DISCONNECT_SOCKET_ERROR); - char *error = "unknown error"; - - if (events & ND_POLL_ERROR) - error = "socket reports errors"; - else if (events & ND_POLL_HUP) - error = "connection closed by remote end (HUP)"; - else if (events & ND_POLL_INVALID) - error = "connection is invalid"; - - worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SOCKET_ERROR); + STREAM_HANDSHAKE reason = events & ND_POLL_HUP ? STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE : STREAM_HANDSHAKE_DISCONNECT_SOCKET_ERROR; nd_log(NDLS_DAEMON, NDLP_ERR, "STREAM RCV[%zu] '%s' [from [%s]:%s]: %s - closing connection", - sth->id, rrdhost_hostname(rpt->host), rpt->remote_ip, rpt->remote_port, error); + sth->id, rrdhost_hostname(rpt->host), rpt->remote_ip, rpt->remote_port, + stream_handshake_error_to_string(reason)); - receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_SOCKET_ERROR, false); - stream_receiver_remove(sth, rpt, error); + receiver_set_exit_reason(rpt, reason, false); + stream_receiver_remove(sth, rpt, reason); return false; } @@ -938,7 +934,7 @@ void stream_receiver_check_all_nodes_from_poll(struct stream_thread *sth, usec_t }; ND_LOG_STACK_PUSH(lgs); - worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT); + worker_is_busy(WORKER_STREAM_JOB_DISCONNECT_TIMEOUT); char duration[RFC3339_MAX_LENGTH]; duration_snprintf(duration, sizeof(duration), (int64_t)(now_monotonic_usec() - rpt->thread.last_traffic_ut), "us", true); @@ -954,8 +950,8 @@ void stream_receiver_check_all_nodes_from_poll(struct stream_thread *sth, usec_t sth->id, rrdhost_hostname(rpt->host), rpt->remote_ip, timeout_s, stats.bytes_sent, stats.sends, duration, pending, 
stats.buffer_ratio); - receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_SOCKET_TIMEOUT, false); - stream_receiver_remove(sth, rpt, "timeout"); + receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_TIMEOUT, false); + stream_receiver_remove(sth, rpt, STREAM_HANDSHAKE_DISCONNECT_TIMEOUT); continue; } @@ -1046,8 +1042,8 @@ void stream_receiver_replication_check_from_poll(struct stream_thread *sth, usec __atomic_load_n(&host->stream.rcv.status.replication.counter_out, __ATOMIC_RELAXED), __atomic_load_n(&host->stream.rcv.status.replication.counter_in, __ATOMIC_RELAXED)); - receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_REPLICATION_STALLED, false); - stream_receiver_remove(sth, rpt, "replication reception stalled"); + receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_REPLICATION_STALLED, false); + stream_receiver_remove(sth, rpt, STREAM_HANDSHAKE_DISCONNECT_REPLICATION_STALLED); } rpt->replication.last_checked_ut = rpt->replication.last_progress_ut; @@ -1061,8 +1057,8 @@ void stream_receiver_cleanup(struct stream_thread *sth) { m = META_NEXT(&sth->run.meta, &idx)) { if (m->type != POLLFD_TYPE_RECEIVER) continue; struct receiver_state *rpt = m->rpt; - receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_SHUTDOWN, false); - stream_receiver_remove(sth, rpt, "shutdown"); + receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_SHUTDOWN, true); + stream_receiver_remove(sth, rpt, STREAM_HANDSHAKE_DISCONNECT_SHUTDOWN); } } @@ -1110,7 +1106,8 @@ bool rrdhost_set_receiver(RRDHOST *host, struct receiver_state *rpt) { host->receiver = rpt; rpt->host = host; - __atomic_store_n(&rpt->exit.shutdown, false, __ATOMIC_RELAXED); + rpt->exit.reason = 0; + __atomic_store_n(&rpt->exit.shutdown, false, __ATOMIC_RELEASE); host->stream.rcv.status.last_connected = now_realtime_sec(); host->stream.rcv.status.last_disconnected = 0; host->stream.rcv.status.last_chart = 0; @@ -1139,7 +1136,7 @@ bool rrdhost_set_receiver(RRDHOST *host, struct receiver_state *rpt) { 
rrdhost_flag_set(rpt->host, RRDHOST_FLAG_COLLECTOR_ONLINE); aclk_queue_node_info(rpt->host, true); - rrdhost_stream_parents_reset(host, STREAM_HANDSHAKE_PREPARING); + rrdhost_stream_parents_reset(host, STREAM_HANDSHAKE_SP_PREPARING); set_this = true; } @@ -1155,7 +1152,7 @@ bool rrdhost_set_receiver(RRDHOST *host, struct receiver_state *rpt) { return set_this; } -void rrdhost_clear_receiver(struct receiver_state *rpt) { +void rrdhost_clear_receiver(struct receiver_state *rpt, STREAM_HANDSHAKE reason) { RRDHOST *host = rpt->host; if(!host) return; @@ -1176,13 +1173,13 @@ void rrdhost_clear_receiver(struct receiver_state *rpt) { rrdhost_set_health_evloop_iteration(host); ml_host_stop(host); stream_path_child_disconnected(host); - stream_sender_signal_to_stop_and_wait(host, STREAM_HANDSHAKE_DISCONNECT_RECEIVER_LEFT, false); + stream_sender_signal_to_stop_and_wait(host, reason, false); rrdcontext_host_child_disconnected(host); if (rpt->config.health.enabled) rrdcalc_child_disconnected(host); - rrdhost_stream_parents_reset(host, STREAM_HANDSHAKE_DISCONNECT_RECEIVER_LEFT); + rrdhost_stream_parents_reset(host, reason); } rrdhost_receiver_lock(host); @@ -1191,7 +1188,8 @@ void rrdhost_clear_receiver(struct receiver_state *rpt) { stream_receiver_replication_reset(host); streaming_receiver_disconnected(); - __atomic_store_n(&host->receiver->exit.shutdown, false, __ATOMIC_RELAXED); + host->receiver->exit.reason = 0; + __atomic_store_n(&host->receiver->exit.shutdown, false, __ATOMIC_RELEASE); host->stream.rcv.status.check_obsolete = false; host->stream.rcv.status.last_connected = 0; host->stream.rcv.status.last_disconnected = now_realtime_sec(); @@ -1215,16 +1213,18 @@ bool stream_receiver_signal_to_stop_and_wait(RRDHOST *host, STREAM_HANDSHAKE rea rrdhost_receiver_lock(host); - if(host->receiver) { - if(!__atomic_load_n(&host->receiver->exit.shutdown, __ATOMIC_RELAXED)) { - __atomic_store_n(&host->receiver->exit.shutdown, true, __ATOMIC_RELAXED); - 
receiver_set_exit_reason(host->receiver, reason, true); - shutdown(host->receiver->sock.fd, SHUT_RDWR); + struct receiver_state *rpt = host->receiver; + + if(rpt) { + if(!__atomic_load_n(&rpt->exit.shutdown, __ATOMIC_ACQUIRE)) { + receiver_set_exit_reason(rpt, reason, true); + __atomic_store_n(&rpt->exit.shutdown, true, __ATOMIC_RELEASE); + shutdown(rpt->sock.fd, SHUT_RDWR); } } int count = 2000; - while (host->receiver && count-- > 0) { + while (host->receiver == rpt && count-- > 0) { rrdhost_receiver_unlock(host); // let the lock for the receiver thread to exit @@ -1233,11 +1233,11 @@ bool stream_receiver_signal_to_stop_and_wait(RRDHOST *host, STREAM_HANDSHAKE rea rrdhost_receiver_lock(host); } - if(host->receiver) + if(host->receiver == rpt) netdata_log_error("STREAM RCV[x] '%s' [from [%s]:%s]: " "streaming thread takes too long to stop, giving up..." , rrdhost_hostname(host) - , host->receiver->remote_ip, host->receiver->remote_port); + , rpt->remote_ip, rpt->remote_port); else ret = true; diff --git a/src/streaming/stream-replication-sender.c b/src/streaming/stream-replication-sender.c index e0ee0691233de2..e15f5400c3fe70 100644 --- a/src/streaming/stream-replication-sender.c +++ b/src/streaming/stream-replication-sender.c @@ -705,7 +705,8 @@ bool replication_response_execute_finalize_and_send(struct replication_query *q, RRDSET_FLAGS old = rrdset_flag_set_and_clear(st, RRDSET_FLAG_SENDER_REPLICATION_FINISHED, RRDSET_FLAG_SENDER_REPLICATION_IN_PROGRESS); if(!(old & RRDSET_FLAG_SENDER_REPLICATION_FINISHED)) { - rrdhost_sender_replicating_charts_minus_one(st->rrdhost); + if(rrdhost_sender_replicating_charts_minus_one(st->rrdhost) == 0) + pulse_host_status(st->rrdhost, PULSE_HOST_STATUS_SND_RUNNING, 0); if(!finished_with_gap) st->stream.snd.resync_time_s = 0; diff --git a/src/streaming/stream-sender-api.c b/src/streaming/stream-sender-api.c index 4f74ed541f2acb..9a0719b6df7bcf 100644 --- a/src/streaming/stream-sender-api.c +++ b/src/streaming/stream-sender-api.c 
@@ -62,7 +62,7 @@ void stream_sender_structures_free(struct rrdhost *host) { if (unlikely(!host->sender)) return; // stop a possibly running thread - stream_sender_signal_to_stop_and_wait(host, STREAM_HANDSHAKE_DISCONNECT_HOST_CLEANUP, true); + stream_sender_signal_to_stop_and_wait(host, STREAM_HANDSHAKE_SND_DISCONNECT_HOST_CLEANUP, true); stream_circular_buffer_destroy(host->sender->scb); host->sender->scb = NULL; waitq_destroy(&host->sender->waitq); @@ -110,10 +110,12 @@ void stream_sender_signal_to_stop_and_wait(struct rrdhost *host, STREAM_HANDSHAK struct stream_opcode msg = host->sender->thread.msg; stream_sender_unlock(host->sender); - if(reason == STREAM_HANDSHAKE_DISCONNECT_RECEIVER_LEFT) - msg.opcode = STREAM_OPCODE_SENDER_STOP_RECEIVER_LEFT; - else + if(reason == STREAM_HANDSHAKE_SND_DISCONNECT_HOST_CLEANUP) msg.opcode = STREAM_OPCODE_SENDER_STOP_HOST_CLEANUP; + else + msg.opcode = STREAM_OPCODE_SENDER_STOP_RECEIVER_LEFT; + msg.reason = reason; + stream_sender_send_opcode(host->sender, msg); while(wait && rrdhost_flag_check(host, RRDHOST_FLAG_STREAM_SENDER_ADDED)) diff --git a/src/streaming/stream-sender-commit.c b/src/streaming/stream-sender-commit.c index 0f79d5790fd7be..0f64a6318b89f4 100644 --- a/src/streaming/stream-sender-commit.c +++ b/src/streaming/stream-sender-commit.c @@ -195,6 +195,7 @@ void sender_buffer_commit(struct sender_state *s, BUFFER *wb, struct sender_buff if (enable_sending) { msg.opcode = STREAM_OPCODE_SENDER_POLLOUT; + msg.reason = 0; stream_sender_send_opcode(s, msg); } @@ -205,6 +206,7 @@ overflow_with_lock: { stream_sender_unlock(s); waitq_release(&s->waitq); msg.opcode = STREAM_OPCODE_SENDER_BUFFER_OVERFLOW; + msg.reason = STREAM_HANDSHAKE_DISCONNECT_BUFFER_OVERFLOW; stream_sender_send_opcode(s, msg); nd_log_limit_static_global_var(erl, 1, 0); nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR, @@ -221,6 +223,7 @@ compression_failed_with_lock: { stream_sender_unlock(s); waitq_release(&s->waitq); msg.opcode = 
STREAM_OPCODE_SENDER_RECONNECT_WITHOUT_COMPRESSION; + msg.reason = STREAM_HANDSHAKE_SND_DISCONNECT_COMPRESSION_FAILED; stream_sender_send_opcode(s, msg); nd_log_limit_static_global_var(erl, 1, 0); nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR, diff --git a/src/streaming/stream-sender-internals.h b/src/streaming/stream-sender-internals.h index b9c1e6aa484d5b..3d662ae12f3eb7 100644 --- a/src/streaming/stream-sender-internals.h +++ b/src/streaming/stream-sender-internals.h @@ -152,7 +152,7 @@ bool stream_connector_is_signaled_to_stop(struct sender_state *s); void stream_sender_on_connect(struct sender_state *s); -void stream_sender_remove(struct sender_state *s); +void stream_sender_remove(struct sender_state *s, STREAM_HANDSHAKE reason); #ifdef NETDATA_LOG_STREAM_SENDER void stream_sender_log_payload(struct sender_state *s, BUFFER *payload, STREAM_TRAFFIC_TYPE type, bool inbound); diff --git a/src/streaming/stream-sender.c b/src/streaming/stream-sender.c index 13b3b8f6af7e8f..b934cd832d0365 100644 --- a/src/streaming/stream-sender.c +++ b/src/streaming/stream-sender.c @@ -4,7 +4,7 @@ #include "stream-sender-internals.h" #include "stream-replication-sender.h" -static void stream_sender_move_running_to_connector_or_remove(struct stream_thread *sth, struct sender_state *s, STREAM_HANDSHAKE reason, bool reconnect); +static void stream_sender_move_running_to_connector_or_remove(struct stream_thread *sth, struct sender_state *s, STREAM_HANDSHAKE reason, STREAM_HANDSHAKE receiver_reason, bool reconnect); // -------------------------------------------------------------------------------------------------------------------- @@ -229,14 +229,17 @@ void stream_sender_handle_op(struct stream_thread *sth, struct sender_state *s, stats.bytes_size, stats.bytes_max_size, stats.bytes_outstanding, stats.bytes_available); stream_sender_move_running_to_connector_or_remove( - sth, s, STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_SEND_BUFFER, true); + sth, s, 
STREAM_HANDSHAKE_DISCONNECT_BUFFER_OVERFLOW, 0, true); return; } if(msg->opcode & STREAM_OPCODE_SENDER_STOP_RECEIVER_LEFT) { worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_RECEIVER_LEFT); stream_sender_move_running_to_connector_or_remove( - sth, s, STREAM_HANDSHAKE_DISCONNECT_RECEIVER_LEFT, false); + sth, s, STREAM_HANDSHAKE_SND_DISCONNECT_RECEIVER_LEFT, msg->reason, false); + + // at this point we also have access to the receiver exit reason as msg->reason + return; } @@ -248,14 +251,14 @@ void stream_sender_handle_op(struct stream_thread *sth, struct sender_state *s, sth->id, rrdhost_hostname(s->host), s->remote_ip); stream_sender_move_running_to_connector_or_remove( - sth, s, STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_SENDER_COMPRESSION_FAILED, true); + sth, s, STREAM_HANDSHAKE_SND_DISCONNECT_COMPRESSION_FAILED, 0, true); return; } if(msg->opcode & STREAM_OPCODE_SENDER_STOP_HOST_CLEANUP) { worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_HOST_CLEANUP); stream_sender_move_running_to_connector_or_remove( - sth, s, STREAM_HANDSHAKE_DISCONNECT_HOST_CLEANUP, false); + sth, s, STREAM_HANDSHAKE_SND_DISCONNECT_HOST_CLEANUP, 0, false); return; } @@ -330,27 +333,35 @@ void stream_sender_move_queue_to_running_unsafe(struct stream_thread *sth) { sth->id, rrdhost_hostname(s->host), s->remote_ip); stream_sender_on_ready_to_dispatch(s); + + pulse_host_status(s->host, PULSE_HOST_STATUS_SND_RUNNING, 0); } } -void stream_sender_remove(struct sender_state *s) { +void stream_sender_remove(struct sender_state *s, STREAM_HANDSHAKE reason) { // THIS FUNCTION IS USED BY THE CONNECTOR TOO // when it gives up on a certain node stream_sender_lock(s); + if(reason == STREAM_HANDSHAKE_DISCONNECT_SIGNALED_TO_STOP && s->exit.reason) { + reason = s->exit.reason; + s->exit.reason = 0; + } + __atomic_store_n(&s->exit.shutdown, false, __ATOMIC_RELAXED); rrdhost_flag_clear(s->host, - RRDHOST_FLAG_STREAM_SENDER_ADDED | RRDHOST_FLAG_STREAM_SENDER_CONNECTED | - RRDHOST_FLAG_STREAM_SENDER_READY_4_METRICS); + 
RRDHOST_FLAG_STREAM_SENDER_ADDED | RRDHOST_FLAG_STREAM_SENDER_CONNECTED | + RRDHOST_FLAG_STREAM_SENDER_READY_4_METRICS); s->last_state_since_t = now_realtime_sec(); - stream_parent_set_disconnect_reason(s->host->stream.snd.parents.current, s->exit.reason, s->last_state_since_t); + stream_parent_set_disconnect_reason(s->host->stream.snd.parents.current, reason, s->last_state_since_t); s->connector.id = -1; + s->exit.reason = 0; stream_sender_unlock(s); - rrdhost_stream_parents_reset(s->host, STREAM_HANDSHAKE_EXITING); + rrdhost_stream_parents_reset(s->host, reason); #ifdef NETDATA_LOG_STREAM_SENDER spinlock_lock(&s->log.spinlock); @@ -364,21 +375,26 @@ void stream_sender_remove(struct sender_state *s) { #endif } -static void stream_sender_log_disconnection(struct stream_thread *sth, struct sender_state *s, STREAM_HANDSHAKE reason) { +static void stream_sender_log_disconnection(struct stream_thread *sth, struct sender_state *s, STREAM_HANDSHAKE reason, STREAM_HANDSHAKE receiver_reason) { ND_LOG_STACK lgs[] = { ND_LOG_FIELD_UUID(NDF_MESSAGE_ID, &streaming_to_parent_msgid), ND_LOG_FIELD_END(), }; ND_LOG_STACK_PUSH(lgs); - nd_log(NDLS_DAEMON, NDLP_NOTICE, - "STREAM SND[%zu] '%s' [to %s]: sender disconnected from parent, reason: %s (replication in: %u, out: %u, pending: %zu)", - sth->id, rrdhost_hostname(s->host), s->remote_ip, stream_handshake_error_to_string(reason), - s->host->stream.snd.status.replication.counter_in, s->host->stream.snd.status.replication.counter_out, - dictionary_entries(s->replication.requests)); + if(reason == STREAM_HANDSHAKE_SND_DISCONNECT_RECEIVER_LEFT && receiver_reason) + nd_log(NDLS_DAEMON, NDLP_NOTICE, + "STREAM SND[%zu] '%s' [to %s]: sender disconnected from parent, reason: %s (receiver left due to: %s)", + sth->id, rrdhost_hostname(s->host), s->remote_ip, + stream_handshake_error_to_string(reason), + stream_handshake_error_to_string(receiver_reason)); + else + nd_log(NDLS_DAEMON, NDLP_NOTICE, + "STREAM SND[%zu] '%s' [to %s]: sender 
disconnected from parent, reason: %s", + sth->id, rrdhost_hostname(s->host), s->remote_ip, stream_handshake_error_to_string(reason)); } -static void stream_sender_move_running_to_connector_or_remove(struct stream_thread *sth, struct sender_state *s, STREAM_HANDSHAKE reason, bool reconnect) { +static void stream_sender_move_running_to_connector_or_remove(struct stream_thread *sth, struct sender_state *s, STREAM_HANDSHAKE reason, STREAM_HANDSHAKE receiver_reason, bool reconnect) { internal_fatal(sth->tid != gettid_cached(), "Function %s() should only be used by the dispatcher thread", __FUNCTION__ ); ND_LOG_STACK lgs[] = { @@ -412,7 +428,7 @@ static void stream_sender_move_running_to_connector_or_remove(struct stream_thre s->host->stream.snd.status.tid = 0; stream_sender_unlock(s); - stream_sender_log_disconnection(sth, s, reason); + stream_sender_log_disconnection(sth, s, reason, receiver_reason); nd_sock_close(&s->sock); @@ -423,8 +439,10 @@ static void stream_sender_move_running_to_connector_or_remove(struct stream_thre stream_thread_node_removed(s->host); + pulse_host_status(s->host, PULSE_HOST_STATUS_SND_OFFLINE, reason); + if (should_remove) - stream_sender_remove(s); + stream_sender_remove(s, reason); else stream_connector_requeue(s); } @@ -466,7 +484,7 @@ void stream_sender_check_all_nodes_from_poll(struct stream_thread *sth, usec_t n }; ND_LOG_STACK_PUSH(lgs); - worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT); + worker_is_busy(WORKER_STREAM_JOB_DISCONNECT_TIMEOUT); char duration[RFC3339_MAX_LENGTH]; duration_snprintf(duration, sizeof(duration), (int64_t)(now_monotonic_usec() - s->thread.last_traffic_ut), "us", true); @@ -483,7 +501,7 @@ void stream_sender_check_all_nodes_from_poll(struct stream_thread *sth, usec_t n stats.bytes_sent, stats.sends, duration, pending, stats.buffer_ratio); - stream_sender_move_running_to_connector_or_remove(sth, s, STREAM_HANDSHAKE_DISCONNECT_SOCKET_TIMEOUT, true); + stream_sender_move_running_to_connector_or_remove(sth, s, 
STREAM_HANDSHAKE_DISCONNECT_TIMEOUT, 0, true); continue; } @@ -594,7 +612,7 @@ void stream_sender_replication_check_from_poll(struct stream_thread *sth, usec_t __atomic_load_n(&host->stream.snd.status.replication.counter_in, __ATOMIC_RELAXED), __atomic_load_n(&host->stream.snd.status.replication.counter_out, __ATOMIC_RELAXED)); - stream_sender_move_running_to_connector_or_remove(sth, s, STREAM_HANDSHAKE_REPLICATION_STALLED, true); + stream_sender_move_running_to_connector_or_remove(sth, s, STREAM_HANDSHAKE_DISCONNECT_REPLICATION_STALLED, 0, true); } s->replication.last_checked_ut = s->replication.last_progress_ut; @@ -665,7 +683,7 @@ bool stream_sender_send_data(struct stream_thread *sth, struct sender_state *s, else /* if(status == EVLOOP_STATUS_SOCKET_CLOSED) */ { worker_is_busy(WORKER_STREAM_JOB_DISCONNECT_REMOTE_CLOSED); disconnect_reason = "socket reports EOF (closed by parent)"; - reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE_END; + reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE; } nd_log(NDLS_DAEMON, NDLP_ERR, @@ -677,7 +695,7 @@ bool stream_sender_send_data(struct stream_thread *sth, struct sender_state *s, if(process_opcodes_and_enable_removal) { // this is not executed from the opcode handling mechanism // so we can safely remove the sender - stream_sender_move_running_to_connector_or_remove(sth, s, reason, true); + stream_sender_move_running_to_connector_or_remove(sth, s, reason, 0, true); } else { // protection against this case: @@ -739,7 +757,7 @@ bool stream_sender_receive_data(struct stream_thread *sth, struct sender_state * } else /* if(status == EVLOOP_STATUS_SOCKET_CLOSED) */ { worker_is_busy(WORKER_STREAM_JOB_DISCONNECT_REMOTE_CLOSED); - reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE_END; + reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE; disconnect_reason = "socket reports EOF (closed by parent)"; } @@ -748,7 +766,7 @@ bool stream_sender_receive_data(struct stream_thread *sth, struct 
sender_state * sth->id, rrdhost_hostname(s->host), s->remote_ip, disconnect_reason, s->sock.fd); stream_sender_move_running_to_connector_or_remove( - sth, s, reason, true); + sth, s, reason, 0, true); } else if(status == EVLOOP_STATUS_CONTINUE && process_opcodes && stream_thread_process_opcodes(sth, &s->thread.meta)) status = EVLOOP_STATUS_OPCODE_ON_ME; @@ -775,8 +793,6 @@ bool stream_sender_process_poll_events(struct stream_thread *sth, struct sender_ if(unlikely(events & (ND_POLL_ERROR|ND_POLL_HUP|ND_POLL_INVALID))) { // we have errors on this socket - worker_is_busy(WORKER_STREAM_JOB_SOCKET_ERROR); - char *error = "unknown error"; if (events & ND_POLL_ERROR) @@ -786,7 +802,7 @@ bool stream_sender_process_poll_events(struct stream_thread *sth, struct sender_ else if (events & ND_POLL_INVALID) error = "connection is invalid"; - worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SOCKET_ERROR); + worker_is_busy(WORKER_STREAM_JOB_DISCONNECT_SOCKET_ERROR); stream_sender_lock(s); // copy the statistics @@ -797,7 +813,7 @@ bool stream_sender_process_poll_events(struct stream_thread *sth, struct sender_ "STREAM SND[%zu] '%s' [to %s]: %s restarting connection - %zu bytes transmitted in %zu operations.", sth->id, rrdhost_hostname(s->host), s->remote_ip, error, stats.bytes_sent, stats.sends); - stream_sender_move_running_to_connector_or_remove(sth, s, STREAM_HANDSHAKE_DISCONNECT_SOCKET_ERROR, true); + stream_sender_move_running_to_connector_or_remove(sth, s, STREAM_HANDSHAKE_DISCONNECT_SOCKET_ERROR, 0, true); return false; } @@ -827,6 +843,6 @@ void stream_sender_cleanup(struct stream_thread *sth) { s->exit.reason = STREAM_HANDSHAKE_DISCONNECT_SHUTDOWN; s->exit.shutdown = true; - stream_sender_move_running_to_connector_or_remove(sth, s, STREAM_HANDSHAKE_DISCONNECT_SHUTDOWN, false); + stream_sender_move_running_to_connector_or_remove(sth, s, STREAM_HANDSHAKE_DISCONNECT_SHUTDOWN, 0, false); } } diff --git a/src/streaming/stream-thread.c b/src/streaming/stream-thread.c index 
aeab3496e26edb..0110ab32c16ca5 100644 --- a/src/streaming/stream-thread.c +++ b/src/streaming/stream-thread.c @@ -142,6 +142,8 @@ void stream_receiver_send_opcode(struct receiver_state *rpt, struct stream_opcod if (sth->messages.array[i].meta == &rpt->thread.meta) { rpt->thread.send_to_child.msg_slot = i; sth->messages.array[rpt->thread.send_to_child.msg_slot].opcode |= msg.opcode; + if(msg.reason) + sth->messages.array[rpt->thread.send_to_child.msg_slot].reason = msg.reason; spinlock_unlock(&sth->messages.spinlock); internal_fatal(true, "the stream opcode queue is full, but this receiver is already on slot %zu", i); return; @@ -158,9 +160,12 @@ void stream_receiver_send_opcode(struct receiver_state *rpt, struct stream_opcod rpt->thread.send_to_child.msg_slot = sth->messages.used++; sth->messages.array[rpt->thread.send_to_child.msg_slot] = msg; } - else + else { // the existing slot is good sth->messages.array[rpt->thread.send_to_child.msg_slot].opcode |= msg.opcode; + if(msg.reason) + sth->messages.array[rpt->thread.send_to_child.msg_slot].reason = msg.reason; + } } spinlock_unlock(&sth->messages.spinlock); @@ -223,6 +228,8 @@ void stream_sender_send_opcode(struct sender_state *s, struct stream_opcode msg) if (sth->messages.array[i].meta == &s->thread.meta) { s->thread.msg_slot = i; sth->messages.array[s->thread.msg_slot].opcode |= msg.opcode; + if(msg.reason) + sth->messages.array[s->thread.msg_slot].reason = msg.reason; spinlock_unlock(&sth->messages.spinlock); internal_fatal(true, "the dispatcher message queue is full, but this sender is already on slot %zu", i); return; @@ -239,9 +246,12 @@ void stream_sender_send_opcode(struct sender_state *s, struct stream_opcode msg) s->thread.msg_slot = sth->messages.used++; sth->messages.array[s->thread.msg_slot] = msg; } - else + else { // the existing slot is good sth->messages.array[s->thread.msg_slot].opcode |= msg.opcode; + if(msg.reason) + sth->messages.array[s->thread.msg_slot].reason = msg.reason; + } } 
spinlock_unlock(&sth->messages.spinlock); @@ -395,7 +405,6 @@ void *stream_thread(void *ptr) { // both sender and receiver worker_register_job_name(WORKER_STREAM_JOB_SOCKET_RECEIVE, "receive"); worker_register_job_name(WORKER_STREAM_JOB_SOCKET_SEND, "send"); - worker_register_job_name(WORKER_STREAM_JOB_SOCKET_ERROR, "sock error"); // receiver worker_register_job_name(WORKER_STREAM_JOB_COMPRESS, "compress"); @@ -409,8 +418,8 @@ void *stream_thread(void *ptr) { // disconnection reasons worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_OVERFLOW, "disconnect overflow"); - worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT, "disconnect timeout"); - worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_SOCKET_ERROR, "disconnect socket error"); + worker_register_job_name(WORKER_STREAM_JOB_DISCONNECT_TIMEOUT, "disconnect timeout"); + worker_register_job_name(WORKER_STREAM_JOB_DISCONNECT_SOCKET_ERROR, "disconnect socket error"); worker_register_job_name(WORKER_STREAM_JOB_DISCONNECT_REMOTE_CLOSED, "disconnect remote closed"); worker_register_job_name(WORKER_STREAM_JOB_DISCONNECT_RECEIVE_ERROR, "disconnect receive error"); worker_register_job_name(WORKER_STREAM_JOB_DISCONNECT_SEND_ERROR, "disconnect send error"); @@ -745,6 +754,8 @@ void stream_receiver_add_to_queue(struct receiver_state *rpt) { RECEIVERS_SET(&sth->queue.receivers, ++sth->queue.id, rpt); sth->queue.receivers_waiting++; spinlock_unlock(&sth->queue.spinlock); + + pulse_host_status(rpt->host, PULSE_HOST_STATUS_RCV_WAITING, 0); } void stream_sender_add_to_queue(struct sender_state *s) { @@ -759,6 +770,8 @@ void stream_sender_add_to_queue(struct sender_state *s) { spinlock_lock(&sth->queue.spinlock); SENDERS_SET(&sth->queue.senders, ++sth->queue.id, s); spinlock_unlock(&sth->queue.spinlock); + + pulse_host_status(s->host, PULSE_HOST_STATUS_SND_WAITING, 0); } void stream_threads_cancel(void) { diff --git a/src/streaming/stream-thread.h b/src/streaming/stream-thread.h index 
e0e4c8c3258d12..0180f2f4da342f 100644 --- a/src/streaming/stream-thread.h +++ b/src/streaming/stream-thread.h @@ -5,6 +5,7 @@ #include "libnetdata/libnetdata.h" #include "stream-circular-buffer.h" +#include "stream-handshake.h" struct stream_thread; struct pollfd_slotted { @@ -30,6 +31,7 @@ struct stream_opcode { int32_t thread_slot; // the dispatcher id this message refers to uint32_t session; // random number used to verify that the message the dispatcher receives is for this sender STREAM_OPCODE opcode; // the actual message to be delivered + STREAM_HANDSHAKE reason; struct pollfd_meta *meta; }; @@ -45,46 +47,48 @@ struct stream_opcode { // socket operations #define WORKER_STREAM_JOB_SOCKET_RECEIVE 5 #define WORKER_STREAM_JOB_SOCKET_SEND 6 -#define WORKER_STREAM_JOB_SOCKET_ERROR 7 // compression -#define WORKER_STREAM_JOB_COMPRESS 8 -#define WORKER_STREAM_JOB_DECOMPRESS 9 +#define WORKER_STREAM_JOB_COMPRESS 7 +#define WORKER_STREAM_JOB_DECOMPRESS 8 // receiver events -#define WORKER_RECEIVER_JOB_BYTES_READ 10 -#define WORKER_RECEIVER_JOB_BYTES_UNCOMPRESSED 11 +#define WORKER_RECEIVER_JOB_BYTES_READ 9 +#define WORKER_RECEIVER_JOB_BYTES_UNCOMPRESSED 10 // sender received commands -#define WORKER_SENDER_JOB_EXECUTE 12 -#define WORKER_SENDER_JOB_EXECUTE_REPLAY 13 -#define WORKER_SENDER_JOB_EXECUTE_FUNCTION 14 -#define WORKER_SENDER_JOB_EXECUTE_META 15 - -#define WORKER_SENDER_JOB_DISCONNECT_OVERFLOW 16 -#define WORKER_SENDER_JOB_DISCONNECT_TIMEOUT 17 -#define WORKER_SENDER_JOB_DISCONNECT_SOCKET_ERROR 18 -#define WORKER_STREAM_JOB_DISCONNECT_REMOTE_CLOSED 19 -#define WORKER_STREAM_JOB_DISCONNECT_RECEIVE_ERROR 20 -#define WORKER_STREAM_JOB_DISCONNECT_SEND_ERROR 21 -#define WORKER_SENDER_JOB_DISCONNECT_COMPRESSION_ERROR 22 -#define WORKER_SENDER_JOB_DISCONNECT_RECEIVER_LEFT 23 -#define WORKER_SENDER_JOB_DISCONNECT_HOST_CLEANUP 24 +#define WORKER_SENDER_JOB_EXECUTE 11 +#define WORKER_SENDER_JOB_EXECUTE_REPLAY 12 +#define WORKER_SENDER_JOB_EXECUTE_FUNCTION 13 +#define 
WORKER_SENDER_JOB_EXECUTE_META 14 + +// disconnect reasons +#define WORKER_STREAM_JOB_DISCONNECT_REMOTE_CLOSED 15 +#define WORKER_STREAM_JOB_DISCONNECT_RECEIVE_ERROR 16 +#define WORKER_STREAM_JOB_DISCONNECT_SEND_ERROR 17 +#define WORKER_STREAM_JOB_DISCONNECT_TIMEOUT 18 +#define WORKER_STREAM_JOB_DISCONNECT_SOCKET_ERROR 19 + +// sender-only disconnect reasons +#define WORKER_SENDER_JOB_DISCONNECT_OVERFLOW 20 +#define WORKER_SENDER_JOB_DISCONNECT_COMPRESSION_ERROR 21 +#define WORKER_SENDER_JOB_DISCONNECT_RECEIVER_LEFT 22 +#define WORKER_SENDER_JOB_DISCONNECT_HOST_CLEANUP 23 // dispatcher metrics // this has to be the same at pluginsd_parser.h -#define WORKER_RECEIVER_JOB_REPLICATION_COMPLETION 25 -#define WORKER_STREAM_METRIC_NODES 26 -#define WORKER_SENDER_JOB_BUFFER_RATIO 27 -#define WORKER_SENDER_JOB_BYTES_RECEIVED 28 -#define WORKER_SENDER_JOB_BYTES_SENT 29 -#define WORKER_SENDER_JOB_BYTES_COMPRESSED 30 -#define WORKER_SENDER_JOB_BYTES_UNCOMPRESSED 31 -#define WORKER_SENDER_JOB_BYTES_COMPRESSION_RATIO 32 -#define WORKER_SENDER_JOB_REPLAY_DICT_SIZE 33 -#define WORKER_SENDER_JOB_MESSAGES 34 -#define WORKER_STREAM_JOB_RECEIVERS_WAITING_LIST_SIZE 35 -#define WORKER_STREAM_JOB_SEND_MISSES 36 +#define WORKER_RECEIVER_JOB_REPLICATION_COMPLETION 24 +#define WORKER_STREAM_METRIC_NODES 25 +#define WORKER_SENDER_JOB_BUFFER_RATIO 26 +#define WORKER_SENDER_JOB_BYTES_RECEIVED 27 +#define WORKER_SENDER_JOB_BYTES_SENT 28 +#define WORKER_SENDER_JOB_BYTES_COMPRESSED 29 +#define WORKER_SENDER_JOB_BYTES_UNCOMPRESSED 30 +#define WORKER_SENDER_JOB_BYTES_COMPRESSION_RATIO 31 +#define WORKER_SENDER_JOB_REPLAY_DICT_SIZE 32 +#define WORKER_SENDER_JOB_MESSAGES 33 +#define WORKER_STREAM_JOB_RECEIVERS_WAITING_LIST_SIZE 34 +#define WORKER_STREAM_JOB_SEND_MISSES 35 // IMPORTANT: to add workers, you have to edit WORKER_PARSER_FIRST_JOB accordingly