From 8df8cd7c927cf6b3ff09dc00a29de61a1dc7b994 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 1 Feb 2024 10:40:40 +0100 Subject: [PATCH 001/148] Start on adding federate type (transient or not) --- core/federated/RTI/rti_remote.c | 18 ++++++++++++------ core/federated/federate.c | 19 +++++++++++++++++++ include/core/federated/federate.h | 6 ++++++ 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 6f705d2b9..d7d1c352b 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1045,9 +1045,9 @@ void send_reject(int* socket_id, unsigned char error_code) { * @param client_fd The socket address. * @return The federate ID for success or -1 for failure. */ -static int32_t receive_and_check_fed_id_message(int* socket_id) { - // Buffer for message ID, federate ID, and federation ID length. - size_t length = 1 + sizeof(uint16_t) + 1; // Message ID, federate ID, length of fedration ID. +static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_in* client_fd) { + // Buffer for message ID, federate ID, type (persistent or transient), and federation ID length. + size_t length = 1 + sizeof(uint16_t) + 1 + 1; // Message ID, federate ID, length of fedration ID, type. unsigned char buffer[length]; // Read bytes from the socket. We need 4 bytes. @@ -1057,11 +1057,12 @@ static int32_t receive_and_check_fed_id_message(int* socket_id) { } uint16_t fed_id = rti_remote->base.number_of_scheduling_nodes; // Initialize to an invalid value. + bool is_transient = false; // First byte received is the message type. 
if (buffer[0] != MSG_TYPE_FED_IDS) { if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); } if (buffer[0] == MSG_TYPE_P2P_SENDING_FED_ID || buffer[0] == MSG_TYPE_P2P_TAGGED_MESSAGE) { // The federate is trying to connect to a peer, not to the RTI. @@ -1083,10 +1084,15 @@ static int32_t receive_and_check_fed_id_message(int* socket_id) { } else { // Received federate ID. fed_id = extract_uint16(buffer + 1); - LF_PRINT_DEBUG("RTI received federate ID: %d.", fed_id); + is_transient = (buffer[sizeof(uint16_t) + 1] == 1) ? true : false; + if (is_transient) { + LF_PRINT_LOG("RTI received federate ID: %d, which is transient.", fed_id); + } else { + LF_PRINT_LOG("RTI received federate ID: %d, which is persistent.", fed_id); + } // Read the federation ID. First read the length, which is one byte. - size_t federation_id_length = (size_t)buffer[sizeof(uint16_t) + 1]; + size_t federation_id_length = (size_t)buffer[sizeof(uint16_t) + 2]; char federation_id_received[federation_id_length + 1]; // One extra for null terminator. // Next read the actual federation ID. if (read_from_socket_close_on_error(socket_id, federation_id_length, (unsigned char*)federation_id_received)) { diff --git a/core/federated/federate.c b/core/federated/federate.c index fc2f86911..3e0d7046f 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -75,6 +75,7 @@ int max_level_allowed_to_advance; * The state of this federate instance. Each executable has exactly one federate instance, * and the _fed global variable refers to that instance. 
*/ +<<<<<<< HEAD federate_instance_t _fed = {.socket_TCP_RTI = -1, .number_of_inbound_p2p_connections = 0, .inbound_socket_listeners = NULL, @@ -90,6 +91,24 @@ federate_instance_t _fed = {.socket_TCP_RTI = -1, .last_sent_LTC = {.time = NEVER, .microstep = 0u}, .last_sent_NET = {.time = NEVER, .microstep = 0u}, .min_delay_from_physical_action_to_federate_output = NEVER}; +======= +federate_instance_t _fed = {.socket_TCP_RTI = -1, + .number_of_inbound_p2p_connections = 0, + .inbound_socket_listeners = NULL, + .number_of_outbound_p2p_connections = 0, + .inbound_p2p_handling_thread_id = 0, + .server_socket = -1, + .server_port = -1, + .last_TAG = {.time = NEVER, .microstep = 0u}, + .is_last_TAG_provisional = false, + .has_upstream = false, + .has_downstream = false, + .received_stop_request_from_rti = false, + .last_sent_LTC = (tag_t){.time = NEVER, .microstep = 0u}, + .last_sent_NET = (tag_t){.time = NEVER, .microstep = 0u}, + .min_delay_from_physical_action_to_federate_output = NEVER, + .is_transient = false}; +>>>>>>> 6fbf4094 (Start on adding federate type (transient or not)) federation_metadata_t federation_metadata = { .federation_id = "Unidentified Federation", .rti_host = NULL, .rti_port = -1, .rti_user = NULL}; diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index 50c59daa1..604a36637 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -176,6 +176,12 @@ typedef struct federate_instance_t { */ instant_t min_delay_from_physical_action_to_federate_output; + /** + * Indicator of whether this federate is transient. + * The default value of false may be overridden in _lf_initialize_trigger_objects. 
+ */ + bool is_transient; + #ifdef FEDERATED_DECENTRALIZED /** * Thread responsible for setting ports to absent by an STAA offset if they From e20f58a1daa7af31fb372d38b72c56050897e18f Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 1 Feb 2024 11:04:17 +0100 Subject: [PATCH 002/148] Add cmdline argument -nt to RTI and add nbr_transient_federates attribute --- core/federated/RTI/main.c | 32 +++++++++++++++++++++++++++++++- core/federated/RTI/rti_remote.c | 2 ++ core/federated/RTI/rti_remote.h | 7 +++++++ 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/core/federated/RTI/main.c b/core/federated/RTI/main.c index 17d73e93e..005b784ae 100644 --- a/core/federated/RTI/main.c +++ b/core/federated/RTI/main.c @@ -121,6 +121,8 @@ void usage(int argc, const char* argv[]) { lf_print(" The ID of the federation that this RTI will control.\n"); lf_print(" -n, --number_of_federates "); lf_print(" The number of federates in the federation that this RTI will control.\n"); + lf_print(" -nt, --number_of_transient_federates "); + lf_print(" The number of transient federates in the federation that this RTI will control.\n"); lf_print(" -p, --port "); lf_print(" The port number to use for the RTI. Must be larger than 0 and smaller than %d. 
Default is %d.\n", UINT16_MAX, DEFAULT_PORT); @@ -232,6 +234,21 @@ int process_args(int argc, const char* argv[]) { } rti.base.number_of_scheduling_nodes = (int32_t)num_federates; // FIXME: Loses numbers on 64-bit machines lf_print("RTI: Number of federates: %d", rti.base.number_of_scheduling_nodes); + } else if (strcmp(argv[i], "-nt") == 0 || strcmp(argv[i], "--number_of_transient_federates") == 0) { + if (argc < i + 2) { + lf_print_error("--number_of_transient_federates needs a valid positive argument."); + usage(argc, argv); + return 0; + } + i++; + long num_transient_federates = strtol(argv[i], NULL, 10); + if (num_transient_federates == LONG_MAX || num_transient_federates == LONG_MIN) { + lf_print_error("--number_of_transient_federates needs a valid positive or null integer argument."); + usage(argc, argv); + return 0; + } + rti.number_of_transient_federates = (int32_t)num_transient_federates; // FIXME: Loses numbers on 64-bit machines + lf_print("RTI: Number of transient federates: %d", rti.number_of_transient_federates); } else if (strcmp(argv[i], "-p") == 0 || strcmp(argv[i], "--port") == 0) { if (argc < i + 2) { lf_print_error("--port needs a short unsigned integer argument ( > 0 and < %d).", UINT16_MAX); @@ -272,6 +289,16 @@ int process_args(int argc, const char* argv[]) { return 0; } } + if (rti.base.number_of_scheduling_nodes == 0) { + lf_print_error("--number_of_federates needs a valid positive integer argument."); + usage(argc, argv); + return 0; + } + if (rti.number_of_transient_federates > rti.base.number_of_scheduling_nodes) { + lf_print_error("--number_of_transient_federates cannot be higher than the number of federates."); + usage(argc, argv); + return 0; + } return 1; } int main(int argc, const char* argv[]) { @@ -311,8 +338,11 @@ int main(int argc, const char* argv[]) { lf_print("Tracing the RTI execution in %s file.", rti_trace_file_name); } - lf_print("Starting RTI for %d federates in federation ID %s.", rti.base.number_of_scheduling_nodes, + 
lf_print("Starting RTI for a total of %d federates, with %d being transient, in federation ID %s", + rti.base.number_of_scheduling_nodes, rti.number_of_transient_federates, + rti.federation_id); + assert(rti.base.number_of_scheduling_nodes < UINT16_MAX); // Allocate memory for the federates diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index d7d1c352b..4bf73c0d6 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1509,6 +1509,7 @@ void initialize_federate(federate_info_t* fed, uint16_t id) { strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); fed->server_ip_addr.s_addr = 0; fed->server_port = -1; + fed->is_transient = true; } int32_t start_rti_server(uint16_t port) { @@ -1603,6 +1604,7 @@ void initialize_RTI(rti_remote_t* rti) { rti_remote->authentication_enabled = false; rti_remote->base.tracing_enabled = false; rti_remote->stop_in_progress = false; + rti_remote->num_transient_federates = 0; } // The RTI includes clock.c, which requires the following functions that are defined diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index de6b144aa..d7b9ede7c 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -67,6 +67,7 @@ typedef struct federate_info_t { // RTI has not been informed of the port number. struct in_addr server_ip_addr; // Information about the IP address of the socket // server of the federate. + bool is_transient; } federate_info_t; /** @@ -153,10 +154,16 @@ typedef struct rti_remote_t { * Boolean indicating that authentication is enabled. */ bool authentication_enabled; + /** * Boolean indicating that a stop request is already in progress. 
*/ bool stop_in_progress; + + /** + * Number of transient federates + */ + int32_t number_of_transient_federates; } rti_remote_t; /** From c61b22b76f99cca2d06f83559a9deb92a3fbbff3 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 1 Feb 2024 13:37:09 +0100 Subject: [PATCH 003/148] Differentiate between connecting to persistent federates and transient federates --- core/federated/RTI/rti_remote.c | 164 ++++++++++++++++++++++++++++---- core/federated/RTI/rti_remote.h | 33 ++++++- 2 files changed, 176 insertions(+), 21 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 4bf73c0d6..0f7c371d5 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1415,9 +1415,27 @@ static bool authenticate_federate(int* socket) { } #endif -void lf_connect_to_federates(int socket_descriptor) { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - int socket_id = accept_socket(rti_remote->socket_descriptor_TCP, -1); +void lf_connect_to_persistent_federates(int socket_descriptor) { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes - rti_remote.number_of_transient_federates; i++) { + // Wait for an incoming connection request. + struct sockaddr client_fd; + uint32_t client_length = sizeof(client_fd); + // The following blocks until a federate connects. + int socket_id = -1; + while (1) { + socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); + if (socket_id >= 0) { + // Got a socket + break; + } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { + lf_print_error_system_failure("RTI failed to accept the socket."); + } else { + // Try again + lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); + continue; + } + } + // Wait for the first message from the federate when RTI -a option is on. 
#ifdef __RTI_AUTH__ if (rti_remote->authentication_enabled) { @@ -1445,13 +1463,21 @@ void lf_connect_to_federates(int socket_descriptor) { // synchronization messages. federate_info_t* fed = GET_FED_INFO(fed_id); lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + + // If the federate is transient, then do not count it. + if (fed->is_transient) { + rti_remote.number_of_connected_transient_federates++; + assert(rti_remote.number_of_connected_transient_federates <= rti_remote.number_of_transient_federates); + i--; + lf_print("RTI: Transient federate %d joined.", fed->base.id); + } } else { // Received message was rejected. Try again. i--; } } // All federates have connected. - LF_PRINT_DEBUG("All federates have connected to RTI."); + LF_PRINT_DEBUG("All persistent federates have connected to RTI."); if (rti_remote->clock_sync_global_status >= clock_sync_on) { // Create the thread that performs periodic PTP clock synchronization sessions @@ -1471,6 +1497,83 @@ void lf_connect_to_federates(int socket_descriptor) { } } +void* lf_connect_to_transient_federates_thread(int socket_descriptor) { + // This loop will continue to accept connections of transient federates, as + // soon as there is room, or enable hot swap + + while (!rti_remote.all_persistent_federates_exited) { + // Continue waiting for an incoming connection requests from transients + // to join, or for hot swap. + // Wait for an incoming connection request. + struct sockaddr client_fd; + uint32_t client_length = sizeof(client_fd); + // The following blocks until a federate connects. 
+ int socket_id = -1; + while (1) { + if (!rti_remote.all_persistent_federates_exited) { + return NULL; + } + socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); + if (socket_id >= 0) { + // Got a socket + break; + } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { + lf_print_error_system_failure("RTI failed to accept the socket."); + } else { + // Try again + lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); + continue; + } + } + +// Wait for the first message from the federate when RTI -a option is on. +#ifdef __RTI_AUTH__ + if (rti_remote->authentication_enabled) { + if (!authenticate_federate(&socket_id)) { + lf_print_warning("RTI failed to authenticate the incoming federate."); + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); + socket_id = -1; + // Ignore the federate that failed authentication. + i--; + continue; + } + } +#endif + + // The first message from the federate should contain its ID and the federation ID. + // The function also detects if a hot swap request is initiated. + int32_t fed_id = receive_and_check_fed_id_message(&socket_id, (struct sockaddr_in*)&client_fd); + if (fed_id >= 0 && socket_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && + receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { + } + + // // Create a thread to communicate with the federate. + // // This has to be done after clock synchronization is finished + // // or that thread may end up attempting to handle incoming clock + // // synchronization messages. + // federate_info_t *fed = GET_FED_INFO(fed_id); + // lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + + // // If the federate is transient, then do not count it. 
+ // if (fed->is_transient) { + // rti_remote.number_of_connected_transient_federates++; + // assert(rti_remote.number_of_connected_transient_federates <= rti_remote.number_of_transient_federates); + // i--; + // lf_print("RTI: Transient federate %d joined.", fed->base.id); + // } + // } else { + // // Received message was rejected. Try again. + // i--; + // } + + // FIXME: Check again if runtime clock synchronization should be lauched, + // only if the number of persistent threads is zero. This should be done + // only once, not at every transient connection. + } +} + void* respond_to_erroneous_connections(void* nothing) { initialize_lf_thread_id(); while (true) { @@ -1531,27 +1634,56 @@ int32_t start_rti_server(uint16_t port) { } void wait_for_federates(int socket_descriptor) { - // Wait for connections from federates and create a thread for each. - lf_connect_to_federates(socket_descriptor); + // Wait for connections from persistent federates and create a thread for each. + lf_connect_to_persistent_federates(socket_descriptor); - // All federates have connected. - lf_print("RTI: All expected federates have connected. Starting execution."); + // All persistent federates have connected. + lf_print("RTI: All expected persistent federates have connected. Starting execution."); + if (rti_remote.number_of_transient_federates > 0) { + lf_print("RTI: Transient Federates can join and leave the federation at anytime."); + } - // The socket server will not continue to accept connections after all the federates - // have joined. + // The socket server will only continue to accept connections from transient + // federates. // In case some other federation's federates are trying to join the wrong // federation, need to respond. Start a separate thread to do that. 
lf_thread_t responder_thread; - lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); + lf_thread_t transient_thread; + + // If the federation does not include transient federates, then respond to + // erronous connections. Otherwise, continue to accept transients joining and + // respond to duplicate joing requests. + if (rti_remote.number_of_transient_federates == 0) { + lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); + } else if (rti_remote.number_of_transient_federates > 0) { + lf_thread_create(&transient_thread, connect_to_transient_federates_thread, NULL); + } - // Wait for federate threads to exit. + // Wait for persistent federate threads to exit. void* thread_exit_status; for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { federate_info_t* fed = GET_FED_INFO(i); - lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); - lf_thread_join(fed->thread_id, &thread_exit_status); - pqueue_tag_free(fed->in_transit_message_tags); - lf_print("RTI: Federate %d thread exited.", fed->enclave.id); + if (!fed->is_transient) { + lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); + lf_thread_join(fed->thread_id, &thread_exit_status); + pqueue_tag_free(fed->in_transit_message_tags); + lf_print("RTI: Persistent federate %d thread exited.", fed->enclave.id); + } + } + + rti_remote->all_persistent_federates_exited = true; + + // Wait for transient federate threads to exit, if any. 
+ if (rti_remote.number_of_transient_federates > 0) { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->is_transient) { + lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); + lf_thread_join(fed->thread_id, &thread_exit_status); + pqueue_tag_free(fed->in_transit_message_tags); + lf_print("RTI: Transient federate %d thread exited.", fed->enclave.id); + } + } } rti_remote->all_federates_exited = true; diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index d7b9ede7c..f7da91388 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -106,6 +106,15 @@ typedef struct rti_remote_t { */ volatile bool all_federates_exited; + /** + * Boolean indicating that all persistent federates have exited. + * This gets set to true exactly once before the program waits for + * persistent federates, then exits. + * It is marked volatile because the write is not guarded by a mutex. + * The main thread makes this true. + */ + volatile bool all_persistent_federates_exited; + /** * The ID of the federation that this RTI will supervise. * This should be overridden with a command-line -i option to ensure @@ -164,6 +173,11 @@ typedef struct rti_remote_t { * Number of transient federates */ int32_t number_of_transient_federates; + + /** + * Number of connected transient federates + */ + int32_t number_of_connected_transient_federates; } rti_remote_t; /** @@ -283,7 +297,7 @@ void handle_address_query(uint16_t fed_id); * field of the _RTI.federates[federate_id] array of structs. * * The server_hostname and server_ip_addr fields are assigned - * in lf_connect_to_federates() upon accepting the socket + * in lf_connect_to_persistent_federates() upon accepting the socket * from the remote federate. * * This function assumes the caller does not hold the mutex. 
@@ -354,12 +368,21 @@ void* federate_info_thread_TCP(void* fed); void send_reject(int* socket_id, unsigned char error_code); /** - * Wait for one incoming connection request from each federate, - * and upon receiving it, create a thread to communicate with - * that federate. Return when all federates have connected. + * Wait for one incoming connection request from each (persistent) federate, + * and upon receiving it, create a thread to communicate with that federate. + * Return when all persistent federates have connected. + * @param socket_descriptor The socket on which to accept connections. + */ +void lf_connect_to_persistent_federates(int socket_descriptor); + +/** + * Thread to wait for incoming connection request from transient federates. + * Upon receiving the connection request, check if a hot swap should start or + * simply create a thread to communicate with that federate. + * Stops if all persistent federates exited. * @param socket_descriptor The socket on which to accept connections. */ -void lf_connect_to_federates(int socket_descriptor); +void* lf_connect_to_transient_federates_thread(int socket_descriptor); /** * Thread to respond to new connections, which could be federates of other From bc2b978c062d95b80fde37a5c587e95b72bab669 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 1 Feb 2024 13:37:38 +0100 Subject: [PATCH 004/148] Differentiate between connecting to persistent federates and transient federates, cont. 
--- core/federated/RTI/rti_remote.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 0f7c371d5..fcc4d59a6 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1656,7 +1656,7 @@ void wait_for_federates(int socket_descriptor) { if (rti_remote.number_of_transient_federates == 0) { lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); } else if (rti_remote.number_of_transient_federates > 0) { - lf_thread_create(&transient_thread, connect_to_transient_federates_thread, NULL); + lf_thread_create(&transient_thread, lf_connect_to_transient_federates_thread, NULL); } // Wait for persistent federate threads to exit. From 5fd6c95a00b35abeaff47087e01426e785e018db Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 1 Feb 2024 13:53:11 +0100 Subject: [PATCH 005/148] Add support of the effective_start_tag --- core/tag.c | 6 ++++++ core/threaded/reactor_threaded.c | 1 + 2 files changed, 7 insertions(+) diff --git a/core/tag.c b/core/tag.c index e777eccc1..695bac8d7 100644 --- a/core/tag.c +++ b/core/tag.c @@ -32,6 +32,12 @@ typedef enum _lf_time_type { LF_LOGICAL, LF_PHYSICAL, LF_ELAPSED_LOGICAL, LF_ELA // Global variables declared in tag.h: instant_t start_time = NEVER; +/** + * Only useful for transient federates. It records the effective start tag, to + * be used at startup. Elapsed logical time calculations will use start_time. 
+ */ +tag_t effective_start_tag = {.time = 0LL, .microstep = 0}; + //////////////// Functions declared in tag.h tag_t lf_tag(void* env) { diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index 493bd5a3e..604f4c07c 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -34,6 +34,7 @@ // Global variables defined in tag.c and shared across environments: extern instant_t start_time; +extern tag_t effective_start_tag; /** * The maximum amount of time a worker thread should stall From 9fc49b9ea22bf4f86152998b33af7839da592515 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 1 Feb 2024 14:00:36 +0100 Subject: [PATCH 006/148] Add support of the effective_start_tag cont. --- core/threaded/reactor_threaded.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index 604f4c07c..3a974249a 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -575,7 +575,7 @@ void _lf_initialize_start_tag(environment_t* env) { } // The start time will likely have changed. Adjust the current tag and stop tag. - env->current_tag = (tag_t){.time = start_time, .microstep = 0u}; + env->current_tag = effective_start_tag; if (duration >= 0LL) { // A duration has been specified. Recalculate the stop time. env->stop_tag = ((tag_t){.time = start_time + duration, .microstep = 0}); @@ -604,9 +604,9 @@ void _lf_initialize_start_tag(environment_t* env) { // the required waiting time. Second, this call releases the mutex lock and allows // other threads (specifically, federate threads that handle incoming p2p messages // from other federates) to hold the lock and possibly raise a tag barrier. 
- while (!wait_until(start_time, &env->event_q_changed)) { + while (!wait_until(effective_start_tag.time + _lf_fed_STA_offset, &env->event_q_changed)) { }; - LF_PRINT_DEBUG("Done waiting for start time + STA offset " PRINTF_TIME ".", start_time + lf_fed_STA_offset); + LF_PRINT_DEBUG("Done waiting for start time + STA offset " PRINTF_TIME ".", start_time + _lf_fed_STA_offset); LF_PRINT_DEBUG("Physical time is ahead of current time by " PRINTF_TIME ". This should be close to the STA offset.", lf_time_physical() - start_time); From 7b4f8daf290624f3e36fe0d7ce161c635ab4629c Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 1 Feb 2024 14:21:36 +0100 Subject: [PATCH 007/148] Federate shares its type (persistent or transient) --- core/federated/federate.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 3e0d7046f..f13be5285 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1802,15 +1802,16 @@ void lf_connect_to_federate(uint16_t remote_federate_id) { break; } // Connect was successful. - size_t buffer_length = 1 + sizeof(uint16_t) + 1; + size_t buffer_length = 1 + sizeof(uint16_t) + 1 + 1; unsigned char buffer[buffer_length]; buffer[0] = MSG_TYPE_P2P_SENDING_FED_ID; if (_lf_my_fed_id == UINT16_MAX) { lf_print_error_and_exit("Too many federates! More than %d.", UINT16_MAX - 1); } encode_uint16((uint16_t)_lf_my_fed_id, (unsigned char*)&(buffer[1])); + buffer[1 + sizeof(uint16_t)] = _fed.is_transient ? 
1 : 0; unsigned char federation_id_length = (unsigned char)strnlen(federation_metadata.federation_id, 255); - buffer[sizeof(uint16_t) + 1] = federation_id_length; + buffer[sizeof(uint16_t) + 2] = federation_id_length; // Trace the event when tracing is enabled tracepoint_federate_to_federate(send_FED_ID, _lf_my_fed_id, remote_federate_id, NULL); @@ -1882,23 +1883,25 @@ void lf_connect_to_rti(const char* hostname, int port) { #endif // Send the message type first. - unsigned char buffer[4]; + unsigned char buffer[5]; buffer[0] = MSG_TYPE_FED_IDS; // Next send the federate ID. if (_lf_my_fed_id == UINT16_MAX) { lf_print_error_and_exit("Too many federates! More than %d.", UINT16_MAX - 1); } encode_uint16((uint16_t)_lf_my_fed_id, &buffer[1]); + // Next send the federate type (persistent or transient) + buffer[1 + sizeof(uint16_t)] = _fed.is_transient ? 1 : 0; // Next send the federation ID length. // The federation ID is limited to 255 bytes. size_t federation_id_length = strnlen(federation_metadata.federation_id, 255); - buffer[1 + sizeof(uint16_t)] = (unsigned char)(federation_id_length & 0xff); + buffer[2 + sizeof(uint16_t)] = (unsigned char)(federation_id_length & 0xff); // Trace the event when tracing is enabled tracepoint_federate_to_rti(send_FED_ID, _lf_my_fed_id, NULL); // No need for a mutex here because no other threads are writing to this socket. - if (write_to_socket(_fed.socket_TCP_RTI, 2 + sizeof(uint16_t), buffer)) { + if (write_to_socket(_fed.socket_TCP_RTI, 3 + sizeof(uint16_t), buffer)) { continue; // Try again, possibly on a new port. 
} @@ -2020,7 +2023,7 @@ void* lf_handle_p2p_connections_from_federates(void* env_arg) { } LF_PRINT_LOG("Accepted new connection from remote federate."); - size_t header_length = 1 + sizeof(uint16_t) + 1; + size_t header_length = 1 + sizeof(uint16_t) + 1 + 1; unsigned char buffer[header_length]; int read_failed = read_from_socket(socket_id, header_length, (unsigned char*)&buffer); if (read_failed || buffer[0] != MSG_TYPE_P2P_SENDING_FED_ID) { @@ -2061,6 +2064,7 @@ void* lf_handle_p2p_connections_from_federates(void* env_arg) { // Extract the ID of the sending federate. uint16_t remote_fed_id = extract_uint16((unsigned char*)&(buffer[1])); + bool remote_fed_is_transient = buffer[1 + sizeof(uint16_t)]; LF_PRINT_DEBUG("Received sending federate ID %d.", remote_fed_id); // Trace the event when tracing is enabled From d70704eeee6551743c080ddcf431fa3e7e3f0311 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 5 Feb 2024 15:55:25 +0100 Subject: [PATCH 008/148] Add federation life cycle phase --- core/federated/RTI/rti_remote.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index f7da91388..88ce89606 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -75,18 +75,14 @@ typedef struct federate_info_t { */ typedef enum clock_sync_stat { clock_sync_off, clock_sync_init, clock_sync_on } clock_sync_stat; +/** + * The federation life cycle phases. + */ +typedef enum federation_life_cycle_phase { startup_phase, execution_phase, shutdown_phase } federation_life_cycle_phase; + /** * Structure that an RTI instance uses to keep track of its own and its * corresponding federates' state. - * It is a special case of `rti_common_t` (declared in enclave.h). 
Inheritence - * is mimicked by having the first attributes to be the same as of rti_common_t, - * except that scheduling_nodes attribute here is of type `federate_info_t**`, while it - * is of type `scheduling_node_t**` in `rti_common_t`. - * // **************** IMPORTANT!!! ******************** - * // ** If you make any change to this struct, ** - * // ** you MUST also change rti_common_t in ** - * // ** (enclave.h)! The change must exactly match. ** - * // ************************************************** */ typedef struct rti_remote_t { rti_common_t base; @@ -178,6 +174,11 @@ typedef struct rti_remote_t { * Number of connected transient federates */ int32_t number_of_connected_transient_federates; + + /** + * + */ + federation_life_cycle_phase phase; } rti_remote_t; /** From 556b8538b598f53213a5238592a15f7ef71b8cc0 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 14 Feb 2024 16:51:03 +0100 Subject: [PATCH 009/148] Make the federate read the effective start tag --- core/federated/federate.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index f13be5285..5686c654b 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -44,6 +44,7 @@ // Global variables defined in tag.c: extern instant_t start_time; +extern tag_t effective_start_tag; // Global variable defined in reactor_common.c: extern bool _lf_termination_executed; @@ -968,30 +969,35 @@ static instant_t get_start_time_from_rti(instant_t my_physical_time) { // Send the timestamp marker first. send_time(MSG_TYPE_TIMESTAMP, my_physical_time); - // Read bytes from the socket. We need 9 bytes. + // Read bytes from the socket. We need 17 (1 + 8 + 8) bytes. // Buffer for message ID plus timestamp. 
- size_t buffer_length = 1 + sizeof(instant_t); + size_t buffer_length = MSG_TYPE_TIMESTAMP_START_LENGTH; unsigned char buffer[buffer_length]; read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, buffer_length, buffer, NULL, - "Failed to read MSG_TYPE_TIMESTAMP message from RTI."); - LF_PRINT_DEBUG("Read 9 bytes."); + "Failed to read MSG_TYPE_TIMESTAMP_START message from RTI."); + LF_PRINT_DEBUG("Read 21 bytes."); // First byte received is the message ID. - if (buffer[0] != MSG_TYPE_TIMESTAMP) { + if (buffer[0] != MSG_TYPE_TIMESTAMP_START) { if (buffer[0] == MSG_TYPE_FAILED) { lf_print_error_and_exit("RTI has failed."); } - lf_print_error_and_exit("Expected a MSG_TYPE_TIMESTAMP message from the RTI. Got %u (see net_common.h).", + lf_print_error_and_exit("Expected a MSG_TYPE_TIMESTAMP_START message from the RTI. Got %u (see net_common.h).", buffer[0]); } instant_t timestamp = extract_int64(&(buffer[1])); tag_t tag = {.time = timestamp, .microstep = 0}; - // Trace the event when tracing is enabled - tracepoint_federate_from_rti(receive_TIMESTAMP, _lf_my_fed_id, &tag); - lf_print("Starting timestamp is: " PRINTF_TIME ".", timestamp); + effective_start_tag = extract_tag(&(buffer[9])); + + // Trace the event when tracing is enabled. + // Note that we report in the trace the effective_start_tag. + // This is rather a choice. To be changed, if needed, of course. 
+ tracepoint_federate_from_rti(receive_TIMESTAMP, _lf_my_fed_id, &effective_start_tag); + lf_print("Starting timestamp is: " PRINTF_TIME " and effectve start tag is: " PRINTF_TAG ".", timestamp, + effective_start_tag); LF_PRINT_LOG("Current physical time is: " PRINTF_TIME ".", lf_time_physical()); return timestamp; From 93ca7053960e76215490f9bfd3f00c8271bfad7c Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 15 Feb 2024 11:32:21 +0100 Subject: [PATCH 010/148] Drop the messages intended for a transient federate that is connected but did not started yet --- core/federated/RTI/rti_remote.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index fcc4d59a6..3d9f30cb6 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -287,6 +287,13 @@ void handle_timed_message(federate_info_t* sending_federate, unsigned char* buff } LF_MUTEX_UNLOCK(&rti_mutex); return; + } else { + if (lf_tag_compare(intended_tag, fed->effective_start_tag) < 0) { + // Do not forward the message if the federate is connected, but its + // start_time is not reached yet + lf_mutex_unlock(&rti_mutex); + return; + } } LF_PRINT_DEBUG("RTI forwarding message to port %d of federate %hu of length %zu.", reactor_port_id, federate_id, From 2de69d613c540207527db48ceacb2be8d08ebcba Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 15 Feb 2024 11:58:39 +0100 Subject: [PATCH 011/148] Granting TAG or PTAGs can be delayed if one of the upstream federates is an absent transient --- core/federated/RTI/rti_remote.c | 304 ++++++++++++++++++++++++++++---- core/federated/RTI/rti_remote.h | 14 +- 2 files changed, 285 insertions(+), 33 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 3d9f30cb6..d87fb875d 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -52,17 +52,16 @@ extern int lf_critical_section_enter(environment_t* env) 
{ return lf_mutex_lock( extern int lf_critical_section_exit(environment_t* env) { return lf_mutex_unlock(&rti_mutex); } -void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { - if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || - lf_tag_compare(tag, e->last_provisionally_granted) < 0) { - return; - } - // Need to make sure that the destination federate's thread has already - // sent the starting MSG_TYPE_TIMESTAMP message. - while (e->state == PENDING) { - // Need to wait here. - lf_cond_wait(&sent_start_time); - } +/** + * Notify a tag advance grant (TAG) message to the specified federate immediately. + * + * This function will keep a record of this TAG in the enclave's last_granted + * field. + * + * @param e The enclave. + * @param tag The tag to grant. + */ +void notify_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) { size_t message_length = 1 + sizeof(int64_t) + sizeof(uint32_t); unsigned char buffer[message_length]; buffer[0] = MSG_TYPE_TAG_ADVANCE_GRANT; @@ -85,7 +84,61 @@ void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { } } -void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { +/** + * @brief Thread that sleeps for a period of time, and then wakes up to check if + * a tag advance grant needs to be sent. That is, if the pending tag has not + * been reset to NEVER_TAG, the tag advance grant will be immediate. + * + * @param federate the federate whose tag advance grant needs to be delayed. 
+ */ +void* pending_grant_thread(void* federate) { + federate_info_t* fed = (federate_info_t*)federate; + + interval_t sleep_interval = fed->pending_grant.time - lf_time_physical(); + if (sleep_interval > 0) { + lf_sleep(sleep_interval); + } + + lf_mutex_lock(&rti_mutex); + + // If the pending grant becomes NEVER_TAG, then this means that it should + // not be sent + if (lf_tag_compare(fed->pending_grant, NEVER_TAG) != 0) { + notify_tag_advance_grant_immediate(&(fed->enclave), fed->pending_grant); + fed->pending_grant = NEVER_TAG; + } + lf_mutex_unlock(&rti_mutex); +} + +/** + * Notify a tag advance grant (TAG) message to the specified federate after + * the physical time reaches the tag. A thread is created to this end. + * + * If a provisional tag advance grant is pending, cancel it. If there is another + * pending tag advance grant, do not proceed with the thread creation. + * + * @param e The enclave. + * @param tag The tag to grant. + */ +void notify_tag_advance_grant_delayed(scheduling_node_t* e, tag_t tag) { + federate_info_t* fed = GET_FED_INFO(e->id); + + // Check whether there is already a pending grant + // And check the pending provisional grant as well + lf_mutex_lock(&rti_mutex); + if (lf_tag_compare(fed->pending_grant, NEVER_TAG) == 0) { + // If a tag is issued, then stop any possible provisional tag grant + fed->pending_grant = tag; + fed->pending_provisional_grant = NEVER_TAG; + lf_thread_create(&(fed->pending_grant_thread_id), pending_grant_thread, fed); + } else { + // If there is already a pending tag grant, then let it be sent first + // FIXME: Is this correct? + } + lf_mutex_unlock(&rti_mutex); +} + +void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { return; @@ -96,6 +149,36 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { // Need to wait here. 
lf_cond_wait(&sent_start_time); } + + // Check if sending the tag advance grant needs to be delayed or not + // Delay is needed when a federate has, at least one, absent upstream transient + int num_absent_upstram_transients = 0; + for (int j = 0; j < e->num_upstream; j++) { + federate_info_t* upstream = GET_FED_INFO(e->upstream[j]); + // Do Ignore this enclave if it no longer connected. + if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { + num_absent_upstram_transients++; + break; + } + } + if (num_absent_upstram_transients > 0) { + notify_tag_advance_grant_delayed(e, tag); + } else { + notify_tag_advance_grant_immediate(e, tag); + } +} + +/** + * Notify a provisional tag advance grant (PTAG) message to the specified federate + * immediately. + * + * This function will keep a record of this TAG in the enclave's last_provisionally_granted + * field. + * + * @param e The scheduling node. + * @param tag The tag to grant. + */ +void notify_provisional_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) { size_t message_length = 1 + sizeof(int64_t) + sizeof(uint32_t); unsigned char buffer[message_length]; buffer[0] = MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT; @@ -146,6 +229,166 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { } } +/** + * Thread that sleeps for a period of time, and then wakes up to check if + * a provisional tag advance grant needs to be sent. That is, if the pending + * provisional tag have not been reset to NEVER_TAG, the provisional tag advance + * grant will be immediate. + * + * @param federate the federate whose provisional tag advance grant needs to be delayed. 
+ */ +void* pending_provisional_grant_thread(void* federate) { + federate_info_t* fed = (federate_info_t*)federate; + + interval_t sleep_interval = fed->pending_provisional_grant.time - lf_time_physical(); + if (sleep_interval > 0) { + lf_sleep(sleep_interval); + } + + lf_mutex_lock(&rti_mutex); + + // If the pending grant becomes NEVER_TAG, then this means that it should + // not be sent + if (lf_tag_compare(fed->pending_provisional_grant, NEVER_TAG) != 0) { + notify_provisional_tag_advance_grant_immediate(&(fed->enclave), fed->pending_provisional_grant); + fed->pending_provisional_grant = NEVER_TAG; + } + lf_mutex_unlock(&rti_mutex); +} + +/** + * Notify a provisional tag advance grant (PTAG) message to the specified federate + * after the physical time reaches the tag. A thread is created to this end. + * + * If a tag advance grant or a provisional one is pending, then do not proceed + * with the thread creation. + * + * @param e The scheduling node. + * @param tag The provisional tag to grant. + */ +void notify_provisional_tag_advance_grant_delayed(scheduling_node_t* e, tag_t tag) { + federate_info_t* fed = (federate_info_t*)e; + + // Proceed with the delayed provisional tag grant notification only if + // there is no pending grant and no provisional pending grant + lf_mutex_lock(&rti_mutex); + if ((lf_tag_compare(fed->pending_grant, NEVER_TAG) == 0) && + (lf_tag_compare(fed->pending_provisional_grant, NEVER_TAG) >= 0)) { + fed->pending_provisional_grant = tag; + lf_thread_create(&(fed->pending_provisional_grant_thread_id), pending_provisional_grant_thread, fed); + } + lf_mutex_unlock(&rti_mutex); +} + +void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { + if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || + lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { + return; + } + // Need to make sure that the destination federate's thread has already + // sent the starting MSG_TYPE_TIMESTAMP message. 
+ while (e->state == PENDING) { + // Need to wait here. + lf_cond_wait(&sent_start_time); + } + + // Check if sending the tag advance grant needs to be delayed or not + // Delay is needed when a federate has, at least one, absent upstream transient + int num_absent_upstram_transients = 0; + for (int j = 0; j < e->num_upstream; j++) { + federate_info_t* upstream = GET_FED_INFO(e->upstream[j]); + // Do Ignore this enclave if it no longer connected. + if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { + num_absent_upstram_transients++; + } + } + if (num_absent_upstram_transients > 0) { + notify_provisional_tag_advance_grant_delayed(e, tag); + } else { + notify_provisional_tag_advance_grant_immediate(e, tag); + } +} + +/** + * Thread that sleeps for a period of time, and then wakes up to check if + * a provisional tag advance grant needs to be sent. That is, if the pending + * provisional tag have not been reset to NEVER_TAG, the provisional tag advance + * grant will be immediate. + * + * @param federate the federate whose provisional tag advance grant needs to be delayed. + */ +void* pending_provisional_grant_thread(void* federate) { + federate_info_t* fed = (federate_info_t*)federate; + + interval_t sleep_interval = fed->pending_provisional_grant.time - lf_time_physical(); + if (sleep_interval > 0) { + lf_sleep(sleep_interval); + } + + lf_mutex_lock(&rti_mutex); + + // If the pending grant becomes NEVER_TAG, then this means that it should + // not be sent + if (lf_tag_compare(fed->pending_provisional_grant, NEVER_TAG) != 0) { + notify_provisional_tag_advance_grant_immediate(&(fed->enclave), fed->pending_provisional_grant); + fed->pending_provisional_grant = NEVER_TAG; + } + lf_mutex_unlock(&rti_mutex); +} + +/** + * Notify a provisional tag advance grant (PTAG) message to the specified federate + * after the physical time reaches the tag. A thread is created to this end. 
+ * + * If a tag advance grant or a provisional one is pending, then do not proceed + * with the thread creation. + * + * @param e The scheduling node. + * @param tag The provisional tag to grant. + */ +void notify_provisional_tag_advance_grant_delayed(scheduling_node_t* e, tag_t tag) { + federate_info_t* fed = (federate_info_t*)e; + + // Proceed with the delayed provisional tag grant notification only if + // there is no pending grant and no provisional pending grant + lf_mutex_lock(&rti_mutex); + if ((lf_tag_compare(fed->pending_grant, NEVER_TAG) == 0) && + (lf_tag_compare(fed->pending_provisional_grant, NEVER_TAG) >= 0)) { + fed->pending_provisional_grant = tag; + lf_thread_create(&(fed->pending_provisional_grant_thread_id), pending_provisional_grant_thread, fed); + } + lf_mutex_unlock(&rti_mutex); +} + +void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { + if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || + lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { + return; + } + // Need to make sure that the destination federate's thread has already + // sent the starting MSG_TYPE_TIMESTAMP message. + while (e->state == PENDING) { + // Need to wait here. + lf_cond_wait(&sent_start_time); + } + + // Check if sending the tag advance grant needs to be delayed or not + // Delay is needed when a federate has, at least one, absent upstream transient + int num_absent_upstram_transients = 0; + for (int j = 0; j < e->num_upstream; j++) { + federate_info_t* upstream = GET_FED_INFO(e->upstream[j]); + // Do Ignore this enclave if it no longer connected. 
+ if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { + num_absent_upstram_transients++; + } + } + if (num_absent_upstram_transients > 0) { + notify_provisional_tag_advance_grant_delayed(e, tag); + } else { + notify_provisional_tag_advance_grant_immediate(e, tag); + } +} + void update_federate_next_event_tag_locked(uint16_t federate_id, tag_t next_event_tag) { federate_info_t* fed = GET_FED_INFO(federate_id); tag_t min_in_transit_tag = pqueue_tag_peek_tag(fed->in_transit_message_tags); @@ -1423,7 +1666,7 @@ static bool authenticate_federate(int* socket) { #endif void lf_connect_to_persistent_federates(int socket_descriptor) { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes - rti_remote.number_of_transient_federates; i++) { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates; i++) { // Wait for an incoming connection request. struct sockaddr client_fd; uint32_t client_length = sizeof(client_fd); @@ -1473,10 +1716,10 @@ void lf_connect_to_persistent_federates(int socket_descriptor) { // If the federate is transient, then do not count it. if (fed->is_transient) { - rti_remote.number_of_connected_transient_federates++; - assert(rti_remote.number_of_connected_transient_federates <= rti_remote.number_of_transient_federates); + rti_remote->number_of_connected_transient_federates++; + assert(rti_remote->number_of_connected_transient_federates <= rti_remote->number_of_transient_federates); i--; - lf_print("RTI: Transient federate %d joined.", fed->base.id); + lf_print("RTI: Transient federate %d joined.", fed->enclave.id); } } else { // Received message was rejected. Try again. 
@@ -1504,11 +1747,11 @@ void lf_connect_to_persistent_federates(int socket_descriptor) { } } -void* lf_connect_to_transient_federates_thread(int socket_descriptor) { +void* lf_connect_to_transient_federates_thread(void* nothing) { // This loop will continue to accept connections of transient federates, as // soon as there is room, or enable hot swap - while (!rti_remote.all_persistent_federates_exited) { + while (!rti_remote->all_persistent_federates_exited) { // Continue waiting for an incoming connection requests from transients // to join, or for hot swap. // Wait for an incoming connection request. @@ -1517,7 +1760,7 @@ void* lf_connect_to_transient_federates_thread(int socket_descriptor) { // The following blocks until a federate connects. int socket_id = -1; while (1) { - if (!rti_remote.all_persistent_federates_exited) { + if (!rti_remote->all_persistent_federates_exited) { return NULL; } socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); @@ -1542,8 +1785,9 @@ void* lf_connect_to_transient_federates_thread(int socket_descriptor) { shutdown(socket_id, SHUT_RDWR); close(socket_id); socket_id = -1; - // Ignore the federate that failed authentication. - i--; + socket_descriptor + // Ignore the federate that failed authentication. + i--; continue; } } @@ -1565,10 +1809,10 @@ void* lf_connect_to_transient_federates_thread(int socket_descriptor) { // // If the federate is transient, then do not count it. 
// if (fed->is_transient) { - // rti_remote.number_of_connected_transient_federates++; - // assert(rti_remote.number_of_connected_transient_federates <= rti_remote.number_of_transient_federates); - // i--; - // lf_print("RTI: Transient federate %d joined.", fed->base.id); + // rti_remote->number_of_connected_transient_federates++; + // assert(rti_remote->number_of_connected_transient_federates <= + // number_ofrti_remote->number_of_transient_federates_transient_federates); i--; lf_print("RTI: Transient + // federate %d joined.", fed->base.id); // } // } else { // // Received message was rejected. Try again. @@ -1646,7 +1890,7 @@ void wait_for_federates(int socket_descriptor) { // All persistent federates have connected. lf_print("RTI: All expected persistent federates have connected. Starting execution."); - if (rti_remote.number_of_transient_federates > 0) { + if (rti_remote->number_of_transient_federates > 0) { lf_print("RTI: Transient Federates can join and leave the federation at anytime."); } @@ -1660,9 +1904,9 @@ void wait_for_federates(int socket_descriptor) { // If the federation does not include transient federates, then respond to // erronous connections. Otherwise, continue to accept transients joining and // respond to duplicate joing requests. - if (rti_remote.number_of_transient_federates == 0) { + if (rti_remote->number_of_transient_federates == 0) { lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); - } else if (rti_remote.number_of_transient_federates > 0) { + } else if (rti_remote->number_of_transient_federates > 0) { lf_thread_create(&transient_thread, lf_connect_to_transient_federates_thread, NULL); } @@ -1681,7 +1925,7 @@ void wait_for_federates(int socket_descriptor) { rti_remote->all_persistent_federates_exited = true; // Wait for transient federate threads to exit, if any. 
- if (rti_remote.number_of_transient_federates > 0) { + if (rti_remote->number_of_transient_federates > 0) { for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { federate_info_t* fed = GET_FED_INFO(i); if (fed->is_transient) { @@ -1743,7 +1987,7 @@ void initialize_RTI(rti_remote_t* rti) { rti_remote->authentication_enabled = false; rti_remote->base.tracing_enabled = false; rti_remote->stop_in_progress = false; - rti_remote->num_transient_federates = 0; + rti_remote->number_of_transient_federates = 0; } // The RTI includes clock.c, which requires the following functions that are defined diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index 88ce89606..573410988 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -67,7 +67,16 @@ typedef struct federate_info_t { // RTI has not been informed of the port number. struct in_addr server_ip_addr; // Information about the IP address of the socket // server of the federate. - bool is_transient; + bool is_transient; // Indicates whether the federate is transient or persistent. + tag_t effective_start_tag; // Records the start time of the federate, which is + // mainly useful for transient federates + tag_t pending_grant; // The pending tag advance grant + tag_t pending_provisional_grant; // The pending provisional tag advance grant + lf_thread_t pending_grant_thread_id; // The ID of the thread handling the pending + // tag grant + lf_thread_t pending_provisional_grant_thread_id; // The ID of the thread handling + // the pending provitional tag grant + } federate_info_t; /** @@ -381,9 +390,8 @@ void lf_connect_to_persistent_federates(int socket_descriptor); * Upon receiving the connection request, check if a hot swap should start or * simply create a thread to communicate with that federate. * Stops if all persistent federates exited. - * @param socket_descriptor The socket on which to accept connections. 
*/ -void* lf_connect_to_transient_federates_thread(int socket_descriptor); +void* lf_connect_to_transient_federates_thread(void* nothing); /** * Thread to respond to new connections, which could be federates of other From fc6e60178fc44c58eb0aa082c52bb8e6fb4eb5d1 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 15 Feb 2024 12:17:00 +0100 Subject: [PATCH 012/148] When all upstream federates are not connected (transients), then the TAG to issue is the NET. This is to avoid starvation. --- core/federated/RTI/rti_common.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/core/federated/RTI/rti_common.c b/core/federated/RTI/rti_common.c index 3a1b16fab..f1229493f 100644 --- a/core/federated/RTI/rti_common.c +++ b/core/federated/RTI/rti_common.c @@ -155,6 +155,9 @@ tag_t eimt_strict(scheduling_node_t* e) { tag_advance_grant_t tag_advance_grant_if_safe(scheduling_node_t* e) { tag_advance_grant_t result = {.tag = NEVER_TAG, .is_provisional = false}; + // Check how many upstream federates are connected + int num_connected_upstream = 0; + // Find the earliest LTC of upstream scheduling_nodes (M). tag_t min_upstream_completed = FOREVER_TAG; @@ -164,6 +167,7 @@ tag_advance_grant_t tag_advance_grant_if_safe(scheduling_node_t* e) { // Ignore this enclave/federate if it is not connected. if (upstream->state == NOT_CONNECTED) continue; + num_connected_upstream++; // Adjust by the "after" delay. 
// Note that "no delay" is encoded as NEVER, @@ -176,8 +180,15 @@ tag_advance_grant_t tag_advance_grant_if_safe(scheduling_node_t* e) { } LF_PRINT_LOG("RTI: Minimum upstream LTC for federate/enclave %d is " PRINTF_TAG "(adjusted by after delay).", e->id, min_upstream_completed.time - start_time, min_upstream_completed.microstep); - if (lf_tag_compare(min_upstream_completed, e->last_granted) > 0 && - lf_tag_compare(min_upstream_completed, e->next_event) >= 0 // The enclave has to advance its tag + + if (num_connected_upstream == 0) { + // When none of the upstream federates is connected (case of transients), + if (lf_tag_compare(e->next_event, FOREVER_TAG) != 0) { + result.tag = e->next_event; + return result; + } + } else if (lf_tag_compare(min_upstream_completed, e->last_granted) > 0 && + lf_tag_compare(min_upstream_completed, e->next_event) >= 0 // The enclave has to advance its tag ) { result.tag = min_upstream_completed; return result; From 9dff3d04ce31e363be28c925c8dbbf19578609e3 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 15 Feb 2024 12:29:47 +0100 Subject: [PATCH 013/148] Only persistent federates are accounted for requesting stop --- core/federated/RTI/rti_remote.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index d87fb875d..e4b5dcb8b 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -688,18 +688,21 @@ static void broadcast_stop_time_to_federates_locked() { } /** - * Mark a federate requesting stop. If the number of federates handling stop reaches the - * NUM_OF_FEDERATES, broadcast MSG_TYPE_STOP_GRANTED to every federate. + * Mark a federate requesting stop. If the number of federates handling stop reaches + * the number of persistent federates, broadcast MSG_TYPE_STOP_GRANTED to every federate. * This function assumes the _RTI.mutex is already locked. * @param fed The federate that has requested a stop. 
* @return 1 if stop time has been sent to all federates and 0 otherwise. */ static int mark_federate_requesting_stop(federate_info_t* fed) { if (!fed->requested_stop) { - rti_remote->base.num_scheduling_nodes_handling_stop++; + // Increment the number of federates handling stop only if it is persistent + if (!fed->is_transient) + rti_remote->base.num_scheduling_nodes_handling_stop++; fed->requested_stop = true; } - if (rti_remote->base.num_scheduling_nodes_handling_stop == rti_remote->base.number_of_scheduling_nodes) { + if (rti_remote->base.num_scheduling_nodes_handling_stop == + (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { // We now have information about the stop time of all // federates. broadcast_stop_time_to_federates_locked(); From c1068dfea2d36e2de918cc5548e97d67997971d4 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 15 Feb 2024 13:28:34 +0100 Subject: [PATCH 014/148] Compute and send the effective start tag of a joining transient --- core/federated/RTI/rti_remote.c | 217 +++++++++++++++++++++++++++----- 1 file changed, 187 insertions(+), 30 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index e4b5dcb8b..b6e578bad 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -904,50 +904,73 @@ void handle_address_ad(uint16_t federate_id) { } } -void handle_timestamp(federate_info_t* my_fed) { - unsigned char buffer[sizeof(int64_t)]; - // Read bytes from the socket. We need 8 bytes. - read_from_socket_fail_on_error(&my_fed->socket, sizeof(int64_t), (unsigned char*)&buffer, NULL, - "ERROR reading timestamp from federate %d.\n", my_fed->enclave.id); +/** + * Send to the start time to the federate my_fed. + * This function assumes the caller does not hold the mutex. + * + * If it is the startup phase, the start_time will be the maximum received timestamps + * plus an offset. 
The federate will then receive identical federation_start_time + * and federate_start_tag.time (the federate_start_tag.microstep will be 0). + * If, however, the startup phase has passed, the federate will receive different + * values than stated above. + * + * @param my_fed the federate to send the start time to. + * @param federation_start_time the federation start_time + * @param federate_start_tag the federate effective start tag + */ +void send_start_tag(federate_info_t* my_fed, instant_t federation_start_time, tag_t federate_start_tag) { + // Send back to the federate the maximum time plus an offset on a TIMESTAMP_START + // message. + // In the startup phase, federates will receive identical start_time and + // effective_start_tag + unsigned char start_time_buffer[MSG_TYPE_TIMESTAMP_LENGTH]; + start_time_buffer[0] = MSG_TYPE_TIMESTAMP; + encode_int64(swap_bytes_if_big_endian_int64(start_time), &start_time_buffer[1]); - int64_t timestamp = swap_bytes_if_big_endian_int64(*((int64_t*)(&buffer))); if (rti_remote->base.tracing_enabled) { - tag_t tag = {.time = timestamp, .microstep = 0}; - tracepoint_rti_from_federate(receive_TIMESTAMP, my_fed->enclave.id, &tag); - } - LF_PRINT_DEBUG("RTI received timestamp message with time: " PRINTF_TIME ".", timestamp); - - LF_MUTEX_LOCK(&rti_mutex); - rti_remote->num_feds_proposed_start++; - if (timestamp > rti_remote->max_start_time) { - rti_remote->max_start_time = timestamp; + tag_t tag = {.time = start_time, .microstep = 0}; + tracepoint_rti_to_federate(rti_remote->base.trace, send_TIMESTAMP, my_fed->enclave.id, &tag); } - if (rti_remote->num_feds_proposed_start == rti_remote->base.number_of_scheduling_nodes) { - // All federates have proposed a start time. - lf_cond_broadcast(&received_start_times); - } else { - // Some federates have not yet proposed a start time. - // wait for a notification. 
- while (rti_remote->num_feds_proposed_start < rti_remote->base.number_of_scheduling_nodes) { - // FIXME: Should have a timeout here? - lf_cond_wait(&received_start_times); - } + if (write_to_socket(my_fed->socket, MSG_TYPE_TIMESTAMP_LENGTH, start_time_buffer)) { + lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); } + LF_MUTEX_LOCK(&rti_mutex); + // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP + // message has been sent. That MSG_TYPE_TIMESTAMP message grants time advance to + // the federate to the start time. + my_fed->enclave.state = GRANTED; + lf_cond_broadcast(&sent_start_time); + LF_PRINT_LOG("RTI sent start time " PRINTF_TIME " to federate %d.", start_time, my_fed->enclave.id); LF_MUTEX_UNLOCK(&rti_mutex); +} - // Send back to the federate the maximum time plus an offset on a TIMESTAMP +/** + * Send the start time to the federate my_fed. + * This function assumes the caller does not hold the mutex. + * + * If it is the startup phase, the start_time will be the maximum of the received timestamps + * plus an offset. The federate will then receive identical federation_start_time + * and federate_start_tag.time (the federate_start_tag.microstep will be 0). + * If, however, the startup phase has passed, the federate will receive different + * values than stated above. + * + * @param my_fed the federate to send the start time to. + * @param federation_start_time the federation start_time + * @param federate_start_tag the federate effective start tag + */ +void send_start_tag(federate_info_t* my_fed, instant_t federation_start_time, tag_t federate_start_tag) { + // Send back to the federate the maximum time plus an offset on a TIMESTAMP_START // message. 
+ // In the startup phase, federates will receive identical start_time and + // effective_start_tag unsigned char start_time_buffer[MSG_TYPE_TIMESTAMP_LENGTH]; start_time_buffer[0] = MSG_TYPE_TIMESTAMP; - // Add an offset to this start time to get everyone starting together. - start_time = rti_remote->max_start_time + DELAY_START; - lf_tracing_set_start_time(start_time); encode_int64(swap_bytes_if_big_endian_int64(start_time), &start_time_buffer[1]); if (rti_remote->base.tracing_enabled) { tag_t tag = {.time = start_time, .microstep = 0}; - tracepoint_rti_to_federate(send_TIMESTAMP, my_fed->enclave.id, &tag); + tracepoint_rti_to_federate(rti_remote->base.trace, send_TIMESTAMP, my_fed->enclave.id, &tag); } if (write_to_socket(my_fed->socket, MSG_TYPE_TIMESTAMP_LENGTH, start_time_buffer)) { lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); @@ -963,6 +986,140 @@ void handle_timestamp(federate_info_t* my_fed) { LF_MUTEX_UNLOCK(&rti_mutex); } +void handle_timestamp(federate_info_t* my_fed) { + unsigned char buffer[sizeof(int64_t)]; + // Read bytes from the socket. We need 8 bytes. + read_from_socket_fail_on_error(&my_fed->socket, sizeof(int64_t), (unsigned char*)&buffer, NULL, + "ERROR reading timestamp from federate %d.\n", my_fed->enclave.id); + + int64_t timestamp = swap_bytes_if_big_endian_int64(*((int64_t*)(&buffer))); + if (rti_remote->base.tracing_enabled) { + tag_t tag = {.time = timestamp, .microstep = 0}; + tracepoint_rti_from_federate(receive_TIMESTAMP, my_fed->enclave.id, &tag); + } + LF_PRINT_DEBUG("RTI received timestamp message with time: " PRINTF_TIME ".", timestamp); + + LF_MUTEX_LOCK(&rti_mutex); + + // Processing the TIMESTAMP depends on whether it is the startup phase (all + // persistent federates joined) or not. 
+ if (rti_remote->phase == + startup_phase) { // This is equivalent to: rti_remote->num_feds_proposed_start < (rti_remote->number_of_enclaves - + // rti_remote->number_of_transient_federates) + if (timestamp > rti_remote->max_start_time) { + rti_remote->max_start_time = timestamp; + } + // Check that persistent federates did propose a start_time + if (!my_fed->is_transient) { + rti_remote->num_feds_proposed_start++; + } + if (rti_remote->num_feds_proposed_start == + (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { + // All persistent federates have proposed a start time. + lf_cond_broadcast(&received_start_times); + rti_remote->phase = execution_phase; + } else { + // Some federates have not yet proposed a start time. + // wait for a notification. + while (rti_remote->num_feds_proposed_start < + (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { + // FIXME: Should have a timeout here? + lf_cond_wait(&received_start_times); + } + } + + LF_MUTEX_UNLOCK(&rti_mutex); + + // Send back to the federate the maximum time plus an offset on a TIMESTAMP + // message. + // Add an offset to this start time to get everyone starting together. + start_time = rti_remote->max_start_time + DELAY_START; + my_fed->effective_start_tag = (tag_t){.time = start_time, .microstep = 0u}; + send_start_tag(my_fed, start_time, my_fed->effective_start_tag); + } else if (rti_remote->phase == shutdown_phase) { + // Do not answer the federate if the federation is in shutdown phase + // Or maybe send an error message? NOTE(review): rti_mutex is already held here, so the LOCK below looks like it should be an UNLOCK. + LF_MUTEX_LOCK(&rti_mutex); + return; + } else { // The federation is in the execution phase + // A transient has joined after the startup phase + // At this point, we already hold the mutex + + // This is rather a possible extreme corner case, where a transient sends its timestamp, and only + // enters the if section after all persistents have joined. 
+ if (timestamp < start_time) { + timestamp = start_time; + } + + //// Algorithm for computing the effective_start_time of a joining transient + // The effective_start_time will be the max among all the following tags: + // - At tag: (joining time, 0 microstep) + // - The latest completed logical tag + 1 microstep + // - The latest granted tag + 1 microstep, of every downstream federate + // - The latest provisionnaly granted tag + 1 microstep, of every downstream federate + + my_fed->effective_start_tag = (tag_t){.time = timestamp, .microstep = 0u}; + + if (lf_tag_compare(my_fed->enclave.completed, my_fed->effective_start_tag) > 0) { + my_fed->effective_start_tag = my_fed->enclave.completed; + my_fed->effective_start_tag.microstep++; + } + + // Iterate over the downstream federates + for (int j = 0; j < my_fed->enclave.num_downstream; j++) { + federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); + + // Ignore this federate if it has resigned. + if (downstream->enclave.state == NOT_CONNECTED) { + continue; + } + + // Get the max over the TAG of the downstreams + if (lf_tag_compare(downstream->enclave.last_granted, my_fed->effective_start_tag) > 0) { + my_fed->effective_start_tag = downstream->enclave.last_granted; + my_fed->effective_start_tag.microstep++; + } + + // Get the max over the PTAG of the downstreams + if (lf_tag_compare(downstream->enclave.last_provisionally_granted, my_fed->effective_start_tag) > 0) { + my_fed->effective_start_tag = downstream->enclave.last_provisionally_granted; + my_fed->effective_start_tag.microstep++; + } + } + + // For every downstream that has a pending grant that is higher then the + // effective_start_time of the federate, cancel it + for (int j = 0; j < my_fed->enclave.num_downstream; j++) { + federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); + + // Ignore this federate if it has resigned. 
+ if (downstream->enclave.state == NOT_CONNECTED) { + continue; + } + + // Check the pending tag grant, if any, and keep it only if it is + // sonner than the effective start tag + if (lf_tag_compare(downstream->pending_grant, NEVER_TAG) != 0 && + lf_tag_compare(downstream->pending_grant, my_fed->effective_start_tag) > 0) { + downstream->pending_grant = NEVER_TAG; + } + // Same for the possible pending provisional tag grant + if (lf_tag_compare(downstream->pending_provisional_grant, NEVER_TAG) != 0 && + lf_tag_compare(downstream->pending_provisional_grant, my_fed->effective_start_tag) > 0) { + downstream->pending_provisional_grant = NEVER_TAG; + } + } + + LF_MUTEX_UNLOCK(&rti_mutex); + + // Once the effective start time set, sent it to the joining transient, + // together with the start time of the federation. + + // Send the start time + send_start_tag(my_fed, start_time, my_fed->effective_start_tag); + } +} + void send_physical_clock(unsigned char message_type, federate_info_t* fed, socket_type_t socket_type) { if (fed->enclave.state == NOT_CONNECTED) { lf_print_warning("Clock sync: RTI failed to send physical time to federate %d. Socket not connected.\n", From 71834e62cca69e3b6a7dcf6da7e78511f782471b Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 15 Feb 2024 14:37:56 +0100 Subject: [PATCH 015/148] Start on enabling the hot swap mechanism --- core/federated/RTI/rti_remote.c | 87 ++++++++++++++++++++++++++++----- 1 file changed, 76 insertions(+), 11 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index b6e578bad..e485b571f 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1425,6 +1425,21 @@ void* federate_info_thread_TCP(void* fed) { // Prevent multiple threads from closing the same socket at the same time. 
LF_MUTEX_LOCK(&rti_mutex); close(my_fed->socket); // from unistd.h + // Manual clean, in case of a transient federate + if (my_fed->is_transient) { + free_in_transit_message_q(my_fed->in_transit_message_tags); + lf_print("RTI: Transient Federate %d thread exited.", my_fed->enclave.id); + + // Update the number of connected transient federates + _f_rti->number_of_connected_transient_federates--; + + // Reset the status of the leaving federate + reset_transient_federate(my_fed); + } + // Signal the hot swap mechanism, if needed + if (hot_swap_in_progress && hot_swap_federate->enclave.id == my_fed->enclave.id) { + hot_swap_old_resigned = true; + } LF_MUTEX_UNLOCK(&rti_mutex); return NULL; } @@ -1516,7 +1531,7 @@ static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_ LF_PRINT_DEBUG("RTI received federation ID: %s.", federation_id_received); if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_FED_ID, fed_id, NULL); + tracepoint_rti_from_federate(rti_remote->base.trace, receive_FED_ID, fed_id, NULL); } // Compare the received federation ID to mine. if (strncmp(rti_remote->federation_id, federation_id_received, federation_id_length) != 0) { @@ -1524,7 +1539,7 @@ static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_ lf_print_warning("Federate from another federation %s attempted to connect to RTI in federation %s.", federation_id_received, rti_remote->federation_id); if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); } send_reject(socket_id, FEDERATION_ID_DOES_NOT_MATCH); return -1; @@ -1533,23 +1548,63 @@ static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_ // Federate ID is out of range. 
lf_print_error("RTI received federate ID %d, which is out of range.", fed_id); if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); } send_reject(socket_id, FEDERATE_ID_OUT_OF_RANGE); return -1; } else { + // Find out if it is a new connection or a hot swap. + // Reject if: + // - duplicate of a connected persistent federate + // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that + // particular federate + // - or it is a hot swap, but it is not the execution phase yet if ((rti_remote->base.scheduling_nodes[fed_id])->state != NOT_CONNECTED) { - lf_print_error("RTI received duplicate federate ID: %d.", fed_id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + if (!is_transient) { + lf_print_error("RTI received duplicate federate ID: %d.", fed_id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); + } + send_reject(socket_id, FEDERATE_ID_IN_USE); + return -1; + } else if (hot_swap_in_progress || rti_remote->phase != execution_phase) { + lf_print_warning("RTI rejects the connection of transient federate %d, \ + because a hot swap is already in progress for federate %d. 
\n\ + Only one hot swap operation is allowed at a time.", + fed_id, hot_swap_federate->enclave.id); + if (_f_rti->tracing_enabled) { + tracepoint_rti_to_federate(_f_rti->trace, send_REJECT, fed_id, NULL); + } + send_reject(socket_id, FEDERATE_ID_IN_USE); + return -1; } - send_reject(socket_id, FEDERATE_ID_IN_USE); - return -1; } } } } - federate_info_t* fed = GET_FED_INFO(fed_id); + + federate_info_t* fed_twin = GET_FED_INFO(fed_id); + federate_info_t* fed; + // If the federate is already connected (making the request a duplicate), and that + // the federate is transient, and it is the execution phase, then mark that a hot + // swap is in progreass and initialize the hot_swap_federate. + // Otherwise, proceed with a normal transinet connection + if (fed_twin->enclave.state != NOT_CONNECTED && is_transient && fed_twin->is_transient && + rti_remote->phase == execution_phase && !hot_swap_in_progress) { + // Allocate memory for the new federate and initilize it + hot_swap_federate = (federate_info_t*)malloc(sizeof(federate_info_t)); + initialize_federate(hot_swap_federate, fed_id); + + // Set that hot swap is in progress + hot_swap_in_progress = true; + // free(fed); // Free the old memory to prevent memory leak + fed = hot_swap_federate; + lf_print("RTI: Hot Swap starting for federate %d.", fed_id); + } else { + fed = fed_twin; + fed->is_transient = is_transient; + } + // The MSG_TYPE_FED_IDS message has the right federation ID. // Get the peer address from the connected socket_id. Then assign it as the federate's socket server. 
@@ -1614,7 +1669,12 @@ static int receive_connection_information(int* socket_id, uint16_t fed_id) { send_reject(socket_id, UNEXPECTED_MESSAGE); return 0; } else { - federate_info_t* fed = GET_FED_INFO(fed_id); + federate_info_t* fed; + if (hot_swap_in_progress) { + fed = hot_swap_federate; + } else { + fed = GET_FED_INFO(fed_id); + } // Read the number of upstream and downstream connections fed->enclave.num_upstream = extract_int32(&(connection_info_header[1])); fed->enclave.num_downstream = extract_int32(&(connection_info_header[1 + sizeof(int32_t)])); @@ -1698,7 +1758,12 @@ static int receive_udp_message_and_set_up_clock_sync(int* socket_id, uint16_t fe send_reject(socket_id, UNEXPECTED_MESSAGE); return 0; } else { - federate_info_t* fed = GET_FED_INFO(fed_id); + federate_info_t* fed; + if (hot_swap_in_progress) { + fed = hot_swap_federate; + } else { + fed = GET_FED_INFO(fed_id); + } if (rti_remote->clock_sync_global_status >= clock_sync_init) { // If no initial clock sync, no need perform initial clock sync. 
uint16_t federate_UDP_port_number = extract_uint16(&(response[1])); From ba38bc1467a996fa28abfe5998251128230cb637 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 15 Feb 2024 17:42:09 +0100 Subject: [PATCH 016/148] More on the hot swap mechanism + various fixes --- core/federated/RTI/rti_remote.c | 133 +++++++++++++++++++++++++------- core/federated/RTI/rti_remote.h | 6 ++ 2 files changed, 111 insertions(+), 28 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index e485b571f..c11f8510c 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -38,6 +38,15 @@ extern instant_t start_time; */ static rti_remote_t* rti_remote; +// Reference to the federate instance to support hot swap +federate_info_t* hot_swap_federate; + +// Indicates if a hot swap process is in progress +bool hot_swap_in_progress = false; + +// Indicates that the old federate has stopped +bool hot_swap_old_resigned = false; + bool _lf_federate_reports_error = false; // A convenient macro for getting the `federate_info_t *` at index `_idx` @@ -1427,11 +1436,12 @@ void* federate_info_thread_TCP(void* fed) { close(my_fed->socket); // from unistd.h // Manual clean, in case of a transient federate if (my_fed->is_transient) { - free_in_transit_message_q(my_fed->in_transit_message_tags); + // FIXME: Aren't there transit messages anymore??? + // free_in_transit_message_q(my_fed->in_transit_message_tags); lf_print("RTI: Transient Federate %d thread exited.", my_fed->enclave.id); // Update the number of connected transient federates - _f_rti->number_of_connected_transient_federates--; + rti_remote->number_of_connected_transient_federates--; // Reset the status of the leaving federate reset_transient_federate(my_fed); @@ -1572,8 +1582,8 @@ static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_ because a hot swap is already in progress for federate %d.
\n\ Only one hot swap operation is allowed at a time.", fed_id, hot_swap_federate->enclave.id); - if (_f_rti->tracing_enabled) { - tracepoint_rti_to_federate(_f_rti->trace, send_REJECT, fed_id, NULL); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); } send_reject(socket_id, FEDERATE_ID_IN_USE); return -1; @@ -1890,6 +1900,7 @@ static bool authenticate_federate(int* socket) { } #endif +// FIXME: The socket descriptor here (parameter) is not used. Should be removed? void lf_connect_to_persistent_federates(int socket_descriptor) { for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates; i++) { // Wait for an incoming connection request. @@ -2021,32 +2032,74 @@ void* lf_connect_to_transient_federates_thread(void* nothing) { // The first message from the federate should contain its ID and the federation ID. // The function also detects if a hot swap request is initiated. int32_t fed_id = receive_and_check_fed_id_message(&socket_id, (struct sockaddr_in*)&client_fd); - if (fed_id >= 0 && socket_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && + + if (fed_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { + lf_mutex_lock(&rti_mutex); + if (hot_swap_in_progress) { + lf_print("RTI: Hot swap confirmed for federate %d.", fed_id); + + // Then send STOP + federate_info_t* fed_old = GET_FED_INFO(fed_id); + hot_swap_federate->enclave.completed = fed_old->enclave.completed; + + LF_PRINT_LOG("RTI: Send MSG_TYPE_STOP to old federate %d.", fed_id); + send_stop(fed_old); + lf_mutex_unlock(&rti_mutex); + + // Wait for the old federate to send MSG_TYPE_RESIGN + LF_PRINT_LOG("RTI: Waiting for old federate %d to send resign.", fed_id); + // FIXME: Should this have a timeout? 
+ while (!hot_swap_old_resigned) + ; + + // The latest LTC is the tag at which the old federate resigned. This is useful + // for computing the effective_start_time of the new joining federate. + hot_swap_federate->enclave.completed = fed_old->enclave.completed; + + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. + lf_thread_create(&(hot_swap_federate->thread_id), federate_info_thread_TCP, hot_swap_federate); + + // Redirect the federate in rti_remote + rti_remote->base.scheduling_nodes[fed_id] = (scheduling_node_t*)hot_swap_federate; + + // Free the old federate memory and reset the Hot wap indicators + // FIXME: Is this enough to free the memory allocated to the federate? + free(fed_old); + lf_mutex_lock(&rti_mutex); + hot_swap_in_progress = false; + lf_mutex_unlock(&rti_mutex); + + lf_print("RTI: Hot swap succeeded for federate %d.", fed_id); + } else { + lf_mutex_unlock(&rti_mutex); + + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. + federate_info_t* fed = GET_FED_INFO(fed_id); + lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + lf_print("RTI: Transient federate %d joined.", fed_id); + } + rti_remote->number_of_connected_transient_federates++; + } else { + // If a hot swap was initialed, but the connection information or/and clock + // synchronization fail, then reset hot_swap_in_profress, and free the memory + // allocated for hot_swap_federate + if (hot_swap_in_progress) { + lf_print("RTI: Hot swap canceled for federate %d.", fed_id); + lf_mutex_lock(&rti_mutex); + hot_swap_in_progress = false; + lf_mutex_unlock(&rti_mutex); + + // FIXME: Is this enough to free the memory of a federate_info_t data structure? 
+ free(hot_swap_federate); + } } - - // // Create a thread to communicate with the federate. - // // This has to be done after clock synchronization is finished - // // or that thread may end up attempting to handle incoming clock - // // synchronization messages. - // federate_info_t *fed = GET_FED_INFO(fed_id); - // lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); - - // // If the federate is transient, then do not count it. - // if (fed->is_transient) { - // rti_remote->number_of_connected_transient_federates++; - // assert(rti_remote->number_of_connected_transient_federates <= - // number_ofrti_remote->number_of_transient_federates_transient_federates); i--; lf_print("RTI: Transient - // federate %d joined.", fed->base.id); - // } - // } else { - // // Received message was rejected. Try again. - // i--; - // } - - // FIXME: Check again if runtime clock synchronization should be lauched, - // only if the number of persistent threads is zero. This should be done - // only once, not at every transient connection. } } @@ -2089,6 +2142,27 @@ void initialize_federate(federate_info_t* fed, uint16_t id) { fed->server_ip_addr.s_addr = 0; fed->server_port = -1; fed->is_transient = true; + fed->effective_start_tag = NEVER_TAG; + fed->pending_grant = NEVER_TAG; + fed->pending_provisional_grant = NEVER_TAG; +} + +void reset_transient_federate(federate_info_t* fed) { + fed->enclave.next_event = NEVER_TAG; + fed->enclave.state = NOT_CONNECTED; + // Reset of the federate-related attributes + fed->socket = -1; // No socket. 
+ fed->clock_synchronization_enabled = true; + fed->in_transit_message_tags = pqueue_tag_init(10); + strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); + fed->server_ip_addr.s_addr = 0; + fed->server_port = -1; + fed->requested_stop = false; + fed->is_transient = true; + fed->effective_start_tag = NEVER_TAG; + fed->pending_grant = NEVER_TAG; + fed->pending_provisional_grant = NEVER_TAG; + // FIXME: There is room though to check if the interface has changed??? Do we allow this? } int32_t start_rti_server(uint16_t port) { @@ -2148,6 +2222,8 @@ void wait_for_federates(int socket_descriptor) { } rti_remote->all_persistent_federates_exited = true; + rti_remote->phase = shutdown_phase; + lf_print("RTI: All persistent threads exited."); // Wait for transient federate threads to exit, if any. if (rti_remote->number_of_transient_federates > 0) { @@ -2213,6 +2289,7 @@ void initialize_RTI(rti_remote_t* rti) { rti_remote->base.tracing_enabled = false; rti_remote->stop_in_progress = false; rti_remote->number_of_transient_federates = 0; + rti_remote->phase = startup_phase; } // The RTI includes clock.c, which requires the following functions that are defined diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index 573410988..17196e00e 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -406,6 +406,12 @@ void* respond_to_erroneous_connections(void* nothing); */ void initialize_federate(federate_info_t* fed, uint16_t id); +/** + * Reset the federate. The federate has to be transient. + * @param fed A pointer to the federate + */ +void reset_transient_federate(federate_info_t* fed); + /** * Start the socket server for the runtime infrastructure (RTI) and * return the socket descriptor. From c0645cccdeba43e04512337d445177bc0c377759 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 15 Feb 2024 21:19:15 +0100 Subject: [PATCH 017/148] RTI can sent a request for immediate stop. 
This is required by the hot swap mechanism --- core/federated/RTI/rti_remote.c | 20 ++++++++++++++++++++ include/core/federated/network/net_common.h | 10 ++++++++++ 2 files changed, 30 insertions(+) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index c11f8510c..274e5c259 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1983,6 +1983,26 @@ void lf_connect_to_persistent_federates(int socket_descriptor) { } } +/** + * @brief A request for immediate stop to the federate + * + * @param fed: the deferate to stop + */ +void send_stop(federate_info_t* fed) { + // Reply with a stop granted to all federates + unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; + outgoing_buffer[0] = MSG_TYPE_STOP; + lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); + + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP, fed->enclave.id, NULL); + } + write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, + "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); + + LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); +} + void* lf_connect_to_transient_federates_thread(void* nothing) { // This loop will continue to accept connections of transient federates, as // soon as there is room, or enable hot swap diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index 47826be3e..3dbf0a92f 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -617,6 +617,16 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define MSG_TYPE_FAILED 25 +/** + * Byte sent by the RTI ordering the federate to stop. Upon receiving the meaasage, + * the federate will call lf_stop(), which will make him resign at its current_tag + * plus 1 microstep. 
+ * The message consists of this single byte only (MSG_TYPE_STOP_LENGTH is 1): + * no stop time or microstep follows it. + */ +#define MSG_TYPE_STOP 30 +#define MSG_TYPE_STOP_LENGTH 1 + ///////////////////////////////////////////// //// Rejection codes From f244f8581fd0611ea467e0ce010348ce3676acc6 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Fri, 16 Feb 2024 10:00:21 +0100 Subject: [PATCH 018/148] Federate processes a stop request by the RTI --- core/federated/federate.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/core/federated/federate.c b/core/federated/federate.c index 5686c654b..dca79a3f7 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1392,6 +1392,20 @@ static void handle_stop_granted_message() { } } +/** + * Handle a MSG_TYPE_STOP message from the RTI. + * + * This function simply calls lf_stop(). + */ +void handle_stop() { + // Trace the event when tracing is enabled + tracepoint_federate_from_rti(_fed.trace, receive_STOP, _lf_my_fed_id, NULL); + + lf_print("Received from RTI a MSG_TYPE_STOP at physical time " PRINTF_TIME ".", lf_time_physical()); + + lf_stop(); +} + /** * Handle a MSG_TYPE_STOP_REQUEST message from the RTI. */ @@ -1578,6 +1592,9 @@ static void* listen_to_rti_TCP(void* args) { case MSG_TYPE_STOP_GRANTED: handle_stop_granted_message(); break; + case MSG_TYPE_STOP: + handle_stop(); + break; case MSG_TYPE_PORT_ABSENT: if (handle_port_absent_message(&_fed.socket_TCP_RTI, -1)) { // Failures to complete the read of absent messages from the RTI are fatal.
From eb712c071d15f3732a4eaa6b135e2e2165a72871 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Fri, 16 Feb 2024 10:34:34 +0100 Subject: [PATCH 019/148] Stop the execution of a federate --- core/federated/federate.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/core/federated/federate.c b/core/federated/federate.c index dca79a3f7..2b413443c 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1392,6 +1392,28 @@ static void handle_stop_granted_message() { } } +/** + * @brief Stop the execution of a federate. + * Every enclave within the federate will stop at one microstep later than its + * current tag. Unlike lf_request_stop(), this process does not require any + * involvement from the RTI, nor does it necessitate any consensus. + * + * This function is particularly useful for testing transient federates. + */ +void lf_stop() { + environment_t *env; + int num_env = _lf_get_environments(&env); + + for (int i = 0 ; i < num_env ; i++) { + tag_t new_stop_tag; + new_stop_tag.time = env[i].current_tag.time; + new_stop_tag.microstep = env[i].current_tag.microstep + 1; + _lf_set_stop_tag(&env[i], new_stop_tag); + } + + LF_PRINT_LOG("Federate is stopping."); +} + /** * Handle a MSG_TYPE_STOP message from the RTI. * From 96cb09ee3aae76f90a9569d05b08e60362edc32e Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Fri, 4 Aug 2023 13:56:59 -0700 Subject: [PATCH 020/148] Add lf_get_federates_bin_directory. 
--- core/CMakeLists.txt | 1 + core/federated/federate.c | 28 +++++++++++++++------------- include/core/federated/federate.h | 6 ++++++ 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 6d938ae0c..04443d5ab 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -181,6 +181,7 @@ define(SCHEDULER) define(LF_SOURCE_DIRECTORY) define(LF_SOURCE_GEN_DIRECTORY) define(LF_PACKAGE_DIRECTORY) +define(LF_FEDERATES_BIN_DIRECTORY) define(LF_FILE_SEPARATOR) define(WORKERS_NEEDED_FOR_FEDERATE) define(LF_ENCLAVES) diff --git a/core/federated/federate.c b/core/federated/federate.c index 2b413443c..719da99b2 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1395,23 +1395,23 @@ static void handle_stop_granted_message() { /** * @brief Stop the execution of a federate. * Every enclave within the federate will stop at one microstep later than its - * current tag. Unlike lf_request_stop(), this process does not require any + * current tag. Unlike lf_request_stop(), this process does not require any * involvement from the RTI, nor does it necessitate any consensus. - * + * * This function is particularly useful for testing transient federates. 
*/ void lf_stop() { - environment_t *env; - int num_env = _lf_get_environments(&env); - - for (int i = 0 ; i < num_env ; i++) { - tag_t new_stop_tag; - new_stop_tag.time = env[i].current_tag.time; - new_stop_tag.microstep = env[i].current_tag.microstep + 1; - _lf_set_stop_tag(&env[i], new_stop_tag); - } + environment_t* env; + int num_env = _lf_get_environments(&env); - LF_PRINT_LOG("Federate is stopping."); + for (int i = 0; i < num_env; i++) { + tag_t new_stop_tag; + new_stop_tag.time = env[i].current_tag.time; + new_stop_tag.microstep = env[i].current_tag.microstep + 1; + _lf_set_stop_tag(&env[i], new_stop_tag); + } + + LF_PRINT_LOG("Federate is stopping."); } /** @@ -2667,6 +2667,8 @@ bool lf_update_max_level(tag_t tag, bool is_provisional) { return (prev_max_level_allowed_to_advance != max_level_allowed_to_advance); } +char* lf_get_federates_bin_directory() { return LF_FEDERATES_BIN_DIRECTORY; } + #ifdef FEDERATED_DECENTRALIZED instant_t lf_wait_until_time(tag_t tag) { instant_t result = tag.time; // Default. @@ -2692,4 +2694,4 @@ instant_t lf_wait_until_time(tag_t tag) { } #endif // FEDERATED_DECENTRALIZED -#endif // FEDERATED +#endif diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index 604a36637..b6af1f023 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -545,4 +545,10 @@ bool lf_update_max_level(tag_t tag, bool is_provisional); instant_t lf_wait_until_time(tag_t tag); #endif // FEDERATED_DECENTRALIZED +/** + * @brief Return the directory containing the executables of the individual + * federates. + */ +char* lf_get_federates_bin_directory(); + #endif // FEDERATE_H From aaed9091829e94926f8ced4f09a524e7c3ebc7fe Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Fri, 16 Feb 2024 12:44:26 +0100 Subject: [PATCH 021/148] A federate can get its id, its effective_start_time, and the start_time of the federation. This is particularly useful for testing. 
--- core/federated/federate.c | 12 ++++++++++++ include/core/federated/federate.h | 17 +++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/core/federated/federate.c b/core/federated/federate.c index 719da99b2..bbc05fa47 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2694,4 +2694,16 @@ instant_t lf_wait_until_time(tag_t tag) { } #endif // FEDERATED_DECENTRALIZED +char* lf_get_federation_id() { + return federation_metadata.federation_id; +} + +instant_t lf_get_effective_start_time() { + return effective_start_tag.time; +} + +instant_t lf_get_start_time() { + return start_time; +} + #endif diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index b6af1f023..aeb8a608f 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -551,4 +551,21 @@ instant_t lf_wait_until_time(tag_t tag); */ char* lf_get_federates_bin_directory(); +/** + * @brief Returns the federation id. + * + * This function is useful for creating federates on runtime. + */ +char* lf_get_federation_id(); + +/** + * @brief Returns the effective start time of the federate. The start_time of persistent + * federates is equal to their effective_start_time. Transient federates, however, + * have their effective_start_time higher or equal to their start_time. + */ +instant_t lf_get_effective_start_time(); + +/** @brief Returns the start time of the federate. 
*/ +instant_t lf_get_start_time(); + #endif // FEDERATE_H From d8f39f4c9aea21f06b9d2060efbca2be82fcd6f8 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Sun, 18 Feb 2024 00:37:49 +0100 Subject: [PATCH 022/148] Add MST_TYPE_TIMESTAMP_START and its length --- include/core/federated/network/net_common.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index 3dbf0a92f..fe6fa88ed 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -617,6 +617,16 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define MSG_TYPE_FAILED 25 +/** + * As an answer to MSG_TYPE_TIMESTAMP, the RTI broadcasts to all persistent + * federates, or sends to newly joining transient federate, a message of + * MSG_TYPE_STIMESTAMP_START. It includes the starting time of the federation, + * together with the effective starting logical tag. The latter is useful for + * transient federates. + */ +#define MSG_TYPE_TIMESTAMP_START 50 +#define MSG_TYPE_TIMESTAMP_START_LENGTH (1 + sizeof(instant_t) + sizeof(instant_t) + sizeof(microstep_t)) + /** * Byte sent by the RTI ordering the federate to stop. 
Upon receiving the meaasage, * the federate will call lf_stop(), which will make him resign at its current_tag From a4afdc4037b22946782fc57c3c705d4bf99a8549 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 19 Feb 2024 10:33:20 +0100 Subject: [PATCH 023/148] Fix the use of lf_stop() --- core/federated/federate.c | 13 +++---------- core/threaded/reactor_threaded.c | 5 ++++- include/core/federated/federate.h | 14 ++++++++++++-- include/core/federated/network/net_common.h | 6 +++--- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index bbc05fa47..a54e259ea 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -109,7 +109,6 @@ federate_instance_t _fed = {.socket_TCP_RTI = -1, .last_sent_NET = (tag_t){.time = NEVER, .microstep = 0u}, .min_delay_from_physical_action_to_federate_output = NEVER, .is_transient = false}; ->>>>>>> 6fbf4094 (Start on adding federate type (transient or not)) federation_metadata_t federation_metadata = { .federation_id = "Unidentified Federation", .rti_host = NULL, .rti_port = -1, .rti_user = NULL}; @@ -2694,16 +2693,10 @@ instant_t lf_wait_until_time(tag_t tag) { } #endif // FEDERATED_DECENTRALIZED -char* lf_get_federation_id() { - return federation_metadata.federation_id; -} +char* lf_get_federation_id() { return federation_metadata.federation_id; } -instant_t lf_get_effective_start_time() { - return effective_start_tag.time; -} +instant_t lf_get_effective_start_time() { return effective_start_tag.time; } -instant_t lf_get_start_time() { - return start_time; -} +instant_t lf_get_start_time() { return start_time; } #endif diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index 3a974249a..fdbd588ed 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -587,7 +587,10 @@ void _lf_initialize_start_tag(environment_t* env) { // If we have a non-zero STA offset, then we need to allow messages 
to arrive // at the start time. To avoid spurious STP violations, we temporarily // set the current time back by the STA offset. - env->current_tag.time = lf_time_subtract(env->current_tag.time, lf_fed_STA_offset); + env->current_tag = + (tag_t){.time = effective_start_tag.time - lf_fed_STA_offset, .microstep = effective_start_tag.microstep}; + + LF_PRINT_LOG("Waiting for start time " PRINTF_TIME " plus STA " PRINTF_TIME ".", start_time, lf_fed_STA_offset); #else // For other than federated decentralized execution, there is no lf_fed_STA_offset variable defined. // To use uniform code below, we define it here as a local variable. diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index aeb8a608f..5c5d9683e 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -545,6 +545,16 @@ bool lf_update_max_level(tag_t tag, bool is_provisional); instant_t lf_wait_until_time(tag_t tag); #endif // FEDERATED_DECENTRALIZED +/** + * @brief Stop the execution of a federate. + * Every enclave within the federate will stop at one microstep later than its + * current tag. Unlike lf_request_stop(), this process does not require any + * involvement from the RTI, nor does it necessitate any consensus. + * + * This function is particularly useful for testing transient federates. + */ +void lf_stop(); + /** * @brief Return the directory containing the executables of the individual * federates. @@ -553,14 +563,14 @@ char* lf_get_federates_bin_directory(); /** * @brief Returns the federation id. - * + * * This function is useful for creating federates on runtime. */ char* lf_get_federation_id(); /** * @brief Returns the effective start time of the federate. The start_time of persistent - * federates is equal to their effective_start_time. Transient federates, however, + * federates is equal to their effective_start_time. Transient federates, however, * have their effective_start_time higher or equal to their start_time. 
*/ instant_t lf_get_effective_start_time(); diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index fe6fa88ed..712184c57 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -618,9 +618,9 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define MSG_TYPE_FAILED 25 /** - * As an answer to MSG_TYPE_TIMESTAMP, the RTI broadcasts to all persistent - * federates, or sends to newly joining transient federate, a message of - * MSG_TYPE_STIMESTAMP_START. It includes the starting time of the federation, + * As an answer to MSG_TYPE_TIMESTAMP, the RTI broadcasts to all persistent + * federates, or sends to newly joining transient federate, a message of + * MSG_TYPE_STIMESTAMP_START. It includes the starting time of the federation, * together with the effective starting logical tag. The latter is useful for * transient federates. */ From 1a342214c5171f71934d481e70002e76a170a54c Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 19 Feb 2024 16:15:12 +0100 Subject: [PATCH 024/148] Fix send_start_tag() to account for the effective start tag --- core/federated/RTI/rti_remote.c | 16 ++++++++-------- core/federated/federate.c | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 274e5c259..f772dfc2e 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -932,15 +932,15 @@ void send_start_tag(federate_info_t* my_fed, instant_t federation_start_time, ta // message. 
// In the startup phase, federates will receive identical start_time and // effective_start_tag - unsigned char start_time_buffer[MSG_TYPE_TIMESTAMP_LENGTH]; - start_time_buffer[0] = MSG_TYPE_TIMESTAMP; + unsigned char start_time_buffer[MSG_TYPE_TIMESTAMP_START_LENGTH]; + start_time_buffer[0] = MSG_TYPE_TIMESTAMP_START; encode_int64(swap_bytes_if_big_endian_int64(start_time), &start_time_buffer[1]); + encode_tag(&(start_time_buffer[1 + sizeof(instant_t)]), federate_start_tag); if (rti_remote->base.tracing_enabled) { - tag_t tag = {.time = start_time, .microstep = 0}; - tracepoint_rti_to_federate(rti_remote->base.trace, send_TIMESTAMP, my_fed->enclave.id, &tag); + tracepoint_rti_to_federate(rti_remote->base.trace, send_TIMESTAMP, my_fed->enclave.id, &federate_start_tag); } - if (write_to_socket(my_fed->socket, MSG_TYPE_TIMESTAMP_LENGTH, start_time_buffer)) { + if (write_to_socket(my_fed->socket, MSG_TYPE_TIMESTAMP_START_LENGTH, start_time_buffer)) { lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); } @@ -1048,11 +1048,11 @@ void handle_timestamp(federate_info_t* my_fed) { } else if (rti_remote->phase == shutdown_phase) { // Do not answer the federate if the federation is in hsutdown phase // Or maybe send and error message? - LF_MUTEX_LOCK(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); return; } else { // The federation is the execution phase - // A transient has joined after the startup phase - // At this point, we already hold the mutex + // A transient has joined after the startup phase + // At this point, we already hold the mutex // This is rather a possible extreme corner case, where a transient sends its timestamp, and only // enters the if section after all persistents have joined. 
diff --git a/core/federated/federate.c b/core/federated/federate.c index a54e259ea..92d4822f5 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -968,7 +968,7 @@ static instant_t get_start_time_from_rti(instant_t my_physical_time) { // Send the timestamp marker first. send_time(MSG_TYPE_TIMESTAMP, my_physical_time); - // Read bytes from the socket. We need 17 (1 + 8 + 8) bytes. + // Read bytes from the socket. We need 21 (1 + 8 + 8 + 4) bytes. // Buffer for message ID plus timestamp. size_t buffer_length = MSG_TYPE_TIMESTAMP_START_LENGTH; unsigned char buffer[buffer_length]; From 628123408e306f22b9f724b55cc42f9991b0f010 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Tue, 20 Feb 2024 10:41:31 +0100 Subject: [PATCH 025/148] Fix for transient connection --- core/federated/RTI/rti_remote.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index f772dfc2e..652d73728 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -2041,9 +2041,8 @@ void* lf_connect_to_transient_federates_thread(void* nothing) { shutdown(socket_id, SHUT_RDWR); close(socket_id); socket_id = -1; - socket_descriptor - // Ignore the federate that failed authentication. - i--; + // Ignore the federate that failed authentication. 
+ i--; continue; } } From 533fcecaac1df4853f217467ced306638abc7813 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Tue, 20 Feb 2024 11:04:28 +0100 Subject: [PATCH 026/148] Fix the initialization of the effective start tag --- core/threaded/reactor_threaded.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index fdbd588ed..88f7fc7b4 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -570,7 +570,7 @@ void _lf_initialize_start_tag(environment_t* env) { // statuses to unknown lf_reset_status_fields_on_input_port_triggers(); - // Get a start_time from the RTI + // Get a start_time and effective_start_tag from the RTI lf_synchronize_with_other_federates(); // Resets start_time in federated execution according to the RTI. } @@ -614,7 +614,7 @@ void _lf_initialize_start_tag(environment_t* env) { lf_time_physical() - start_time); // Restore the current tag to match the start time. - env->current_tag = (tag_t){.time = start_time, .microstep = 0u}; + env->current_tag = (tag_t){.time = effective_start_tag.time, .microstep = effective_start_tag.microstep}; // If the stop_tag is (0,0), also insert the shutdown // reactions. This can only happen if the timeout time From 485bb94720c8f1825bab9e87970dc27439ba42ea Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Tue, 20 Feb 2024 11:11:55 +0100 Subject: [PATCH 027/148] Adjust doc --- core/threaded/reactor_threaded.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index 88f7fc7b4..4229428ef 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -613,7 +613,7 @@ void _lf_initialize_start_tag(environment_t* env) { LF_PRINT_DEBUG("Physical time is ahead of current time by " PRINTF_TIME ". 
This should be close to the STA offset.", lf_time_physical() - start_time); - // Restore the current tag to match the start time. + // Restore the current tag to match the effective start time. env->current_tag = (tag_t){.time = effective_start_tag.time, .microstep = effective_start_tag.microstep}; // If the stop_tag is (0,0), also insert the shutdown From fa99eeef3959f9176acff016ba0f37ea13d038d1 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Tue, 20 Feb 2024 11:13:24 +0100 Subject: [PATCH 028/148] Adjust 2 debug messages in RTI --- core/threaded/reactor_threaded.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index 4229428ef..ede57d729 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -609,9 +609,10 @@ void _lf_initialize_start_tag(environment_t* env) { // from other federates) to hold the lock and possibly raise a tag barrier. while (!wait_until(effective_start_tag.time + _lf_fed_STA_offset, &env->event_q_changed)) { }; - LF_PRINT_DEBUG("Done waiting for start time + STA offset " PRINTF_TIME ".", start_time + _lf_fed_STA_offset); + LF_PRINT_DEBUG("Done waiting for effective start time + STA offset " PRINTF_TIME ".", + effective_start_tag.time + _lf_fed_STA_offset); LF_PRINT_DEBUG("Physical time is ahead of current time by " PRINTF_TIME ". This should be close to the STA offset.", - lf_time_physical() - start_time); + lf_time_physical() - effective_start_tag.time); // Restore the current tag to match the effective start time. 
env->current_tag = (tag_t){.time = effective_start_tag.time, .microstep = effective_start_tag.microstep}; From 1e71976bea7e8d5561a532314ed457a51bdc4993 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Tue, 20 Feb 2024 13:42:10 +0100 Subject: [PATCH 029/148] Attemp to fix the hot swap mechanism --- core/federated/RTI/rti_remote.c | 7 +++--- core/federated/federate.c | 39 ++++++++++++++++----------------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 652d73728..fa70b6d85 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1399,7 +1399,7 @@ void* federate_info_thread_TCP(void* fed) { break; case MSG_TYPE_RESIGN: handle_federate_resign(my_fed); - return NULL; + break; case MSG_TYPE_NEXT_EVENT_TAG: handle_next_event_tag(my_fed); break; @@ -1470,6 +1470,7 @@ void send_reject(int* socket_id, unsigned char error_code) { *socket_id = -1; LF_MUTEX_UNLOCK(&rti_mutex); } +lf_print("handle_timestamp for transient 1157"); /** * Listen for a MSG_TYPE_FED_IDS message, which includes as a payload @@ -2054,7 +2055,7 @@ void* lf_connect_to_transient_federates_thread(void* nothing) { if (fed_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(&rti_mutex); if (hot_swap_in_progress) { lf_print("RTI: Hot swap confirmed for federate %d.", fed_id); @@ -2064,7 +2065,7 @@ void* lf_connect_to_transient_federates_thread(void* nothing) { LF_PRINT_LOG("RTI: Send MSG_TYPE_STOP to old federate %d.", fed_id); send_stop(fed_old); - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); // Wait for the old federate to send MSG_TYPE_RESIGN LF_PRINT_LOG("RTI: Waiting for old federate %d to send resign.", fed_id); diff --git a/core/federated/federate.c b/core/federated/federate.c index 92d4822f5..4327d18e2 100644 --- 
a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2666,32 +2666,31 @@ bool lf_update_max_level(tag_t tag, bool is_provisional) { return (prev_max_level_allowed_to_advance != max_level_allowed_to_advance); } -char* lf_get_federates_bin_directory() { return LF_FEDERATES_BIN_DIRECTORY; } +void lf_stop() { + environment_t* env; + int num_env = _lf_get_environments(&env); -#ifdef FEDERATED_DECENTRALIZED -instant_t lf_wait_until_time(tag_t tag) { - instant_t result = tag.time; // Default. + for (int i = 0; i < num_env; i++) { + LF_MUTEX_LOCK(&env[i].mutex); - // Do not add the STA if the tag is the starting tag. - if (tag.time != start_time || tag.microstep != 0u) { + tag_t new_stop_tag; + new_stop_tag.time = env[i].current_tag.time; + new_stop_tag.microstep = env[i].current_tag.microstep + 1; - // Apply the STA to the logical time, but only if at least one network input port is not known up to this tag. - // Subtract one microstep because it is sufficient to commit to a tag if the input ports are known - // up to one microstep earlier. 
- if (tag.microstep > 0) { - tag.microstep--; - } else { - tag.microstep = UINT_MAX; - tag.time -= 1; - } + _lf_set_stop_tag(&env[i], new_stop_tag); - if (!inputs_known_to(tag)) { - result = lf_time_add(result, lf_fed_STA_offset); - } + lf_print("Setting the stop tag of env %d to " PRINTF_TAG ".", i, env[i].stop_tag.time - start_time, + env[i].stop_tag.microstep); + + if (env[i].barrier.requestors) + _lf_decrement_tag_barrier_locked(&env[i]); + lf_cond_broadcast(&env[i].event_q_changed); + LF_MUTEX_UNLOCK(&env[i].mutex); } - return result; + LF_PRINT_LOG("Federate is stopping."); } -#endif // FEDERATED_DECENTRALIZED + +char* lf_get_federates_bin_directory() { return LF_FEDERATES_BIN_DIRECTORY; } char* lf_get_federation_id() { return federation_metadata.federation_id; } From ef9d528039267f88284520914ef7e611d12e36ad Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Tue, 20 Feb 2024 14:07:57 +0100 Subject: [PATCH 030/148] Remove overlooked instruction --- core/federated/RTI/rti_remote.c | 1 - 1 file changed, 1 deletion(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index fa70b6d85..7e48b0f45 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1470,7 +1470,6 @@ void send_reject(int* socket_id, unsigned char error_code) { *socket_id = -1; LF_MUTEX_UNLOCK(&rti_mutex); } -lf_print("handle_timestamp for transient 1157"); /** * Listen for a MSG_TYPE_FED_IDS message, which includes as a payload From 07b2831fe068612137d8c578443f0af9692758dd Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Tue, 20 Feb 2024 14:20:07 +0100 Subject: [PATCH 031/148] Augment fedsd to acount for transient federates --- util/tracing/visualization/fedsd.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/util/tracing/visualization/fedsd.py b/util/tracing/visualization/fedsd.py index d66000e4b..0d7f57246 100644 --- a/util/tracing/visualization/fedsd.py +++ 
b/util/tracing/visualization/fedsd.py @@ -28,6 +28,7 @@ .TAG { stroke: #08a578; fill: #08a578} \ .TIMESTAMP { stroke: grey; fill: grey } \ .FED_ID {stroke: #80DD99; fill: #80DD99 } \ + .STOP {stroke: #d0b7eb; fill: #d0b7eb} \ .ADV {stroke-linecap="round" ; stroke: "red" ; fill: "red"} \ text { \ font-size: smaller; \ @@ -83,7 +84,9 @@ "Receiving ADR_AD": "ADR_AD", "Receiving ADR_QR": "ADR_QR", "Receiving UNIDENTIFIED": "UNIDENTIFIED", - "Scheduler advancing time ends": "AdvLT" + "Scheduler advancing time ends": "AdvLT", + "Sending STOP": "STOP", + "Receiving STOP": "STOP" } prune_event_name.setdefault(" ", "UNIDENTIFIED") @@ -110,7 +113,7 @@ # Events matching at the sender and receiver ends depend on whether they are tagged # (the elapsed logical time and microstep have to be the same) or not. # Set of tagged events (messages) -non_tagged_messages = {'FED_ID', 'ACK', 'RESIGN', 'FAILED', 'REJECT', 'ADR_QR', 'ADR_AD', 'MSG', 'P2P_MSG'} +non_tagged_messages = {'FED_ID', 'ACK', 'RESIGN', 'FAILED', 'REJECT', 'ADR_QR', 'ADR_AD', 'MSG', 'P2P_MSG', 'STOP'} ################################################################################ @@ -209,7 +212,6 @@ def svg_string_draw_label(x1, y1, x2, y2, label) : else: rotation = 0 str_line = '\t'+label+'\n' - #print('rot = '+str(rotation)+' x1='+str(x1)+' y1='+str(y1)+' x2='+str(x2)+' y2='+str(y2)) return str_line @@ -504,11 +506,17 @@ def get_and_convert_lft_files(rti_lft_file, federates_lft_files, start_time, end if (not fed_df.empty): # Get the federate id number fed_id = fed_df.iloc[-1]['self_id'] - # Add to the list of sequence diagram actors and add the name - actors.append(fed_id) - actors_names[fed_id] = Path(fed_trace).stem - # Derive the x coordinate of the actor - x_coor[fed_id] = (padding * 2) + (spacing * (len(actors) - 1)) + + ### Check that the federate id have not been entrered yet. 
+ ### This is particlurly useful for transient actors, when + ### they leave and join several times + if (actors.count(fed_id) == 0): + # Add to the list of sequence diagram actors and add the name + actors.append(fed_id) + actors_names[fed_id] = Path(fed_trace).stem + # Derive the x coordinate of the actor + x_coor[fed_id] = (padding * 2) + (spacing * (len(actors)-1)) + fed_df['x1'] = x_coor[fed_id] trace_df = pd.concat([trace_df, fed_df]) fed_df = fed_df[0:0] @@ -672,7 +680,7 @@ def get_and_convert_lft_files(rti_lft_file, federates_lft_files, start_time, end # FIXME: Using microseconds is hardwired here. physical_time = f'{int(row["physical_time"]/1000):,}' - if (row['event'] in {'FED_ID', 'ACK', 'FAILED', 'REJECT', 'ADR_QR', 'ADR_AD', 'MSG', 'P2P_MSG'}): + if (row['event'] in {'FED_ID', 'ACK', 'FAILED', 'REJECT', 'ADR_QR', 'ADR_AD', 'MSG', 'P2P_MSG', 'STOP'}): label = row['event'] else: label = row['event'] + '(' + f'{int(row["logical_time"]):,}' + ', ' + str(row['microstep']) + ')' From 39b02917c65ada5f31e84ba63110c1c80e4b599d Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Tue, 20 Feb 2024 20:44:32 +0100 Subject: [PATCH 032/148] Minor fixes --- core/federated/RTI/rti_remote.c | 1 - core/federated/federate.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 7e48b0f45..bd86cc3de 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -2042,7 +2042,6 @@ void* lf_connect_to_transient_federates_thread(void* nothing) { close(socket_id); socket_id = -1; // Ignore the federate that failed authentication. - i--; continue; } } diff --git a/core/federated/federate.c b/core/federated/federate.c index 4327d18e2..5e97cbfc2 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -996,7 +996,7 @@ static instant_t get_start_time_from_rti(instant_t my_physical_time) { // This is rather a choice. To be changed, if needed, of course. 
tracepoint_federate_from_rti(receive_TIMESTAMP, _lf_my_fed_id, &effective_start_tag); lf_print("Starting timestamp is: " PRINTF_TIME " and effectve start tag is: " PRINTF_TAG ".", timestamp, - effective_start_tag); + effective_start_tag.time - start_time, effective_start_tag.microstep); LF_PRINT_LOG("Current physical time is: " PRINTF_TIME ".", lf_time_physical()); return timestamp; From e020745b24cd253cb8277aeb2f108f9c0365230c Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Tue, 20 Feb 2024 22:17:25 +0100 Subject: [PATCH 033/148] Fix lingua-france-ref.txt --- lingua-franca-ref.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingua-franca-ref.txt b/lingua-franca-ref.txt index 8b25206ff..52199a147 100644 --- a/lingua-franca-ref.txt +++ b/lingua-franca-ref.txt @@ -1 +1 @@ -master \ No newline at end of file +transient-fed From 0ac8787364cc879d3e07a73af30c2146fcf79810 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Tue, 20 Feb 2024 22:54:58 +0100 Subject: [PATCH 034/148] Fix void* returns --- core/federated/RTI/rti_remote.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index bd86cc3de..e3bf9add1 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -116,7 +116,8 @@ void* pending_grant_thread(void* federate) { notify_tag_advance_grant_immediate(&(fed->enclave), fed->pending_grant); fed->pending_grant = NEVER_TAG; } - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); + return NULL; } /** From c05b2accb2b18bbe62e495c4e8def64fdfcce1c7 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad <37504116+ChadliaJerad@users.noreply.github.com> Date: Fri, 23 Feb 2024 10:28:45 +0100 Subject: [PATCH 035/148] Fix based on code review Co-authored-by: Edward A. 
Lee --- core/federated/RTI/rti_remote.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index e3bf9add1..cf3ea776d 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -44,7 +44,7 @@ federate_info_t* hot_swap_federate; // Indicates if a hot swap process is in progress bool hot_swap_in_progress = false; -// Indicates thatthe old federate has stopped +// Indicates that the old federate has stopped. bool hot_swap_old_resigned = false; bool _lf_federate_reports_error = false; @@ -276,7 +276,7 @@ void* pending_provisional_grant_thread(void* federate) { * @param e The scheduling node. * @param tag The provisional tag to grant. */ -void notify_provisional_tag_advance_grant_delayed(scheduling_node_t* e, tag_t tag) { +static void notify_provisional_tag_advance_grant_delayed(scheduling_node_t* e, tag_t tag) { federate_info_t* fed = (federate_info_t*)e; // Proceed with the delayed provisional tag grant notification only if @@ -915,7 +915,8 @@ void handle_address_ad(uint16_t federate_id) { } /** - * Send to the start time to the federate my_fed. + * @brief Send to the start time to the federate my_fed. + * * This function assumes the caller does not hold the mutex. * * If it is the startup phase, the start_time will be the maximum received timestamps @@ -928,7 +929,7 @@ void handle_address_ad(uint16_t federate_id) { * @param federation_start_time the federation start_time * @param federate_start_tag the federate effective start tag */ -void send_start_tag(federate_info_t* my_fed, instant_t federation_start_time, tag_t federate_start_tag) { +static void send_start_tag(federate_info_t* my_fed, instant_t federation_start_time, tag_t federate_start_tag) { // Send back to the federate the maximum time plus an offset on a TIMESTAMP_START // message. 
// In the startup phase, federates will receive identical start_time and From 99207ec04042a6fc6dea91baa36e2ce207f1ecd4 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad <37504116+ChadliaJerad@users.noreply.github.com> Date: Fri, 23 Feb 2024 10:34:13 +0100 Subject: [PATCH 036/148] Make immediate and delayed (p)tag notification functions static, as well as the waiting threads Co-authored-by: Edward A. Lee --- core/federated/RTI/rti_remote.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index cf3ea776d..b793f0cb2 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -70,7 +70,7 @@ extern int lf_critical_section_exit(environment_t* env) { return lf_mutex_unlock * @param e The enclave. * @param tag The tag to grant. */ -void notify_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) { +static void notify_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) { size_t message_length = 1 + sizeof(int64_t) + sizeof(uint32_t); unsigned char buffer[message_length]; buffer[0] = MSG_TYPE_TAG_ADVANCE_GRANT; @@ -95,12 +95,12 @@ void notify_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) { /** * @brief Thread that sleeps for a period of time, and then wakes up to check if - * a tag advance grant needs to be sent. That is, if the pending tag have not + * a tag advance grant needs to be sent. That is, if the pending tag has not * been reset to NEVER_TAG, the tag advance grant will be immediate. * - * @param federate the fedarate whose tag advance grant needs to be delayed. + * @param federate the federate whose tag advance grant needs to be delayed. 
*/ -void* pending_grant_thread(void* federate) { +static void* pending_grant_thread(void* federate) { federate_info_t* fed = (federate_info_t*)federate; interval_t sleep_interval = fed->pending_grant.time - lf_time_physical(); @@ -110,13 +110,13 @@ void* pending_grant_thread(void* federate) { lf_mutex_lock(&rti_mutex); - // If the pending grant becomes NEVER_TAG, then this means that it should - // not be sent + // If the pending grant has become NEVER_TAG, then this means that it should + // not be sent. if (lf_tag_compare(fed->pending_grant, NEVER_TAG) != 0) { notify_tag_advance_grant_immediate(&(fed->enclave), fed->pending_grant); fed->pending_grant = NEVER_TAG; } - LF_MUTEX_UNLOCK(&rti_mutex); + lf_mutex_unlock(&rti_mutex); return NULL; } @@ -130,7 +130,7 @@ void* pending_grant_thread(void* federate) { * @param e The enclave. * @param tag The tag to grant. */ -void notify_tag_advance_grant_delayed(scheduling_node_t* e, tag_t tag) { +static void notify_tag_advance_grant_delayed(scheduling_node_t* e, tag_t tag) { federate_info_t* fed = GET_FED_INFO(e->id); // Check wether there is already a pending grant @@ -188,7 +188,7 @@ void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { * @param e The scheduling node. * @param tag The tag to grant. */ -void notify_provisional_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) { +static void notify_provisional_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) { size_t message_length = 1 + sizeof(int64_t) + sizeof(uint32_t); unsigned char buffer[message_length]; buffer[0] = MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT; @@ -247,7 +247,7 @@ void notify_provisional_tag_advance_grant_immediate(scheduling_node_t* e, tag_t * * @param federate the federate whose provisional tag advance grant needs to be delayed. 
*/ -void* pending_provisional_grant_thread(void* federate) { +static void* pending_provisional_grant_thread(void* federate) { federate_info_t* fed = (federate_info_t*)federate; interval_t sleep_interval = fed->pending_provisional_grant.time - lf_time_physical(); From 9d7e5b92848739c8387aca58b3f1cb0174e080b7 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad <37504116+ChadliaJerad@users.noreply.github.com> Date: Fri, 23 Feb 2024 10:17:14 +0100 Subject: [PATCH 037/148] Control the type and value of the number of transients Co-authored-by: Edward A. Lee --- core/federated/RTI/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/federated/RTI/main.c b/core/federated/RTI/main.c index 005b784ae..6c0f9361d 100644 --- a/core/federated/RTI/main.c +++ b/core/federated/RTI/main.c @@ -122,7 +122,7 @@ void usage(int argc, const char* argv[]) { lf_print(" -n, --number_of_federates "); lf_print(" The number of federates in the federation that this RTI will control.\n"); lf_print(" -nt, --number_of_transient_federates "); - lf_print(" The number of transient federates in the federation that this RTI will control.\n"); + lf_print(" The number of federates that are transient; this must be strictly less than the number of federates.\n"); lf_print(" -p, --port "); lf_print(" The port number to use for the RTI. Must be larger than 0 and smaller than %d. 
Default is %d.\n", UINT16_MAX, DEFAULT_PORT); From f49287b5c4358f3aaf6d4366ed0f732ff478263a Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 29 Feb 2024 14:33:20 +0100 Subject: [PATCH 038/148] The number of transients should not be greater or equal to the number of federates --- core/federated/RTI/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/federated/RTI/main.c b/core/federated/RTI/main.c index 6c0f9361d..5f27092d1 100644 --- a/core/federated/RTI/main.c +++ b/core/federated/RTI/main.c @@ -294,8 +294,8 @@ int process_args(int argc, const char* argv[]) { usage(argc, argv); return 0; } - if (rti.number_of_transient_federates > rti.base.number_of_scheduling_nodes) { - lf_print_error("--number_of_transient_federates cannot be higher than the number of federates."); + if (rti.number_of_transient_federates >= rti.base.number_of_scheduling_nodes) { + lf_print_error("--number_of_transient_federates cannot be higher or equal to the number of federates."); usage(argc, argv); return 0; } From 58a4a0d38eb0f0052110e224e30316c1c6f2607c Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 29 Feb 2024 15:49:31 +0100 Subject: [PATCH 039/148] Remove wrongly duplicated code --- core/federated/RTI/rti_remote.c | 42 --------------------------------- 1 file changed, 42 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index b793f0cb2..d2c4902c2 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -914,48 +914,6 @@ void handle_address_ad(uint16_t federate_id) { } } -/** - * @brief Send to the start time to the federate my_fed. - * - * This function assumes the caller does not hold the mutex. - * - * If it is the startup phase, the start_time will be the maximum received timestamps - * plus an offset. The federate will then receive identical federation_start_time - * and federate_start_tag.time (the federate_start_tag.microstep will be 0). 
- * If, however, the startup phase is passed, the federate will receive different - * values than sateted above. - * - * @param my_fed the federate to send the start time to. - * @param federation_start_time the federation start_time - * @param federate_start_tag the federate effective start tag - */ -static void send_start_tag(federate_info_t* my_fed, instant_t federation_start_time, tag_t federate_start_tag) { - // Send back to the federate the maximum time plus an offset on a TIMESTAMP_START - // message. - // In the startup phase, federates will receive identical start_time and - // effective_start_tag - unsigned char start_time_buffer[MSG_TYPE_TIMESTAMP_START_LENGTH]; - start_time_buffer[0] = MSG_TYPE_TIMESTAMP_START; - encode_int64(swap_bytes_if_big_endian_int64(start_time), &start_time_buffer[1]); - encode_tag(&(start_time_buffer[1 + sizeof(instant_t)]), federate_start_tag); - - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_TIMESTAMP, my_fed->enclave.id, &federate_start_tag); - } - if (write_to_socket(my_fed->socket, MSG_TYPE_TIMESTAMP_START_LENGTH, start_time_buffer)) { - lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); - } - - LF_MUTEX_LOCK(&rti_mutex); - // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP - // message has been sent. That MSG_TYPE_TIMESTAMP message grants time advance to - // the federate to the start time. - my_fed->enclave.state = GRANTED; - lf_cond_broadcast(&sent_start_time); - LF_PRINT_LOG("RTI sent start time " PRINTF_TIME " to federate %d.", start_time, my_fed->enclave.id); - LF_MUTEX_UNLOCK(&rti_mutex); -} - /** * Send to the start time to the federate my_fed. * This function assumes the caller does not hold the mutex. 
From 9f5090a8f67f88d30855d820f126dea2b823a3df Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 29 Feb 2024 15:57:49 +0100 Subject: [PATCH 040/148] Code review: remove redundancy and add has_transient_upstream_federats --- core/federated/RTI/rti_remote.c | 168 ++++++++++++-------------------- core/federated/RTI/rti_remote.h | 2 + 2 files changed, 63 insertions(+), 107 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index d2c4902c2..d1ff93a76 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -127,11 +127,11 @@ static void* pending_grant_thread(void* federate) { * If a provisionl tag advance grant is pending, cancel it. If there is another * pending tag advance grant, do not proceed with the thread creation. * - * @param e The enclave. + * @param fed The federate. * @param tag The tag to grant. */ static void notify_tag_advance_grant_delayed(scheduling_node_t* e, tag_t tag) { - federate_info_t* fed = GET_FED_INFO(e->id); + federate_info_t* fed = (federate_info_t*)GET_FED_INFO(e->id); // Check wether there is already a pending grant // And check the pending provisional grant as well @@ -160,21 +160,20 @@ void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { lf_cond_wait(&sent_start_time); } + // Check if sending the tag advance grant needs to be delayed or not. + // Delay is needed when a federate has at least one absent upstream transient. + // Check if sending the tag advance grant needs to be delayed or not // Delay is needed when a federate has, at least one, absent upstream transient - int num_absent_upstram_transients = 0; - for (int j = 0; j < e->num_upstream; j++) { - federate_info_t* upstream = GET_FED_INFO(e->upstream[j]); - // Do Ignore this enclave if it no longer connected. 
- if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { - num_absent_upstram_transients++; - break; - } - } - if (num_absent_upstram_transients > 0) { - notify_tag_advance_grant_delayed(e, tag); - } else { + federate_info_t* fed = GET_FED_INFO(e->id); + if (!fed->has_upstream_transient_federates) { notify_tag_advance_grant_immediate(e, tag); + } else { + if (get_num_absent_upstream_transients(fed) > 0) { + notify_tag_advance_grant_delayed(fed, tag); + } else { + notify_tag_advance_grant_immediate(e, tag); + } } } @@ -273,7 +272,7 @@ static void* pending_provisional_grant_thread(void* federate) { * If a tag advance grant or a provisional one is pending, then do not proceed * with the thread creation. * - * @param e The scheduling node. + * @param fed The federate. * @param tag The provisional tag to grant. */ static void notify_provisional_tag_advance_grant_delayed(scheduling_node_t* e, tag_t tag) { @@ -281,13 +280,13 @@ static void notify_provisional_tag_advance_grant_delayed(scheduling_node_t* e, t // Proceed with the delayed provisional tag grant notification only if // there is no pending grant and no provisional pending grant - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(&rti_mutex); if ((lf_tag_compare(fed->pending_grant, NEVER_TAG) == 0) && (lf_tag_compare(fed->pending_provisional_grant, NEVER_TAG) >= 0)) { fed->pending_provisional_grant = tag; lf_thread_create(&(fed->pending_provisional_grant_thread_id), pending_provisional_grant_thread, fed); } - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); } void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { @@ -304,98 +303,15 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { // Check if sending the tag advance grant needs to be delayed or not // Delay is needed when a federate has, at least one, absent upstream transient - int num_absent_upstram_transients = 0; - for (int j = 0; j < e->num_upstream; j++) { - federate_info_t* 
upstream = GET_FED_INFO(e->upstream[j]); - // Do Ignore this enclave if it no longer connected. - if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { - num_absent_upstram_transients++; - } - } - if (num_absent_upstram_transients > 0) { - notify_provisional_tag_advance_grant_delayed(e, tag); - } else { + federate_info_t* fed = GET_FED_INFO(e->id); + if (!fed->has_upstream_transient_federates) { notify_provisional_tag_advance_grant_immediate(e, tag); - } -} - -/** - * Thread that sleeps for a period of time, and then wakes up to check if - * a provisional tag advance grant needs to be sent. That is, if the pending - * provisional tag have not been reset to NEVER_TAG, the provisional tag advance - * grant will be immediate. - * - * @param federate the federate whose provisional tag advance grant needs to be delayed. - */ -void* pending_provisional_grant_thread(void* federate) { - federate_info_t* fed = (federate_info_t*)federate; - - interval_t sleep_interval = fed->pending_provisional_grant.time - lf_time_physical(); - if (sleep_interval > 0) { - lf_sleep(sleep_interval); - } - - lf_mutex_lock(&rti_mutex); - - // If the pending grant becomes NEVER_TAG, then this means that it should - // not be sent - if (lf_tag_compare(fed->pending_provisional_grant, NEVER_TAG) != 0) { - notify_provisional_tag_advance_grant_immediate(&(fed->enclave), fed->pending_provisional_grant); - fed->pending_provisional_grant = NEVER_TAG; - } - lf_mutex_unlock(&rti_mutex); -} - -/** - * Notify a provisional tag advance grant (PTAG) message to the specified federate - * after the physical time reaches the tag. A thread is created to this end. - * - * If a tag advance grant or a provisional one is pending, then do not proceed - * with the thread creation. - * - * @param e The scheduling node. - * @param tag The provisional tag to grant. 
- */ -void notify_provisional_tag_advance_grant_delayed(scheduling_node_t* e, tag_t tag) { - federate_info_t* fed = (federate_info_t*)e; - - // Proceed with the delayed provisional tag grant notification only if - // there is no pending grant and no provisional pending grant - lf_mutex_lock(&rti_mutex); - if ((lf_tag_compare(fed->pending_grant, NEVER_TAG) == 0) && - (lf_tag_compare(fed->pending_provisional_grant, NEVER_TAG) >= 0)) { - fed->pending_provisional_grant = tag; - lf_thread_create(&(fed->pending_provisional_grant_thread_id), pending_provisional_grant_thread, fed); - } - lf_mutex_unlock(&rti_mutex); -} - -void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { - if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || - lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { - return; - } - // Need to make sure that the destination federate's thread has already - // sent the starting MSG_TYPE_TIMESTAMP message. - while (e->state == PENDING) { - // Need to wait here. - lf_cond_wait(&sent_start_time); - } - - // Check if sending the tag advance grant needs to be delayed or not - // Delay is needed when a federate has, at least one, absent upstream transient - int num_absent_upstram_transients = 0; - for (int j = 0; j < e->num_upstream; j++) { - federate_info_t* upstream = GET_FED_INFO(e->upstream[j]); - // Do Ignore this enclave if it no longer connected. 
- if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { - num_absent_upstram_transients++; - } - } - if (num_absent_upstram_transients > 0) { - notify_provisional_tag_advance_grant_delayed(e, tag); } else { - notify_provisional_tag_advance_grant_immediate(e, tag); + if (get_num_absent_upstream_transients(fed) > 0) { + notify_provisional_tag_advance_grant_delayed(fed, tag); + } else { + notify_provisional_tag_advance_grant_immediate(e, tag); + } } } @@ -2161,10 +2077,48 @@ int32_t start_rti_server(uint16_t port) { return rti_remote->socket_descriptor_TCP; } +/** + * Iterate over the federates and sets 'has_upstream_transient_federates'. + * Once done, check that no transient federate has an upstream transient federate. + + * @return true for success, false for failure. + */ +static bool set_has_upstream_transient_federates_parameter_and_check() { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + for (int j = 0; j < fed->enclave.num_upstream; j++) { + federate_info_t* upstream_fed = GET_FED_INFO(fed->enclave.upstream[j]); + if (upstream_fed->is_transient) { + fed->has_upstream_transient_federates = true; + break; + } + } + } + + // Now check that no transient has an upstream transient + // FIXME: Do we really need this? Or should it be the job of the validator? + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->is_transient && fed->has_upstream_transient_federates) { + return false; + } + } + + return true; +} + void wait_for_federates(int socket_descriptor) { // Wait for connections from persistent federates and create a thread for each. 
lf_connect_to_persistent_federates(socket_descriptor); + // Set has_upstream_transient_federates parameter in all federates and check + // that is no more than one level of transiency + if (rti_remote->number_of_transient_federates > 0) { + if (!set_has_upstream_transient_federates_parameter_and_check()) { + lf_print_error_and_exit("RTI: Transient federates cannot have transient upstreams!"); + } + } + // All persistent federates have connected. lf_print("RTI: All expected persistent federates have connected. Starting execution."); if (rti_remote->number_of_transient_federates > 0) { diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index 17196e00e..423696fee 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -67,6 +67,8 @@ typedef struct federate_info_t { // RTI has not been informed of the port number. struct in_addr server_ip_addr; // Information about the IP address of the socket // server of the federate. + bool has_upstream_transient_federates; // Indicates whether the federate has uptream + // transient federates bool is_transient; // Indicates whether the federate is transient or persistent. 
tag_t effective_start_tag; // Records the start time of the federate, which is // mainly useful for transient federates From ece11b021c4b79a91ffb9684737b8a792838d58a Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 29 Feb 2024 15:58:53 +0100 Subject: [PATCH 041/148] Fix the call to tracepoint() to account for the refactoring of the trace mechanism --- core/federated/RTI/rti_remote.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index d1ff93a76..74be98fad 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -726,7 +726,7 @@ void handle_stop_request_message(federate_info_t* fed) { continue; } if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_STOP_REQ, f->enclave.id, &rti_remote->base.max_stop_tag); + (send_STOP_REQ, f->enclave.id, &rti_remote->base.max_stop_tag); } write_to_socket_fail_on_error(&f->socket, MSG_TYPE_STOP_REQUEST_LENGTH, stop_request_buffer, &rti_mutex, "RTI failed to forward MSG_TYPE_STOP_REQUEST message to federate %d.", @@ -854,10 +854,9 @@ void send_start_tag(federate_info_t* my_fed, instant_t federation_start_time, ta encode_int64(swap_bytes_if_big_endian_int64(start_time), &start_time_buffer[1]); if (rti_remote->base.tracing_enabled) { - tag_t tag = {.time = start_time, .microstep = 0}; - tracepoint_rti_to_federate(rti_remote->base.trace, send_TIMESTAMP, my_fed->enclave.id, &tag); + tracepoint_rti_to_federate(send_TIMESTAMP, my_fed->enclave.id, &federate_start_tag); } - if (write_to_socket(my_fed->socket, MSG_TYPE_TIMESTAMP_LENGTH, start_time_buffer)) { + if (write_to_socket(my_fed->socket, MSG_TYPE_TIMESTAMP_START_LENGTH, start_time_buffer)) { lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); } @@ -1417,7 +1416,7 @@ static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_ LF_PRINT_DEBUG("RTI received federation ID: 
%s.", federation_id_received); if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(rti_remote->base.trace, receive_FED_ID, fed_id, NULL); + tracepoint_rti_from_federate(receive_FED_ID, fed_id, NULL); } // Compare the received federation ID to mine. if (strncmp(rti_remote->federation_id, federation_id_received, federation_id_length) != 0) { @@ -1425,7 +1424,7 @@ static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_ lf_print_warning("Federate from another federation %s attempted to connect to RTI in federation %s.", federation_id_received, rti_remote->federation_id); if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); + tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); } send_reject(socket_id, FEDERATION_ID_DOES_NOT_MATCH); return -1; @@ -1434,7 +1433,7 @@ static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_ // Federate ID is out of range. lf_print_error("RTI received federate ID %d, which is out of range.", fed_id); if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); + tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); } send_reject(socket_id, FEDERATE_ID_OUT_OF_RANGE); return -1; @@ -1449,7 +1448,7 @@ static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_ if (!is_transient) { lf_print_error("RTI received duplicate federate ID: %d.", fed_id); if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); + tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); } send_reject(socket_id, FEDERATE_ID_IN_USE); return -1; @@ -1459,7 +1458,7 @@ static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_ Only one hot swap operation is allowed at a time.", fed_id, hot_swap_federate->enclave.id); if (rti_remote->base.tracing_enabled) { - 
tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); + tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); } send_reject(socket_id, FEDERATE_ID_IN_USE); return -1; @@ -1871,7 +1870,7 @@ void send_stop(federate_info_t* fed) { lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP, fed->enclave.id, NULL); + tracepoint_rti_to_federate(send_STOP, fed->enclave.id, NULL); } write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); @@ -2035,6 +2034,7 @@ void initialize_federate(federate_info_t* fed, uint16_t id) { strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); fed->server_ip_addr.s_addr = 0; fed->server_port = -1; + fed->has_upstream_transient_federates = false; fed->is_transient = true; fed->effective_start_tag = NEVER_TAG; fed->pending_grant = NEVER_TAG; From c04bfc129df062325df53a7b43d7f164461d2ab5 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 29 Feb 2024 16:12:58 +0100 Subject: [PATCH 042/148] Fix a call to tracepoint --- core/federated/RTI/rti_remote.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 74be98fad..13d52e752 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -726,7 +726,7 @@ void handle_stop_request_message(federate_info_t* fed) { continue; } if (rti_remote->base.tracing_enabled) { - (send_STOP_REQ, f->enclave.id, &rti_remote->base.max_stop_tag); + tracepoint_rti_to_federate(send_STOP_REQ, f->enclave.id, &rti_remote->base.max_stop_tag); } write_to_socket_fail_on_error(&f->socket, MSG_TYPE_STOP_REQUEST_LENGTH, stop_request_buffer, &rti_mutex, "RTI failed to forward MSG_TYPE_STOP_REQUEST message to federate %d.", From e7f806a035a43320d097525f401e04e8ce1e3693 
Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 29 Feb 2024 17:44:06 +0100 Subject: [PATCH 043/148] Change _lf_fed_STA_offset to lf_fed_STA_offset and _lf_set_stop_tag to lf_set_sto_tag --- core/federated/federate.c | 2 +- core/threaded/reactor_threaded.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 5e97cbfc2..66785337f 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2677,7 +2677,7 @@ void lf_stop() { new_stop_tag.time = env[i].current_tag.time; new_stop_tag.microstep = env[i].current_tag.microstep + 1; - _lf_set_stop_tag(&env[i], new_stop_tag); + lf_set_stop_tag(&env[i], new_stop_tag); lf_print("Setting the stop tag of env %d to " PRINTF_TAG ".", i, env[i].stop_tag.time - start_time, env[i].stop_tag.microstep); diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index ede57d729..fbba5a76c 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -610,7 +610,7 @@ void _lf_initialize_start_tag(environment_t* env) { while (!wait_until(effective_start_tag.time + _lf_fed_STA_offset, &env->event_q_changed)) { }; LF_PRINT_DEBUG("Done waiting for effective start time + STA offset " PRINTF_TIME ".", - effective_start_tag.time + _lf_fed_STA_offset); + effective_start_tag.time + lf_fed_STA_offset); LF_PRINT_DEBUG("Physical time is ahead of current time by " PRINTF_TIME ". 
This should be close to the STA offset.", lf_time_physical() - effective_start_tag.time); From a9e70121bfcb1724767710b9823fe1dc5eb97afa Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 29 Feb 2024 19:13:22 +0100 Subject: [PATCH 044/148] Fix the call to tracepoint in federate.c --- core/federated/federate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 66785337f..b94ebeb47 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1420,7 +1420,7 @@ void lf_stop() { */ void handle_stop() { // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_STOP, _lf_my_fed_id, NULL); + tracepoint_federate_from_rti(receive_STOP, _lf_my_fed_id, NULL); lf_print("Received from RTI a MSG_TYPE_STOP at physical time " PRINTF_TIME ".", lf_time_physical()); From ca0ac6eb4f5f8841b335c6334a3a3ce33a394760 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 14 Mar 2024 23:01:15 +0100 Subject: [PATCH 045/148] Add comments and more formatting --- core/federated/RTI/main.c | 5 +---- core/federated/RTI/rti_remote.h | 3 +-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/core/federated/RTI/main.c b/core/federated/RTI/main.c index 5f27092d1..dc4cbeb53 100644 --- a/core/federated/RTI/main.c +++ b/core/federated/RTI/main.c @@ -339,10 +339,7 @@ int main(int argc, const char* argv[]) { } lf_print("Starting RTI for a total of %d federates, with %d being transient, in federation ID %s", - rti.base.number_of_scheduling_nodes, rti.number_of_transient_federates, - - rti.federation_id); - + rti.base.number_of_scheduling_nodes, rti.number_of_transient_federates, rti.federation_id); assert(rti.base.number_of_scheduling_nodes < UINT16_MAX); // Allocate memory for the federates diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index 423696fee..8432f93ea 100644 --- a/core/federated/RTI/rti_remote.h +++ 
b/core/federated/RTI/rti_remote.h @@ -78,7 +78,6 @@ typedef struct federate_info_t { // tag grant lf_thread_t pending_provisional_grant_thread_id; // The ID of the thread handling // the pending provitional tag grant - } federate_info_t; /** @@ -187,7 +186,7 @@ typedef struct rti_remote_t { int32_t number_of_connected_transient_federates; /** - * + * Indicates the life cycle phase of the federation. */ federation_life_cycle_phase phase; } rti_remote_t; From 8080a527473f9476ef4c4927bea33e25710fe2c0 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 14 Mar 2024 23:02:49 +0100 Subject: [PATCH 046/148] Fix bugs due to rebase + Tracepoint lf_stop() --- core/federated/RTI/rti_remote.c | 23 +++++++++-------------- core/threaded/reactor_threaded.c | 7 +++---- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 13d52e752..d8f524ba6 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -130,9 +130,7 @@ static void* pending_grant_thread(void* federate) { * @param fed The federate. * @param tag The tag to grant. */ -static void notify_tag_advance_grant_delayed(scheduling_node_t* e, tag_t tag) { - federate_info_t* fed = (federate_info_t*)GET_FED_INFO(e->id); - +static void notify_tag_advance_grant_delayed(federate_info_t* fed, tag_t tag) { // Check wether there is already a pending grant // And check the pending provisional grant as well lf_mutex_lock(&rti_mutex); @@ -275,9 +273,7 @@ static void* pending_provisional_grant_thread(void* federate) { * @param fed The federate. * @param tag The provisional tag to grant. 
*/ -static void notify_provisional_tag_advance_grant_delayed(scheduling_node_t* e, tag_t tag) { - federate_info_t* fed = (federate_info_t*)e; - +static void notify_provisional_tag_advance_grant_delayed(federate_info_t* fed, tag_t tag) { // Proceed with the delayed provisional tag grant notification only if // there is no pending grant and no provisional pending grant LF_MUTEX_LOCK(&rti_mutex); @@ -849,9 +845,10 @@ void send_start_tag(federate_info_t* my_fed, instant_t federation_start_time, ta // message. // In the startup phase, federates will receive identical start_time and // effective_start_tag - unsigned char start_time_buffer[MSG_TYPE_TIMESTAMP_LENGTH]; - start_time_buffer[0] = MSG_TYPE_TIMESTAMP; + unsigned char start_time_buffer[MSG_TYPE_TIMESTAMP_START_LENGTH]; + start_time_buffer[0] = MSG_TYPE_TIMESTAMP_START; encode_int64(swap_bytes_if_big_endian_int64(start_time), &start_time_buffer[1]); + encode_tag(&(start_time_buffer[1 + sizeof(instant_t)]), federate_start_tag); if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(send_TIMESTAMP, my_fed->enclave.id, &federate_start_tag); @@ -1372,7 +1369,7 @@ static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_ // First byte received is the message type. if (buffer[0] != MSG_TYPE_FED_IDS) { if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); + tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); } if (buffer[0] == MSG_TYPE_P2P_SENDING_FED_ID || buffer[0] == MSG_TYPE_P2P_TAGGED_MESSAGE) { // The federate is trying to connect to a peer, not to the RTI. @@ -1891,15 +1888,13 @@ void* lf_connect_to_transient_federates_thread(void* nothing) { // The following blocks until a federate connects. 
int socket_id = -1; while (1) { - if (!rti_remote->all_persistent_federates_exited) { - return NULL; - } + // if (!rti_remote->all_persistent_federates_exited) { + // return NULL; + // } socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); if (socket_id >= 0) { // Got a socket break; - } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { - lf_print_error_system_failure("RTI failed to accept the socket."); } else { // Try again lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index fbba5a76c..5c66b4772 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -582,14 +582,13 @@ void _lf_initialize_start_tag(environment_t* env) { } _lf_initialize_timers(env); + env->current_tag = effective_start_tag; #if defined FEDERATED_DECENTRALIZED // If we have a non-zero STA offset, then we need to allow messages to arrive // at the start time. To avoid spurious STP violations, we temporarily // set the current time back by the STA offset. - env->current_tag = - (tag_t){.time = effective_start_tag.time - lf_fed_STA_offset, .microstep = effective_start_tag.microstep}; - + env->current_tag.time -= lf_fed_STA_offset; LF_PRINT_LOG("Waiting for start time " PRINTF_TIME " plus STA " PRINTF_TIME ".", start_time, lf_fed_STA_offset); #else // For other than federated decentralized execution, there is no lf_fed_STA_offset variable defined. @@ -632,7 +631,7 @@ void _lf_initialize_start_tag(environment_t* env) { // from exceeding the timestamp of the message. It will remove that barrier // once the complete message has been read. Here, we wait for that barrier // to be removed, if appropriate before proceeding to executing tag (0,0). 
- _lf_wait_on_tag_barrier(env, (tag_t){.time = start_time, .microstep = 0}); + _lf_wait_on_tag_barrier(env, effective_start_tag); lf_spawn_staa_thread(); #else // NOT FEDERATED_DECENTRALIZED From df2eaae4a23e5a184326bbe7d78c896bfbe5dd4d Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Fri, 15 Mar 2024 00:30:50 +0100 Subject: [PATCH 047/148] Code review: document that MSG_TYPE_FED_IDS include the type of the federate (persistent or transient) --- include/core/federated/network/net_common.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index 712184c57..21e54c9cb 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -44,10 +44,10 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * When it has successfully opened a TCP connection, the first message it sends * to the RTI is a MSG_TYPE_FED_IDS message, which contains the ID of this federate * within the federation, contained in the global variable _lf_my_fed_id - * in the federate code - * (which is initialized by the code generator) and the unique ID of - * the federation, a GUID that is created at run time by the generated script - * that launches the federation. + * in the federate code (which is initialized by the code generator), + * the type of this federate (persistent (0) or transient (1)), + * and the unique ID of the federation, a GUID that is created at run time by the + * generated script that launches the federation. * If you launch the federates and the RTI manually, rather than using the script, * then the federation ID is a string that is optionally given to the federate * on the command line when it is launched. 
The federate will connect From a705c11221f87b786ee8e46f1f58e641658380b3 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Fri, 15 Mar 2024 00:43:15 +0100 Subject: [PATCH 048/148] Add return NULL in void* functions --- core/federated/RTI/rti_remote.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index d8f524ba6..19ff51254 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -261,6 +261,7 @@ static void* pending_provisional_grant_thread(void* federate) { fed->pending_provisional_grant = NEVER_TAG; } lf_mutex_unlock(&rti_mutex); + return NULL; } /** @@ -1989,6 +1990,7 @@ void* lf_connect_to_transient_federates_thread(void* nothing) { } } } + return NULL; } void* respond_to_erroneous_connections(void* nothing) { From 279d1bba6395869a8f4424033d9c31010bc3c35c Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 4 Mar 2024 00:43:47 +0100 Subject: [PATCH 049/148] Use a tag queue to manage delayed grants --- core/federated/RTI/rti_remote.c | 199 ++++++++++++++++++++------------ core/federated/RTI/rti_remote.h | 6 - 2 files changed, 128 insertions(+), 77 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 19ff51254..276bdb083 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -29,6 +29,7 @@ #include "rti_remote.h" #include "net_util.h" #include +#include "clock.h" // For lf_clock_cond_timedwait() // Global variables defined in tag.c: extern instant_t start_time; @@ -56,11 +57,29 @@ bool _lf_federate_reports_error = false; lf_mutex_t rti_mutex; lf_cond_t received_start_times; lf_cond_t sent_start_time; +lf_cond_t updated_delayed_grants; extern int lf_critical_section_enter(environment_t* env) { return lf_mutex_lock(&rti_mutex); } extern int lf_critical_section_exit(environment_t* env) { return lf_mutex_unlock(&rti_mutex); } +/** + * Find the number of non connected upstream transients + * 
@param fed The federate + * @return the number of non connected upstream transients + */ +static int get_num_absent_upstream_transients(federate_info_t* fed) { + int num_absent_upstream_transients = 0; + for (int j = 0; j < fed->enclave.num_upstream; j++) { + federate_info_t* upstream = GET_FED_INFO(fed->enclave.upstream[j]); + // Do Ignore this enclave if it no longer connected. + if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { + num_absent_upstream_transients++; + } + } + return num_absent_upstream_transients; +} + /** * Notify a tag advance grant (TAG) message to the specified federate immediately. * @@ -168,7 +187,7 @@ void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { notify_tag_advance_grant_immediate(e, tag); } else { if (get_num_absent_upstream_transients(fed) > 0) { - notify_tag_advance_grant_delayed(fed, tag); + notify_grant_delayed(fed, tag, false); } else { notify_tag_advance_grant_immediate(e, tag); } @@ -236,56 +255,6 @@ static void notify_provisional_tag_advance_grant_immediate(scheduling_node_t* e, } } -/** - * Thread that sleeps for a period of time, and then wakes up to check if - * a provisional tag advance grant needs to be sent. That is, if the pending - * provisional tag have not been reset to NEVER_TAG, the provisional tag advance - * grant will be immediate. - * - * @param federate the federate whose provisional tag advance grant needs to be delayed. 
- */ -static void* pending_provisional_grant_thread(void* federate) { - federate_info_t* fed = (federate_info_t*)federate; - - interval_t sleep_interval = fed->pending_provisional_grant.time - lf_time_physical(); - if (sleep_interval > 0) { - lf_sleep(sleep_interval); - } - - lf_mutex_lock(&rti_mutex); - - // If the pending grant becomes NEVER_TAG, then this means that it should - // not be sent - if (lf_tag_compare(fed->pending_provisional_grant, NEVER_TAG) != 0) { - notify_provisional_tag_advance_grant_immediate(&(fed->enclave), fed->pending_provisional_grant); - fed->pending_provisional_grant = NEVER_TAG; - } - lf_mutex_unlock(&rti_mutex); - return NULL; -} - -/** - * Notify a provisional tag advance grant (PTAG) message to the specified federate - * after the physical time reaches the tag. A thread is created to this end. - * - * If a tag advance grant or a provisional one is pending, then do not proceed - * with the thread creation. - * - * @param fed The federate. - * @param tag The provisional tag to grant. 
- */ -static void notify_provisional_tag_advance_grant_delayed(federate_info_t* fed, tag_t tag) { - // Proceed with the delayed provisional tag grant notification only if - // there is no pending grant and no provisional pending grant - LF_MUTEX_LOCK(&rti_mutex); - if ((lf_tag_compare(fed->pending_grant, NEVER_TAG) == 0) && - (lf_tag_compare(fed->pending_provisional_grant, NEVER_TAG) >= 0)) { - fed->pending_provisional_grant = tag; - lf_thread_create(&(fed->pending_provisional_grant_thread_id), pending_provisional_grant_thread, fed); - } - LF_MUTEX_UNLOCK(&rti_mutex); -} - void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { @@ -305,7 +274,7 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { notify_provisional_tag_advance_grant_immediate(e, tag); } else { if (get_num_absent_upstream_transients(fed) > 0) { - notify_provisional_tag_advance_grant_delayed(fed, tag); + notify_grant_delayed(fed, tag, true); } else { notify_provisional_tag_advance_grant_immediate(e, tag); } @@ -979,16 +948,12 @@ void handle_timestamp(federate_info_t* my_fed) { continue; } - // Check the pending tag grant, if any, and keep it only if it is + // Check the pending grants, if any, and keep it only if it is // sonner than the effective start tag - if (lf_tag_compare(downstream->pending_grant, NEVER_TAG) != 0 && - lf_tag_compare(downstream->pending_grant, my_fed->effective_start_tag) > 0) { - downstream->pending_grant = NEVER_TAG; - } - // Same for the possible pending provisional tag grant - if (lf_tag_compare(downstream->pending_provisional_grant, NEVER_TAG) != 0 && - lf_tag_compare(downstream->pending_provisional_grant, my_fed->effective_start_tag) > 0) { - downstream->pending_provisional_grant = NEVER_TAG; + pqueue_delayed_grant_element_t* dge = + 
pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, downstream->enclave.id); + if (dge != NULL && lf_tag_compare(dge->base.tag, my_fed->effective_start_tag) > 0) { + pqueue_delayed_grants_remove(rti_remote->delayed_grants, dge); } } @@ -1877,6 +1842,7 @@ void send_stop(federate_info_t* fed) { } void* lf_connect_to_transient_federates_thread(void* nothing) { + initialize_lf_thread_id(); // This loop will continue to accept connections of transient federates, as // soon as there is room, or enable hot swap @@ -1993,6 +1959,88 @@ void* lf_connect_to_transient_federates_thread(void* nothing) { return NULL; } +void* lf_delayed_grants_thread(void* nothing) { + initialize_lf_thread_id(); + while (rti_remote->phase == execution_phase) { + if (pqueue_delayed_grants_size(rti_remote->delayed_grants) != 0) { + pqueue_delayed_grant_element_t* next; + // Do not pop, but rather read + next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); + instant_t next_time = next->base.tag.time; + // Wait for expiration, or a signal to stop or terminate. + if (lf_clock_cond_timedwait(&updated_delayed_grants, next_time) == LF_TIMEOUT) { + lf_print("RTI: lf_delayed_grants_thread() is sending grant to %d at " PRINTF_TIME ".", next->fed_id, + next_time - start_time); + // Time reached to send the grant. 
Do it for delayed grants with + // the same tag + LF_MUTEX_LOCK(&rti_mutex); + do { + next = pqueue_delayed_grants_pop(rti_remote->delayed_grants); + federate_info_t* fed = GET_FED_INFO(next->fed_id); + if (next->is_provisional) { + notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } else { + notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } + } while ((next = pqueue_delayed_grants_find_with_tag(rti_remote->delayed_grants, next->base.tag)) != NULL); + LF_MUTEX_UNLOCK(&rti_mutex); + } else { + // Waiting was interrupted, because of an update in the queue, or + // because this thread needs to terminate + lf_print("RTI: lf_delayed_grants_thread() did not send grant to %d at " PRINTF_TIME ", but rather terminated!", + next->fed_id, next_time - start_time); + } + } + } + // The federation is at the shutdown phase. All persistent federates exited. + // We can do a sanity check that the delayed_grants queue is empty. + // FIXME: If there are still pending grants, what does that mean? Maybe that the + // federation stopped after a request to stop (not a timeout). Therefore, we need + // cleanup, and free the memory... + // TODO: do it! +} + +void* lf_delayed_grants_thread(void* nothing) { + initialize_lf_thread_id(); + while (rti_remote->phase == execution_phase) { + if (pqueue_delayed_grants_size(rti_remote->delayed_grants) != 0) { + pqueue_delayed_grant_element_t* next; + // Do not pop, but rather read + next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); + instant_t next_time = next->base.tag.time; + // Wait for expiration, or a signal to stop or terminate. + if (lf_clock_cond_timedwait(&updated_delayed_grants, next_time) == LF_TIMEOUT) { + lf_print("RTI: lf_delayed_grants_thread() is sending grant to %d at " PRINTF_TIME ".", next->fed_id, + next_time - start_time); + // Time reached to send the grant. 
Do it for delayed grants with + // the same tag + LF_MUTEX_LOCK(&rti_mutex); + do { + next = pqueue_delayed_grants_pop(rti_remote->delayed_grants); + federate_info_t* fed = GET_FED_INFO(next->fed_id); + if (next->is_provisional) { + notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } else { + notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } + } while ((next = pqueue_delayed_grants_find_with_tag(rti_remote->delayed_grants, next->base.tag)) != NULL); + LF_MUTEX_UNLOCK(&rti_mutex); + } else { + // Waiting was interrupted, because of an update in the queue, or + // because this thread needs to terminate + lf_print("RTI: lf_delayed_grants_thread() did not send grant to %d at " PRINTF_TIME ", but rather terminated!", + next->fed_id, next_time - start_time); + } + } + } + // The federation is at the shutdown phase. All persistent federates exited. + // We can do a sanity check that the delayed_grants queue is empty. + // FIXME: If there are still pending grants, what does that mean? Maybe that the + // federation stopped after a request to stop (not a timeout). Therefore, we need + // cleanup, and free the memory... + // TODO: do it! +} + void* respond_to_erroneous_connections(void* nothing) { initialize_lf_thread_id(); while (true) { @@ -2034,8 +2082,6 @@ void initialize_federate(federate_info_t* fed, uint16_t id) { fed->has_upstream_transient_federates = false; fed->is_transient = true; fed->effective_start_tag = NEVER_TAG; - fed->pending_grant = NEVER_TAG; - fed->pending_provisional_grant = NEVER_TAG; } void reset_transient_federate(federate_info_t* fed) { @@ -2051,8 +2097,6 @@ void reset_transient_federate(federate_info_t* fed) { fed->requested_stop = false; fed->is_transient = true; fed->effective_start_tag = NEVER_TAG; - fed->pending_grant = NEVER_TAG; - fed->pending_provisional_grant = NEVER_TAG; // FIXME: There is room though to check if the interface has changed??? Do we allow this? 
} @@ -2076,11 +2120,15 @@ int32_t start_rti_server(uint16_t port) { /** * Iterate over the federates and sets 'has_upstream_transient_federates'. - * Once done, check that no transient federate has an upstream transient federate. + * Once done, check that no transient federate has an upstream transient federate, + * and compute the number of persistent federates that do have upstream transients, + * which is the maximun number of delayed grants that can be pending at the same time. + * This is useful for initialyzing the queue of delayed grants. - * @return true for success, false for failure. + * @return -1, if there is more than one level of transiency, else, the number of + * persistents that have an upstream transient */ -static bool set_has_upstream_transient_federates_parameter_and_check() { +static int set_has_upstream_transient_federates_parameter_and_check() { for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { federate_info_t* fed = GET_FED_INFO(i); for (int j = 0; j < fed->enclave.num_upstream; j++) { @@ -2094,14 +2142,18 @@ static bool set_has_upstream_transient_federates_parameter_and_check() { // Now check that no transient has an upstream transient // FIXME: Do we really need this? Or should it be the job of the validator? 
+ uint16_t max_number_of_delayed_grants = 0; for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { federate_info_t* fed = GET_FED_INFO(i); if (fed->is_transient && fed->has_upstream_transient_federates) { - return false; + return -1; + } + if (!fed->is_transient && fed->has_upstream_transient_federates) { + max_number_of_delayed_grants++; } } - return true; + return max_number_of_delayed_grants; } void wait_for_federates(int socket_descriptor) { @@ -2111,9 +2163,11 @@ void wait_for_federates(int socket_descriptor) { // Set has_upstream_transient_federates parameter in all federates and check // that is no more than one level of transiency if (rti_remote->number_of_transient_federates > 0) { - if (!set_has_upstream_transient_federates_parameter_and_check()) { + uint16_t max_number_of_pending_grants = set_has_upstream_transient_federates_parameter_and_check(); + if (max_number_of_pending_grants == -1) { lf_print_error_and_exit("RTI: Transient federates cannot have transient upstreams!"); } + rti_remote->delayed_grants = pqueue_delayed_grants_init(max_number_of_pending_grants); } // All persistent federates have connected. @@ -2128,6 +2182,7 @@ void wait_for_federates(int socket_descriptor) { // federation, need to respond. Start a separate thread to do that. lf_thread_t responder_thread; lf_thread_t transient_thread; + lf_thread_t delayed_grants_thread; // If the federation does not include transient federates, then respond to // erronous connections. Otherwise, continue to accept transients joining and @@ -2136,6 +2191,7 @@ void wait_for_federates(int socket_descriptor) { lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); } else if (rti_remote->number_of_transient_federates > 0) { lf_thread_create(&transient_thread, lf_connect_to_transient_federates_thread, NULL); + lf_thread_create(&delayed_grants_thread, lf_delayed_grants_thread, NULL); } // Wait for persistent federate threads to exit. 
@@ -2197,6 +2253,7 @@ void initialize_RTI(rti_remote_t* rti) { LF_MUTEX_INIT(&rti_mutex); LF_COND_INIT(&received_start_times, &rti_mutex); LF_COND_INIT(&sent_start_time, &rti_mutex); + LF_COND_INIT(&updated_delayed_grants, &rti_mutex); initialize_rti_common(&rti_remote->base); rti_remote->base.mutex = &rti_mutex; diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index 8432f93ea..c27fd9c7a 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -72,12 +72,6 @@ typedef struct federate_info_t { bool is_transient; // Indicates whether the federate is transient or persistent. tag_t effective_start_tag; // Records the start time of the federate, which is // mainly useful for transient federates - tag_t pending_grant; // The pending tag advance grant - tag_t pending_provisional_grant; // The pending provisional tag advance grant - lf_thread_t pending_grant_thread_id; // The ID of the thread handling the pending - // tag grant - lf_thread_t pending_provisional_grant_thread_id; // The ID of the thread handling - // the pending provitional tag grant } federate_info_t; /** From c2d6779ddf415e85aec5ff001cbb1bd4acd2b058 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Fri, 15 Mar 2024 01:54:56 +0100 Subject: [PATCH 050/148] Fix missing code from the previous commit --- core/federated/RTI/rti_remote.c | 245 ++++++++++++++++++++------------ core/federated/RTI/rti_remote.h | 25 ++++ 2 files changed, 176 insertions(+), 94 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 276bdb083..b879f6bcb 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -63,6 +63,157 @@ extern int lf_critical_section_enter(environment_t* env) { return lf_mutex_lock( extern int lf_critical_section_exit(environment_t* env) { return lf_mutex_unlock(&rti_mutex); } +// Utility functions to simplify the call of pqueue_tag routines. +// These functions mainly do the casting. 
+// FIXME: Should we remove the queue parameter from the functions? + +/** + * @brief Creates a priority queue of delayed grants that is sorted by tags. + * + * @param nbr_delayed_grants The size. + * @return The dynamically allocated queue or NULL. + */ +pqueue_delayed_grants_t* pqueue_delayed_grants_init(uint16_t nbr_delayed_grants) { + return (pqueue_delayed_grants_t*)pqueue_tag_init((size_t)nbr_delayed_grants); +} + +/** + * @brief Return the size of the queue. + * + * @param q The queue. + * @return The size. + */ +size_t pqueue_delayed_grants_size(pqueue_delayed_grants_t* q) { return pqueue_tag_size((pqueue_tag_t*)q); } + +/** + * @brief Insert an\ delayed grant element into the queue. + * + * @param q The queue. + * @param e The delayed grant element to insert. + * @return 0 on success + */ +int pqueue_delayed_grants_insert(pqueue_delayed_grants_t* q, pqueue_delayed_grant_element_t* d) { + return pqueue_tag_insert((pqueue_tag_t*)q, (void*)d); +} + +/** + * @brief Pop the least-tag element from the queue. + * + * @param q The queue. + * @return NULL on error, otherwise the entry + */ +pqueue_delayed_grant_element_t* pqueue_delayed_grants_pop(pqueue_delayed_grants_t* q) { + return (pqueue_delayed_grant_element_t*)pqueue_tag_pop((pqueue_tag_t*)q); +} + +/** + * @brief Return highest-ranking element without removing it. + * + * @param q The queue. + * @return NULL on if the queue is empty, otherwise the delayed grant element. + */ +pqueue_delayed_grant_element_t* pqueue_delayed_grants_peek(pqueue_delayed_grants_t* q) { + return (pqueue_delayed_grant_element_t*)pqueue_tag_peek((pqueue_tag_t*)q); +} + +/** + * @brief Free all memory used by the queue including elements that are marked dynamic. + * + * @param q The queue. + */ +void pqueue_delayed_grants_free(pqueue_delayed_grants_t* q) { pqueue_tag_free((pqueue_tag_t*)q); } + +/** + * @brief Remove an item from the delayed grants queue. + * + * @param q The queue. + * @param e The entry to remove. 
+ */ +void pqueue_delayed_grants_remove(pqueue_delayed_grants_t* q, pqueue_delayed_grant_element_t* e) { + pqueue_tag_remove((pqueue_tag_t*)q, (void*)e); +} + +/** + * @brief Return the first item with the specified tag or NULL if there is none. + * @param q The queue. + * @param t The tag. + * @return An entry with the specified tag or NULL if there isn't one. + */ +pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_with_tag(pqueue_delayed_grants_t* q, tag_t t) { + return (pqueue_delayed_grant_element_t*)pqueue_tag_find_with_tag((pqueue_tag_t*)q, t); +} + +// Function that does not in pqueue_tag.c +/** + * @brief Return the first item with the specified federate id or NULL if there is none. + * @param q The queue. + * @param fed_id The federate id. + * @return An entry with the specified federate if or NULL if there isn't one. + */ + +pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_delayed_grants_t* q, uint16_t fed_id) { + pqueue_delayed_grant_element_t* dge; + pqueue_t* _q = (pqueue_t*)q; + if (!q || q->size == 1) + return NULL; + for (int i = 1; i <= q->size; i++) { + dge = (pqueue_delayed_grant_element_t*)q->d[i]; + if (dge->fed_id == fed_id) { + return dge; + } + } + return NULL; +} + +/** + * @brief Insert the delayed grant into the delayed_grants queue. + * + * In case there is already a grant for that federte, keep the soonest one. + * FIXME: Is that correct? + * + * @param fed The federate. + * @param tag The tag to grant. + * @param is_provisional State whther the grant is provisional. 
+ */ +static void notify_grant_delayed(federate_info_t* fed, tag_t tag, bool is_provisional) { + // Check wether there is already a pending grant, + // and check the pending provisional grant as well + // Iterate over the + LF_MUTEX_LOCK(&rti_mutex); + pqueue_delayed_grant_element_t* dge = + pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, fed->enclave.id); + if (dge == NULL) { + pqueue_delayed_grant_element_t* dge = + (pqueue_delayed_grant_element_t*)malloc(sizeof(pqueue_delayed_grant_element_t)); + dge->base.is_dynamic = 1; + dge->base.tag = tag; + dge->fed_id = fed->enclave.id; + dge->is_provisional = is_provisional; + pqueue_delayed_grants_insert(rti_remote->delayed_grants, dge); + lf_cond_broadcast(&updated_delayed_grants); + } else { + // FIXME: Decide what to do in this case... + // TODO: to do! + } + LF_MUTEX_UNLOCK(&rti_mutex); +} + +/** + * @brief Cancels the delayed grants of a federate by deleting then from the delayed_grants queue. + * * + * @param fed The federate. + */ +void notify_grant_canceled(federate_info_t* fed) { + LF_MUTEX_LOCK(&rti_mutex); + pqueue_delayed_grant_element_t* dge = + pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, fed->enclave.id); + if (dge != NULL) { + pqueue_delayed_grants_remove(rti_remote->delayed_grants, dge); + lf_cond_broadcast(&updated_delayed_grants); + } + LF_MUTEX_UNLOCK(&rti_mutex); +} + /** * Find the number of non connected upstream transients * @param fed The federate @@ -112,59 +263,6 @@ static void notify_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) } } -/** - * @brief Thread that sleeps for a period of time, and then wakes up to check if - * a tag advance grant needs to be sent. That is, if the pending tag has not - * been reset to NEVER_TAG, the tag advance grant will be immediate. - * - * @param federate the federate whose tag advance grant needs to be delayed. 
- */ -static void* pending_grant_thread(void* federate) { - federate_info_t* fed = (federate_info_t*)federate; - - interval_t sleep_interval = fed->pending_grant.time - lf_time_physical(); - if (sleep_interval > 0) { - lf_sleep(sleep_interval); - } - - lf_mutex_lock(&rti_mutex); - - // If the pending grant has become NEVER_TAG, then this means that it should - // not be sent. - if (lf_tag_compare(fed->pending_grant, NEVER_TAG) != 0) { - notify_tag_advance_grant_immediate(&(fed->enclave), fed->pending_grant); - fed->pending_grant = NEVER_TAG; - } - lf_mutex_unlock(&rti_mutex); - return NULL; -} - -/** - * Notify a tag advance grant (TAG) message to the specified federate after - * the physical time reaches the tag. A thread is created to this end. - * - * If a provisionl tag advance grant is pending, cancel it. If there is another - * pending tag advance grant, do not proceed with the thread creation. - * - * @param fed The federate. - * @param tag The tag to grant. - */ -static void notify_tag_advance_grant_delayed(federate_info_t* fed, tag_t tag) { - // Check wether there is already a pending grant - // And check the pending provisional grant as well - lf_mutex_lock(&rti_mutex); - if (lf_tag_compare(fed->pending_grant, NEVER_TAG) == 0) { - // If a tag is issued, then stop any possible provisional tag grant - fed->pending_grant = tag; - fed->pending_provisional_grant = NEVER_TAG; - lf_thread_create(&(fed->pending_grant_thread_id), pending_grant_thread, fed); - } else { - // If there is already a pending tag grant, then let it be sent first - // FIXME: Is this correct? - } - lf_mutex_unlock(&rti_mutex); -} - void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { @@ -2000,47 +2098,6 @@ void* lf_delayed_grants_thread(void* nothing) { // TODO: do it! 
} -void* lf_delayed_grants_thread(void* nothing) { - initialize_lf_thread_id(); - while (rti_remote->phase == execution_phase) { - if (pqueue_delayed_grants_size(rti_remote->delayed_grants) != 0) { - pqueue_delayed_grant_element_t* next; - // Do not pop, but rather read - next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); - instant_t next_time = next->base.tag.time; - // Wait for expiration, or a signal to stop or terminate. - if (lf_clock_cond_timedwait(&updated_delayed_grants, next_time) == LF_TIMEOUT) { - lf_print("RTI: lf_delayed_grants_thread() is sending grant to %d at " PRINTF_TIME ".", next->fed_id, - next_time - start_time); - // Time reached to send the grant. Do it for delayed grants with - // the same tag - LF_MUTEX_LOCK(&rti_mutex); - do { - next = pqueue_delayed_grants_pop(rti_remote->delayed_grants); - federate_info_t* fed = GET_FED_INFO(next->fed_id); - if (next->is_provisional) { - notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); - } else { - notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); - } - } while ((next = pqueue_delayed_grants_find_with_tag(rti_remote->delayed_grants, next->base.tag)) != NULL); - LF_MUTEX_UNLOCK(&rti_mutex); - } else { - // Waiting was interrupted, because of an update in the queue, or - // because this thread needs to terminate - lf_print("RTI: lf_delayed_grants_thread() did not send grant to %d at " PRINTF_TIME ", but rather terminated!", - next->fed_id, next_time - start_time); - } - } - } - // The federation is at the shutdown phase. All persistent federates exited. - // We can do a sanity check that the delayed_grants queue is empty. - // FIXME: If there are still pending grants, what does that mean? Maybe that the - // federation stopped after a request to stop (not a timeout). Therefore, we need - // cleanup, and free the memory... - // TODO: do it! 
-} - void* respond_to_erroneous_connections(void* nothing) { initialize_lf_thread_id(); while (true) { diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index c27fd9c7a..0ea2fd5bd 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -84,6 +84,20 @@ typedef enum clock_sync_stat { clock_sync_off, clock_sync_init, clock_sync_on } */ typedef enum federation_life_cycle_phase { startup_phase, execution_phase, shutdown_phase } federation_life_cycle_phase; +/** + * @brief The type for an element in a delayed grants priority queue that is sorted by tag. + */ +typedef struct pqueue_delayed_grant_element_t { + pqueue_tag_element_t base; + uint16_t fed_id; // Id of the federate with delayed grant of tag (in base) + bool is_provisional; // Boolean recoding if the delayed grant is provisional +} pqueue_delayed_grant_element_t; + +/** + * @brief Type of a delayed grants queue sorted by tags. + */ +typedef pqueue_tag_t pqueue_delayed_grants_t; + /** * Structure that an RTI instance uses to keep track of its own and its * corresponding federates' state. @@ -183,6 +197,12 @@ typedef struct rti_remote_t { * Indicates the life cycle phase of the federation. */ federation_life_cycle_phase phase; + + /** + * Queue of the pending grants, in case transient federates are absent and + * issuing grants to their downstreams need to be delayed. + */ + pqueue_delayed_grants_t* delayed_grants; } rti_remote_t; /** @@ -388,6 +408,11 @@ void lf_connect_to_persistent_federates(int socket_descriptor); */ void* lf_connect_to_transient_federates_thread(void* nothing); +/** + * Thread that manages the delayed grants using a priprity queue. + */ +void* lf_delayed_grants_thread(void* nothing); + /** * Thread to respond to new connections, which could be federates of other * federations who are attempting to join the wrong federation. 
From 2eb27a784f59b75b8147994ddd54c0086be9e9d2 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 4 Mar 2024 00:52:05 +0100 Subject: [PATCH 051/148] Fix the type of max_number_of_delayed_grants --- core/federated/RTI/rti_remote.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index b879f6bcb..438c42b51 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -168,7 +168,10 @@ pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_dela /** * @brief Insert the delayed grant into the delayed_grants queue. * - * In case there is already a grant for that federte, keep the soonest one. + * The insertion will cause the broadcast to cause the delayed_grants_thread to + * account for the update. + * + * In case there is already a grant for that federate, keep the soonest one. * FIXME: Is that correct? * * @param fed The federate. @@ -199,10 +202,17 @@ static void notify_grant_delayed(federate_info_t* fed, tag_t tag, bool is_provis } /** - * @brief Cancels the delayed grants of a federate by deleting then from the delayed_grants queue. - * * + * @brief Cancel a delayed grant by removing it from delayed_grants queue. + * + * The removal will cause the broadcast to cause the delayed_grants_thread to + * account for the update. + * + * In case there is already a grant for that federte, keep the soonest one. + * FIXME: Is that correct? + * * @param fed The federate. 
*/ + void notify_grant_canceled(federate_info_t* fed) { LF_MUTEX_LOCK(&rti_mutex); pqueue_delayed_grant_element_t* dge = @@ -2057,6 +2067,9 @@ void* lf_connect_to_transient_federates_thread(void* nothing) { return NULL; } +/** + * + */ void* lf_delayed_grants_thread(void* nothing) { initialize_lf_thread_id(); while (rti_remote->phase == execution_phase) { @@ -2199,7 +2212,7 @@ static int set_has_upstream_transient_federates_parameter_and_check() { // Now check that no transient has an upstream transient // FIXME: Do we really need this? Or should it be the job of the validator? - uint16_t max_number_of_delayed_grants = 0; + int max_number_of_delayed_grants = 0; for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { federate_info_t* fed = GET_FED_INFO(i); if (fed->is_transient && fed->has_upstream_transient_federates) { @@ -2220,7 +2233,7 @@ void wait_for_federates(int socket_descriptor) { // Set has_upstream_transient_federates parameter in all federates and check // that is no more than one level of transiency if (rti_remote->number_of_transient_federates > 0) { - uint16_t max_number_of_pending_grants = set_has_upstream_transient_federates_parameter_and_check(); + int max_number_of_pending_grants = set_has_upstream_transient_federates_parameter_and_check(); if (max_number_of_pending_grants == -1) { lf_print_error_and_exit("RTI: Transient federates cannot have transient upstreams!"); } From 1795f80e18351406e4cd9a306acaef0aa6de09fa Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 4 Mar 2024 01:10:07 +0100 Subject: [PATCH 052/148] Fix return void * --- core/federated/RTI/rti_remote.c | 1 + 1 file changed, 1 insertion(+) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 438c42b51..a87d18113 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -2109,6 +2109,7 @@ void* lf_delayed_grants_thread(void* nothing) { // federation stopped after a request to stop (not a timeout). 
Therefore, we need // cleanup, and free the memory... // TODO: do it! + return NULL; } void* respond_to_erroneous_connections(void* nothing) { From 1682395f6e9fab03d959025e4a6dc92bea7bf285 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 4 Mar 2024 16:04:15 +0100 Subject: [PATCH 053/148] Fix the use of condition variable updated_delayed_grants --- core/federated/RTI/rti_remote.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index a87d18113..5c30aefcc 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -193,10 +193,10 @@ static void notify_grant_delayed(federate_info_t* fed, tag_t tag, bool is_provis dge->fed_id = fed->enclave.id; dge->is_provisional = is_provisional; pqueue_delayed_grants_insert(rti_remote->delayed_grants, dge); - lf_cond_broadcast(&updated_delayed_grants); + lf_cond_signal(&updated_delayed_grants); } else { // FIXME: Decide what to do in this case... - // TODO: to do! + // TODO: do it!
} LF_MUTEX_UNLOCK(&rti_mutex); } @@ -219,7 +219,7 @@ void notify_grant_canceled(federate_info_t* fed) { pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, fed->enclave.id); if (dge != NULL) { pqueue_delayed_grants_remove(rti_remote->delayed_grants, dge); - lf_cond_broadcast(&updated_delayed_grants); + lf_cond_signal(&updated_delayed_grants); } LF_MUTEX_UNLOCK(&rti_mutex); } @@ -2072,6 +2072,10 @@ void* lf_connect_to_transient_federates_thread(void* nothing) { */ void* lf_delayed_grants_thread(void* nothing) { initialize_lf_thread_id(); + + // Wait for the first condition signal + lf_cond_wait(&updated_delayed_grants); + while (rti_remote->phase == execution_phase) { if (pqueue_delayed_grants_size(rti_remote->delayed_grants) != 0) { pqueue_delayed_grant_element_t* next; @@ -2079,7 +2083,7 @@ void* lf_delayed_grants_thread(void* nothing) { next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); instant_t next_time = next->base.tag.time; // Wait for expiration, or a signal to stop or terminate. - if (lf_clock_cond_timedwait(&updated_delayed_grants, next_time) == LF_TIMEOUT) { + if (lf_clock_cond_timedwait(&updated_delayed_grants, next_time) != LF_TIMEOUT) { lf_print("RTI: lf_delayed_grants_thread() is sending grant to %d at " PRINTF_TIME ".", next->fed_id, next_time - start_time); // Time reached to send the grant. 
Do it for delayed grants with @@ -2232,7 +2236,7 @@ void wait_for_federates(int socket_descriptor) { lf_connect_to_persistent_federates(socket_descriptor); // Set has_upstream_transient_federates parameter in all federates and check - // that is no more than one level of transiency + // that there is no more than one level of transiency if (rti_remote->number_of_transient_federates > 0) { int max_number_of_pending_grants = set_has_upstream_transient_federates_parameter_and_check(); if (max_number_of_pending_grants == -1) { From 5ad13a3710a8737f4d95aa401db8eb4ecefbc931 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 4 Mar 2024 16:16:55 +0100 Subject: [PATCH 054/148] Add documentation to lf_delayed_grants_thread() and adjust the exit condition --- core/federated/RTI/rti_remote.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 5c30aefcc..2869c3dc3 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -2068,7 +2068,11 @@ void* lf_connect_to_transient_federates_thread(void* nothing) { } /** - * + * This thread is responsible for managing the priority queue of delayed grants to be issued. + * It waits until the current time matches the highest priority tag time in the queue. + * If reached, it notifies the grant immediately. If, however, the current time has not yet + * reached the highest priority tag and the queue has been updated (either by inserting or + * canceling an entry), the thread stops waiting and restarts the process again. 
*/ void* lf_delayed_grants_thread(void* nothing) { initialize_lf_thread_id(); @@ -2076,7 +2080,10 @@ void* lf_delayed_grants_thread(void* nothing) { // Wait for the first condition signal lf_cond_wait(&updated_delayed_grants); - while (rti_remote->phase == execution_phase) { + while (true) { + if (rti_remote->all_federates_exited) { + break; + } if (pqueue_delayed_grants_size(rti_remote->delayed_grants) != 0) { pqueue_delayed_grant_element_t* next; // Do not pop, but rather read From 508c484301886ba3a8cecc2c4fd4b85db561fb90 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 11 Mar 2024 05:57:42 +0100 Subject: [PATCH 055/148] Fix the segmentation fault happening when a second hot swap mechanism is invoked --- core/federated/RTI/rti_remote.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 2869c3dc3..13cac4488 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -2026,16 +2026,17 @@ void* lf_connect_to_transient_federates_thread(void* nothing) { // synchronization messages. lf_thread_create(&(hot_swap_federate->thread_id), federate_info_thread_TCP, hot_swap_federate); + LF_MUTEX_LOCK(&rti_mutex); // Redirect the federate in rti_remote rti_remote->base.scheduling_nodes[fed_id] = (scheduling_node_t*)hot_swap_federate; // Free the old federate memory and reset the Hot wap indicators // FIXME: Is this enough to free the memory allocated to the federate?
free(fed_old); - lf_mutex_lock(&rti_mutex); + hot_swap_federate = NULL; hot_swap_in_progress = false; - lf_mutex_unlock(&rti_mutex); - + hot_swap_old_resigned = false; + LF_MUTEX_UNLOCK(&rti_mutex); lf_print("RTI: Hot swap succeeded for federate %d.", fed_id); } else { lf_mutex_unlock(&rti_mutex); From aff416d825f1fd32614d7c68be4982437672b486 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 11 Mar 2024 06:05:38 +0100 Subject: [PATCH 056/148] Improve memory management when dealing with transients --- core/federated/RTI/rti_remote.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 13cac4488..a0117231c 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1382,8 +1382,6 @@ void* federate_info_thread_TCP(void* fed) { close(my_fed->socket); // from unistd.h // Manual clean, in case of a transient federate if (my_fed->is_transient) { - // FIXME: Aren't there transit messages anymore??? - // free_in_transit_message_q(my_fed->in_transit_message_tags); lf_print("RTI: Transient Federate %d thread exited.", my_fed->enclave.id); // Update the number of connected transient federates @@ -2004,8 +2002,6 @@ void* lf_connect_to_transient_federates_thread(void* nothing) { // Then send STOP federate_info_t* fed_old = GET_FED_INFO(fed_id); - hot_swap_federate->enclave.completed = fed_old->enclave.completed; - LF_PRINT_LOG("RTI: Send MSG_TYPE_STOP to old federate %d.", fed_id); send_stop(fed_old); LF_MUTEX_UNLOCK(&rti_mutex); @@ -2031,7 +2027,7 @@ void* lf_connect_to_transient_federates_thread(void* nothing) { rti_remote->base.scheduling_nodes[fed_id] = (scheduling_node_t*)hot_swap_federate; // Free the old federate memory and reset the Hot wap indicators - // FIXME: Is this enough to free the memory allocated to the federate? 
+ pqueue_tag_free(fed_old->in_transit_message_tags); free(fed_old); hot_swap_federate = NULL; hot_swap_in_progress = false; @@ -2168,19 +2164,16 @@ void initialize_federate(federate_info_t* fed, uint16_t id) { } void reset_transient_federate(federate_info_t* fed) { - fed->enclave.next_event = NEVER_TAG; - fed->enclave.state = NOT_CONNECTED; // Reset of the federate-related attributes fed->socket = -1; // No socket. fed->clock_synchronization_enabled = true; + // FIXME: The following two lines can be improved? + pqueue_tag_free(fed->in_transit_message_tags); fed->in_transit_message_tags = pqueue_tag_init(10); strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); fed->server_ip_addr.s_addr = 0; fed->server_port = -1; fed->requested_stop = false; - fed->is_transient = true; - fed->effective_start_tag = NEVER_TAG; - // FIXME: There is room though to check if the interface has changed??? Do we allow this? } int32_t start_rti_server(uint16_t port) { From 1ac4bbe870c0557118d4663dc35e7e937186a793 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 11 Mar 2024 06:08:18 +0100 Subject: [PATCH 057/148] Make sure the transient federate interface did not change when a new version is joining --- core/federated/RTI/rti_remote.c | 57 ++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index a0117231c..0cc43c502 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1623,11 +1623,18 @@ static int receive_connection_information(int* socket_id, uint16_t fed_id) { send_reject(socket_id, UNEXPECTED_MESSAGE); return 0; } else { - federate_info_t* fed; - if (hot_swap_in_progress) { - fed = hot_swap_federate; - } else { - fed = GET_FED_INFO(fed_id); + // In case of a transient federate that is joining again, or a hot swap, then + // check that the connection information did not change. 
+ federate_info_t* fed = GET_FED_INFO(fed_id); + federate_info_t* temp_fed = NULL; + if (lf_tag_compare(fed->effective_start_tag, NEVER_TAG) != 0) { + if (hot_swap_in_progress) { + fed = hot_swap_federate; + } else { + temp_fed = (federate_info_t*)calloc(1, sizeof(federate_info_t)); + initialize_federate(temp_fed, fed_id); + fed = temp_fed; + } } // Read the number of upstream and downstream connections fed->enclave.num_upstream = extract_int32(&(connection_info_header[1])); @@ -1680,6 +1687,46 @@ static int receive_connection_information(int* socket_id, uint16_t fed_id) { free(connections_info_body); } + + // NOTE: In this design, changes in the connections are not allowed. This means that the first + // instance to join __is__ the reference. If this policy is to be changed, then it is in + // the following lines will be updated accordingly. + if (hot_swap_in_progress || temp_fed != NULL) { + if (temp_fed == NULL) { + temp_fed = hot_swap_federate; + } + // Now, compare the previous and the new neighberhood structure + // Start with the number of upstreams and downstreams + bool reject = false; + if ((fed->enclave.num_upstream != temp_fed->enclave.num_upstream) || + (fed->enclave.num_downstream != temp_fed->enclave.num_downstream)) { + reject = true; + } else { + // Then check all upstreams and their delays + for (int i = 0; i < fed->enclave.num_upstream; i++) { + if ((fed->enclave.upstream[i] != temp_fed->enclave.upstream[i]) || + (fed->enclave.upstream_delay[i] != temp_fed->enclave.upstream_delay[i])) { + reject = true; + break; + } + } + if (!reject) { + // Finally, check all downstream federates + for (int i = 0; i < fed->enclave.num_downstream; i++) { + if (fed->enclave.downstream[i] != temp_fed->enclave.downstream[i]) { + reject = true; + break; + } + } + } + } + if (reject) { + if (temp_fed != hot_swap_federate) { + free(temp_fed); + } + return 0; + } + } } LF_PRINT_DEBUG("RTI received neighbor structure from federate %d.", fed_id); return 1; From 
42e00c8a2b8ed8e7c9bdfe819528a552767341d9 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 18 Mar 2024 05:51:52 +0100 Subject: [PATCH 058/148] Fix the segmentation fault error of the RTI --- core/federated/RTI/rti_remote.c | 44 ++++++++++++--------------------- 1 file changed, 16 insertions(+), 28 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 0cc43c502..79b7006b3 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -133,24 +133,13 @@ void pqueue_delayed_grants_remove(pqueue_delayed_grants_t* q, pqueue_delayed_gra pqueue_tag_remove((pqueue_tag_t*)q, (void*)e); } -/** - * @brief Return the first item with the specified tag or NULL if there is none. - * @param q The queue. - * @param t The tag. - * @return An entry with the specified tag or NULL if there isn't one. - */ -pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_with_tag(pqueue_delayed_grants_t* q, tag_t t) { - return (pqueue_delayed_grant_element_t*)pqueue_tag_find_with_tag((pqueue_tag_t*)q, t); -} - -// Function that does not in pqueue_tag.c +// Function that does not exist in pqueue_tag.c /** * @brief Return the first item with the specified federate id or NULL if there is none. * @param q The queue. * @param fed_id The federate id. * @return An entry with the specified federate if or NULL if there isn't one. 
*/ - pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_delayed_grants_t* q, uint16_t fed_id) { pqueue_delayed_grant_element_t* dge; pqueue_t* _q = (pqueue_t*)q; @@ -158,8 +147,10 @@ pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_dela return NULL; for (int i = 1; i <= q->size; i++) { dge = (pqueue_delayed_grant_element_t*)q->d[i]; - if (dge->fed_id == fed_id) { - return dge; + if (dge) { + if (dge->fed_id == fed_id) { + return dge; + } } } return NULL; @@ -2130,25 +2121,21 @@ void* lf_delayed_grants_thread(void* nothing) { } if (pqueue_delayed_grants_size(rti_remote->delayed_grants) != 0) { pqueue_delayed_grant_element_t* next; + // Do not pop, but rather read next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); instant_t next_time = next->base.tag.time; // Wait for expiration, or a signal to stop or terminate. - if (lf_clock_cond_timedwait(&updated_delayed_grants, next_time) != LF_TIMEOUT) { - lf_print("RTI: lf_delayed_grants_thread() is sending grant to %d at " PRINTF_TIME ".", next->fed_id, - next_time - start_time); - // Time reached to send the grant. Do it for delayed grants with - // the same tag + if (lf_clock_cond_timedwait(&updated_delayed_grants, next_time)) { + // Time reached to send the grant. 
Do it for delayed grants with the same tag LF_MUTEX_LOCK(&rti_mutex); - do { - next = pqueue_delayed_grants_pop(rti_remote->delayed_grants); - federate_info_t* fed = GET_FED_INFO(next->fed_id); - if (next->is_provisional) { - notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); - } else { - notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); - } - } while ((next = pqueue_delayed_grants_find_with_tag(rti_remote->delayed_grants, next->base.tag)) != NULL); + next = pqueue_delayed_grants_pop(rti_remote->delayed_grants); + federate_info_t* fed = GET_FED_INFO(next->fed_id); + if (next->is_provisional) { + notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } else { + notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } LF_MUTEX_UNLOCK(&rti_mutex); } else { // Waiting was interrupted, because of an update in the queue, or @@ -2385,6 +2372,7 @@ void initialize_RTI(rti_remote_t* rti) { rti_remote->max_start_time = 0LL; rti_remote->num_feds_proposed_start = 0; rti_remote->all_federates_exited = false; + rti_remote->all_persistent_federates_exited = false; rti_remote->federation_id = "Unidentified Federation"; rti_remote->user_specified_port = 0; rti_remote->final_port_TCP = 0; From 402c4afe985c0ed39e10b4bbd85b72dc0fab0229 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 18 Mar 2024 05:53:09 +0100 Subject: [PATCH 059/148] Fix the exit of the thread that connects to transients --- core/federated/RTI/rti_remote.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 79b7006b3..e68ce71e9 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1999,9 +1999,9 @@ void* lf_connect_to_transient_federates_thread(void* nothing) { // The following blocks until a federate connects.
int socket_id = -1; while (1) { - // if (!rti_remote->all_persistent_federates_exited) { - // return NULL; - // } + if (rti_remote->all_persistent_federates_exited) { + return NULL; + } socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); if (socket_id >= 0) { // Got a socket From c1249e5261aa44180bd2a7bed20e5dc674a414cf Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 18 Mar 2024 05:54:31 +0100 Subject: [PATCH 060/148] Logging of the delayed grants queue activity --- core/federated/RTI/rti_remote.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index e68ce71e9..6443e860c 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -184,6 +184,8 @@ static void notify_grant_delayed(federate_info_t* fed, tag_t tag, bool is_provis dge->fed_id = fed->enclave.id; dge->is_provisional = is_provisional; pqueue_delayed_grants_insert(rti_remote->delayed_grants, dge); + LF_PRINT_LOG("RTI: Inserting a delayed grant of " PRINTF_TAG " for federate %d.", dge->base.tag.time - start_time, + dge->base.tag.microstep, dge->fed_id); lf_cond_signal(&updated_delayed_grants); } else { // FIXME: Decide what to do in this case... 
@@ -210,6 +212,8 @@ void notify_grant_canceled(federate_info_t* fed) { pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, fed->enclave.id); if (dge != NULL) { pqueue_delayed_grants_remove(rti_remote->delayed_grants, dge); + LF_PRINT_LOG("RTI: Canceling the delayed grant of " PRINTF_TAG " for federate %d.", dge->base.tag.time - start_time, + dge->base.tag.microstep, dge->fed_id); lf_cond_signal(&updated_delayed_grants); } LF_MUTEX_UNLOCK(&rti_mutex); From 8bc1f43879ba1b12ef2c4c6548378d42a04d436a Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 18 Mar 2024 14:13:24 +0100 Subject: [PATCH 061/148] Consistent naming: use lf_tag_effective_start() instead of lf_get_effective_start_tag() --- core/federated/federate.c | 10 ++++------ core/tag.c | 2 ++ core/threaded/reactor_threaded.c | 1 + include/core/federated/federate.h | 10 ---------- tag/api/tag.h | 8 ++++++++ 5 files changed, 15 insertions(+), 16 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index b94ebeb47..aab0e1a67 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -995,8 +995,6 @@ static instant_t get_start_time_from_rti(instant_t my_physical_time) { // Note that we report in the trace the effective_start_tag. // This is rather a choice. To be changed, if needed, of course. tracepoint_federate_from_rti(receive_TIMESTAMP, _lf_my_fed_id, &effective_start_tag); - lf_print("Starting timestamp is: " PRINTF_TIME " and effectve start tag is: " PRINTF_TAG ".", timestamp, - effective_start_tag.time - start_time, effective_start_tag.microstep); LF_PRINT_LOG("Current physical time is: " PRINTF_TIME ".", lf_time_physical()); return timestamp; @@ -2597,6 +2595,10 @@ void lf_synchronize_with_other_federates(void) { // Reset the start time to the coordinated start time for all federates. // Note that this does not grant execution to this federate. 
start_time = get_start_time_from_rti(lf_time_physical()); + + lf_print("Starting timestamp is: " PRINTF_TIME " and effective start tag is: " PRINTF_TAG ".", lf_time_start(), + lf_tag_effective_start().time - lf_time_start(), lf_tag_effective_start().microstep); + lf_tracing_set_start_time(start_time); // Start a thread to listen for incoming TCP messages from the RTI. @@ -2694,8 +2696,4 @@ char* lf_get_federates_bin_directory() { return LF_FEDERATES_BIN_DIRECTORY; } char* lf_get_federation_id() { return federation_metadata.federation_id; } -instant_t lf_get_effective_start_time() { return effective_start_tag.time; } - -instant_t lf_get_start_time() { return start_time; } - #endif diff --git a/core/tag.c b/core/tag.c index 695bac8d7..e6b139cf8 100644 --- a/core/tag.c +++ b/core/tag.c @@ -167,6 +167,8 @@ instant_t lf_time_physical_elapsed(void) { return lf_time_physical() - start_tim instant_t lf_time_start(void) { return start_time; } +tag_t lf_tag_start_effective(void) { return effective_start_tag; } + size_t lf_readable_time(char* buffer, instant_t time) { if (time <= (instant_t)0) { snprintf(buffer, 2, "0"); diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index 5c66b4772..6d0759df9 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -1022,6 +1022,7 @@ int lf_reactor_c_main(int argc, const char* argv[]) { // Initialize the clock through the platform API. No reading of physical time before this. 
_lf_initialize_clock(); start_time = lf_time_physical(); + effective_start_tag = (tag_t){.time = start_time, .microstep = 0}; #ifndef FEDERATED lf_tracing_set_start_time(start_time); #endif diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index 5c5d9683e..5e2dcd45a 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -568,14 +568,4 @@ char* lf_get_federates_bin_directory(); */ char* lf_get_federation_id(); -/** - * @brief Returns the effective start time of the federate. The start_time of persistent - * federates is equal to their effective_start_time. Transient federates, however, - * have their effective_start_time higher or equal to their start_time. - */ -instant_t lf_get_effective_start_time(); - -/** @brief Returns the start time of the federate. */ -instant_t lf_get_start_time(); - #endif // FEDERATE_H diff --git a/tag/api/tag.h b/tag/api/tag.h index 97c1aa0d7..2784e1c84 100644 --- a/tag/api/tag.h +++ b/tag/api/tag.h @@ -210,6 +210,14 @@ instant_t lf_time_physical_elapsed(void); */ instant_t lf_time_start(void); +/** + * Return the tag at which the execution effectively started. + * Most of the time, this will default to {.time = start_time, .microstep: 0}. + * When the reactor is a transient federate, however, the value will be different. + * @return A tag. + */ +tag_t lf_tag_start_effective(void); + /** * For user-friendly reporting of time values, the buffer length required. 
* This is calculated as follows, based on 64-bit time in nanoseconds: From 34e820e39a99844e712e4c5a70ea09608af08424 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 18 Mar 2024 22:03:21 +0100 Subject: [PATCH 062/148] Fix log message --- core/federated/federate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index aab0e1a67..b4a3ae83c 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2597,7 +2597,7 @@ void lf_synchronize_with_other_federates(void) { start_time = get_start_time_from_rti(lf_time_physical()); lf_print("Starting timestamp is: " PRINTF_TIME " and effective start tag is: " PRINTF_TAG ".", lf_time_start(), - lf_tag_effective_start().time - lf_time_start(), lf_tag_effective_start().microstep); + effective_start_tag.time - lf_time_start(), effective_start_tag.microstep); lf_tracing_set_start_time(start_time); From 548adf0f94ad813053f8b6ae9460855c28581e6b Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 18 Mar 2024 22:04:17 +0100 Subject: [PATCH 063/148] Attempt to solve another segmentation fault --- core/federated/RTI/rti_remote.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 6443e860c..df67f20d9 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -142,10 +142,9 @@ void pqueue_delayed_grants_remove(pqueue_delayed_grants_t* q, pqueue_delayed_gra */ pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_delayed_grants_t* q, uint16_t fed_id) { pqueue_delayed_grant_element_t* dge; - pqueue_t* _q = (pqueue_t*)q; if (!q || q->size == 1) return NULL; - for (int i = 1; i <= q->size; i++) { + for (int i = 1; i < q->size; i++) { dge = (pqueue_delayed_grant_element_t*)q->d[i]; if (dge) { if (dge->fed_id == fed_id) { @@ -2011,8 +2010,8 @@ void* lf_connect_to_transient_federates_thread(void* nothing) { // 
Got a socket break; } else { - // Try again - lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); + // Continue trying + lf_print("RTI failed to accept the socket. %s. Continue trying.", strerror(errno)); continue; } } From 7d75f204580ffaf319e01830c8602163858cf40f Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 18 Mar 2024 23:52:48 +0100 Subject: [PATCH 064/148] Fix implicit declaration error by moving lf_stop(), lf_get_federates_bin_directory(), and lf_get_federaion_id() declarations to util.h --- include/core/federated/federate.h | 23 ----------------------- include/core/utils/util.h | 23 +++++++++++++++++++++++ 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index 5e2dcd45a..604a36637 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -545,27 +545,4 @@ bool lf_update_max_level(tag_t tag, bool is_provisional); instant_t lf_wait_until_time(tag_t tag); #endif // FEDERATED_DECENTRALIZED -/** - * @brief Stop the execution of a federate. - * Every enclave within the federate will stop at one microstep later than its - * current tag. Unlike lf_request_stop(), this process does not require any - * involvement from the RTI, nor does it necessitate any consensus. - * - * This function is particularly useful for testing transient federates. - */ -void lf_stop(); - -/** - * @brief Return the directory containing the executables of the individual - * federates. - */ -char* lf_get_federates_bin_directory(); - -/** - * @brief Returns the federation id. - * - * This function is useful for creating federates on runtime. 
- */ -char* lf_get_federation_id(); - #endif // FEDERATE_H diff --git a/include/core/utils/util.h b/include/core/utils/util.h index 77b7b767d..327644f2e 100644 --- a/include/core/utils/util.h +++ b/include/core/utils/util.h @@ -194,4 +194,27 @@ void lf_vprint_error_and_exit(const char* format, va_list args) ATTRIBUTE_FORMAT */ #define LF_CRITICAL_SECTION_EXIT(env) LF_ASSERT(!lf_critical_section_exit(env), "Could not exit critical section") +/** + * @brief Stop the execution of a federate. + * Every enclave within the federate will stop at one microstep later than its + * current tag. Unlike lf_request_stop(), this process does not require any + * involvement from the RTI, nor does it necessitate any consensus. + * + * This function is particularly useful for testing transient federates. + */ +void lf_stop(); + +/** + * @brief Return the directory containing the executables of the individual + * federates. + */ +char* lf_get_federates_bin_directory(); + +/** + * @brief Returns the federation id. + * + * This function is useful for creating federates on runtime. + */ +char* lf_get_federation_id(); + #endif /* UTIL_H */ From 9487891488321969f2f1b91f4df84a146e4c27c8 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Fri, 12 Apr 2024 17:52:31 +0100 Subject: [PATCH 065/148] Fix merge --- util/tracing/trace_util.c | 1 - 1 file changed, 1 deletion(-) diff --git a/util/tracing/trace_util.c b/util/tracing/trace_util.c index ed32c5baa..400c4c9ca 100644 --- a/util/tracing/trace_util.c +++ b/util/tracing/trace_util.c @@ -61,7 +61,6 @@ typedef struct open_file_t { open_file_t* next; } open_file_t; open_file_t* _open_files = NULL; - /** * Function to be invoked upon exiting. 
*/ From dbf6bf74095432b3a92d8697a543fc9bf1195e78 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 15 Apr 2024 10:18:39 +0100 Subject: [PATCH 066/148] Fix warnings (turned as errors) --- core/federated/federate.c | 14 +++++++++----- include/core/utils/util.h | 2 +- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index b4a3ae83c..4db195b2e 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -987,8 +987,6 @@ static instant_t get_start_time_from_rti(instant_t my_physical_time) { } instant_t timestamp = extract_int64(&(buffer[1])); - - tag_t tag = {.time = timestamp, .microstep = 0}; effective_start_tag = extract_tag(&(buffer[9])); // Trace the event when tracing is enabled. @@ -2106,7 +2104,7 @@ void* lf_handle_p2p_connections_from_federates(void* env_arg) { // Extract the ID of the sending federate. uint16_t remote_fed_id = extract_uint16((unsigned char*)&(buffer[1])); - bool remote_fed_is_transient = buffer[1 + sizeof(uint16_t)]; + // bool remote_fed_is_transient = buffer[1 + sizeof(uint16_t)]; LF_PRINT_DEBUG("Received sending federate ID %d.", remote_fed_id); // Trace the event when tracing is enabled @@ -2692,8 +2690,14 @@ void lf_stop() { LF_PRINT_LOG("Federate is stopping."); } -char* lf_get_federates_bin_directory() { return LF_FEDERATES_BIN_DIRECTORY; } +char* lf_get_federates_bin_directory() { +#ifdef LF_FEDERATES_BIN_DIRECTORY + return LF_FEDERATES_BIN_DIRECTORY; +#else + return NULL; +#endif +} -char* lf_get_federation_id() { return federation_metadata.federation_id; } +const char* lf_get_federation_id() { return federation_metadata.federation_id; } #endif diff --git a/include/core/utils/util.h b/include/core/utils/util.h index 327644f2e..25c6a8d72 100644 --- a/include/core/utils/util.h +++ b/include/core/utils/util.h @@ -215,6 +215,6 @@ char* lf_get_federates_bin_directory(); * * This function is useful for creating federates on runtime. 
*/ -char* lf_get_federation_id(); +const char* lf_get_federation_id(); #endif /* UTIL_H */ From 01753240cf8c9ef51bb3a70df26a738a4cfbe407 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 15 Apr 2024 10:46:29 +0100 Subject: [PATCH 067/148] Fix variable name --- core/federated/RTI/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/federated/RTI/main.c b/core/federated/RTI/main.c index dc4cbeb53..63d055db8 100644 --- a/core/federated/RTI/main.c +++ b/core/federated/RTI/main.c @@ -242,7 +242,7 @@ int process_args(int argc, const char* argv[]) { } i++; long num_transient_federates = strtol(argv[i], NULL, 10); - if (num_transient_federates == LONG_MAX || num_transient_federates == LONG_MIN) { + if (num_transient_federates < 0L || num_transient_federates == LONG_MAX || num_transient_federates == LONG_MIN) { lf_print_error("--number_of_transient_federates needs a valid positive or null integer argument."); usage(argc, argv); return 0; From 69d8ef7aa85281bc373a1dd4f4589aa94775f61e Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 24 Apr 2024 12:39:19 +0100 Subject: [PATCH 068/148] Fix the documentation of MSG_TYPE_FED_IDS to describe the federate type byte --- include/core/federated/network/net_common.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index 21e54c9cb..320ad039b 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -242,6 +242,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * this order: * * One byte equal to MSG_TYPE_FED_IDS. * * Two bytes (ushort) giving the federate ID. + * * One byte giving the type of the federate (1 if transient, 0 if persistent) * * One byte (uchar) giving the length N of the federation ID. * * N bytes containing the federation ID. 
* Each federate needs to have a unique ID between 0 and From 16cf1202a50dd89f62d661152091710bcc16f5fb Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 24 Apr 2024 13:14:23 +0100 Subject: [PATCH 069/148] Augment the documentation of receive_connection_information() to describe interface preservation checking in case of a hot swap --- core/federated/RTI/rti_remote.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index df67f20d9..d92fc7900 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1601,6 +1601,11 @@ static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_ /** * Listen for a MSG_TYPE_NEIGHBOR_STRUCTURE message, and upon receiving it, fill * out the relevant information in the federate's struct. + * + * In case of a hot swap, check that no changes were made to the connections, compared + * to the first instance that joigned. This means that the first instance to join + * __is__ the reference. + * * @return 1 on success and 0 on failure. 
*/ static int receive_connection_information(int* socket_id, uint16_t fed_id) { From 8d2d6a5293605223a14249148dcf6d4f75a21c2a Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 24 Apr 2024 13:16:00 +0100 Subject: [PATCH 070/148] Invalidate upstream delays when a transient leaves the federation --- core/federated/RTI/rti_remote.c | 1 + 1 file changed, 1 insertion(+) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index d92fc7900..52f873e09 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -2216,6 +2216,7 @@ void reset_transient_federate(federate_info_t* fed) { fed->server_ip_addr.s_addr = 0; fed->server_port = -1; fed->requested_stop = false; + invalidate_min_delays_upstream(fed->enclave); } int32_t start_rti_server(uint16_t port) { From 0ba3d61602c62d8d8f048d73ef4e6c671590be57 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 25 Apr 2024 10:06:35 +0100 Subject: [PATCH 071/148] Fix the invalidation --- core/federated/RTI/rti_remote.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 52f873e09..d73e0cee5 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -2216,7 +2216,7 @@ void reset_transient_federate(federate_info_t* fed) { fed->server_ip_addr.s_addr = 0; fed->server_port = -1; fed->requested_stop = false; - invalidate_min_delays_upstream(fed->enclave); + invalidate_min_delays_upstream(&(fed->enclave)); } int32_t start_rti_server(uint16_t port) { From 033e99bcde63b71e0abd4b0a37d400204e19b9a9 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 8 May 2024 17:56:55 +0100 Subject: [PATCH 072/148] Upadte the computatio of the effective start tag to account for upstream messages --- core/federated/RTI/rti_remote.c | 40 ++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/core/federated/RTI/rti_remote.c 
b/core/federated/RTI/rti_remote.c index d73e0cee5..74bf78070 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1006,40 +1006,54 @@ void handle_timestamp(federate_info_t* my_fed) { //// Algorithm for computing the effective_start_time of a joining transient // The effective_start_time will be the max among all the following tags: - // - At tag: (joining time, 0 microstep) - // - The latest completed logical tag + 1 microstep - // - The latest granted tag + 1 microstep, of every downstream federate - // - The latest provisionnaly granted tag + 1 microstep, of every downstream federate + // 1. At tag: (joining time, 0 microstep) + // 2. The latest completed logical tag + 1 microstep + // 3. The latest granted (P)TAG + 1 microstep, of every downstream federate + // 4. The maximun tag of messages from the upstream federates + 1 microstep + // Condition 1. my_fed->effective_start_tag = (tag_t){.time = timestamp, .microstep = 0u}; - if (lf_tag_compare(my_fed->enclave.completed, my_fed->effective_start_tag) > 0) { + // Condition 2. + // FIXME: Not sure if this corner case can happen, but better to be on the safe side. + if (lf_tag_compare(my_fed->enclave.completed, my_fed->effective_start_tag) >= 0) { my_fed->effective_start_tag = my_fed->enclave.completed; my_fed->effective_start_tag.microstep++; } - // Iterate over the downstream federates + // Condition 3. Iterate over the downstream federates for (int j = 0; j < my_fed->enclave.num_downstream; j++) { federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); - // Ignore this federate if it has resigned. 
- if (downstream->enclave.state == NOT_CONNECTED) { - continue; - } - // Get the max over the TAG of the downstreams - if (lf_tag_compare(downstream->enclave.last_granted, my_fed->effective_start_tag) > 0) { + if (lf_tag_compare(downstream->enclave.last_granted, my_fed->effective_start_tag) >= 0) { my_fed->effective_start_tag = downstream->enclave.last_granted; my_fed->effective_start_tag.microstep++; } // Get the max over the PTAG of the downstreams - if (lf_tag_compare(downstream->enclave.last_provisionally_granted, my_fed->effective_start_tag) > 0) { + if (lf_tag_compare(downstream->enclave.last_provisionally_granted, my_fed->effective_start_tag) >= 0) { my_fed->effective_start_tag = downstream->enclave.last_provisionally_granted; my_fed->effective_start_tag.microstep++; } } + // Condition 4. Iterate over the messages from the upstream federates + for (int j = 0; j < my_fed->enclave.num_upstream; j++) { + federate_info_t* upstream = GET_FED_INFO(my_fed->enclave.upstream[j]); + + // Get the max over the TAG of the upstreams + size_t queue_size = pqueue_tag_size(upstream->in_transit_message_tags); + pqueue_t* pq = (pqueue_t*)(upstream->in_transit_message_tags); + pqueue_tag_element_t* message_with_max_tag = (pqueue_tag_element_t*)(pq->d[queue_size]); + tag_t max_tag = message_with_max_tag->tag; + + if (lf_tag_compare(max_tag, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = max_tag; + my_fed->effective_start_tag.microstep++; + } + } + // For every downstream that has a pending grant that is higher then the // effective_start_time of the federate, cancel it for (int j = 0; j < my_fed->enclave.num_downstream; j++) { From 4cbe5b81e5a3e39fb65db0aa34a880a6cf7f63b0 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 19 Jun 2024 22:13:21 +0100 Subject: [PATCH 073/148] Get the max of the TAG of the upstreams only if the queue is not empty --- core/federated/RTI/rti_remote.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff 
--git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 74bf78070..f99a64ac4 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1044,13 +1044,15 @@ void handle_timestamp(federate_info_t* my_fed) { // Get the max over the TAG of the upstreams size_t queue_size = pqueue_tag_size(upstream->in_transit_message_tags); - pqueue_t* pq = (pqueue_t*)(upstream->in_transit_message_tags); - pqueue_tag_element_t* message_with_max_tag = (pqueue_tag_element_t*)(pq->d[queue_size]); - tag_t max_tag = message_with_max_tag->tag; - - if (lf_tag_compare(max_tag, my_fed->effective_start_tag) >= 0) { - my_fed->effective_start_tag = max_tag; - my_fed->effective_start_tag.microstep++; + if (queue_size != 0) { + pqueue_t* pq = (pqueue_t*)(upstream->in_transit_message_tags); + pqueue_tag_element_t* message_with_max_tag = (pqueue_tag_element_t*)(pq->d[queue_size]); + tag_t max_tag = message_with_max_tag->tag; + + if (lf_tag_compare(max_tag, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = max_tag; + my_fed->effective_start_tag.microstep++; + } } } @@ -2008,7 +2010,6 @@ void send_stop(federate_info_t* fed) { } void* lf_connect_to_transient_federates_thread(void* nothing) { - initialize_lf_thread_id(); // This loop will continue to accept connections of transient federates, as // soon as there is room, or enable hot swap From 1a7fba65263459e642c0feacc818d8bc655c63c8 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 19 Jun 2024 23:53:20 +0100 Subject: [PATCH 074/148] Fix return value of lf_get_federates_bin_directory() --- core/federated/federate.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 4db195b2e..350e7a6cd 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2693,9 +2693,8 @@ void lf_stop() { char* lf_get_federates_bin_directory() { #ifdef LF_FEDERATES_BIN_DIRECTORY return LF_FEDERATES_BIN_DIRECTORY; 
-#else - return NULL; #endif + return NULL; } const char* lf_get_federation_id() { return federation_metadata.federation_id; } From 7d85d0f55e63193d41a625ec656ef3f812d323a8 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 20 Jun 2024 00:43:01 +0100 Subject: [PATCH 075/148] Another attempt to fix return value of lf_get_federates_bin_directory() --- core/federated/federate.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 350e7a6cd..a3aedb04c 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2691,10 +2691,14 @@ void lf_stop() { } char* lf_get_federates_bin_directory() { + bool bin_directory_defined = false; #ifdef LF_FEDERATES_BIN_DIRECTORY - return LF_FEDERATES_BIN_DIRECTORY; + bin_directory_defined = true; #endif - return NULL; + if (bin_directory_defined) { + return LF_FEDERATES_BIN_DIRECTORY; + } + return {NULL}; } const char* lf_get_federation_id() { return federation_metadata.federation_id; } From c6e43ee0bc416d83e8f29a062b975f75516351cc Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 20 Jun 2024 01:11:31 +0100 Subject: [PATCH 076/148] Another attempt to fix return value of lf_get_federates_bin_directory() --- core/federated/federate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index a3aedb04c..096dfd472 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2696,9 +2696,9 @@ char* lf_get_federates_bin_directory() { bin_directory_defined = true; #endif if (bin_directory_defined) { - return LF_FEDERATES_BIN_DIRECTORY; + return (LF_FEDERATES_BIN_DIRECTORY); } - return {NULL}; + return NULL; } const char* lf_get_federation_id() { return federation_metadata.federation_id; } From 5949b75f2126cf872729d900ae36a71e1472cb06 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 1 Feb 2024 10:40:40 +0100 Subject: [PATCH 077/148] Start on adding 
federate type (transient or not) --- core/federated/RTI/rti_remote.c | 2 +- core/federated/federate.c | 17 ----------------- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index f99a64ac4..403c1b3f8 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1451,7 +1451,7 @@ static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_ // First byte received is the message type. if (buffer[0] != MSG_TYPE_FED_IDS) { if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); } if (buffer[0] == MSG_TYPE_P2P_SENDING_FED_ID || buffer[0] == MSG_TYPE_P2P_TAGGED_MESSAGE) { // The federate is trying to connect to a peer, not to the RTI. diff --git a/core/federated/federate.c b/core/federated/federate.c index 096dfd472..cb6f0aa64 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -76,23 +76,6 @@ int max_level_allowed_to_advance; * The state of this federate instance. Each executable has exactly one federate instance, * and the _fed global variable refers to that instance. 
*/ -<<<<<<< HEAD -federate_instance_t _fed = {.socket_TCP_RTI = -1, - .number_of_inbound_p2p_connections = 0, - .inbound_socket_listeners = NULL, - .number_of_outbound_p2p_connections = 0, - .inbound_p2p_handling_thread_id = 0, - .server_socket = -1, - .server_port = -1, - .last_TAG = {.time = NEVER, .microstep = 0u}, - .is_last_TAG_provisional = false, - .has_upstream = false, - .has_downstream = false, - .received_stop_request_from_rti = false, - .last_sent_LTC = {.time = NEVER, .microstep = 0u}, - .last_sent_NET = {.time = NEVER, .microstep = 0u}, - .min_delay_from_physical_action_to_federate_output = NEVER}; -======= federate_instance_t _fed = {.socket_TCP_RTI = -1, .number_of_inbound_p2p_connections = 0, .inbound_socket_listeners = NULL, From 9d92decf884267d1f0a32e78174b42528931b61c Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 1 Feb 2024 11:04:17 +0100 Subject: [PATCH 078/148] Add cmdline argument -nt to RTI and add nbr_transient_federates attribute --- core/federated/RTI/main.c | 13 ++++++++----- core/federated/RTI/rti_remote.c | 16 ---------------- 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/core/federated/RTI/main.c b/core/federated/RTI/main.c index 63d055db8..005b784ae 100644 --- a/core/federated/RTI/main.c +++ b/core/federated/RTI/main.c @@ -122,7 +122,7 @@ void usage(int argc, const char* argv[]) { lf_print(" -n, --number_of_federates "); lf_print(" The number of federates in the federation that this RTI will control.\n"); lf_print(" -nt, --number_of_transient_federates "); - lf_print(" The number of federates that are transient; this must be strictly less than the number of federates.\n"); + lf_print(" The number of transient federates in the federation that this RTI will control.\n"); lf_print(" -p, --port "); lf_print(" The port number to use for the RTI. Must be larger than 0 and smaller than %d. 
Default is %d.\n", UINT16_MAX, DEFAULT_PORT); @@ -242,7 +242,7 @@ int process_args(int argc, const char* argv[]) { } i++; long num_transient_federates = strtol(argv[i], NULL, 10); - if (num_transient_federates < 0L || num_transient_federates == LONG_MAX || num_transient_federates == LONG_MIN) { + if (num_transient_federates == LONG_MAX || num_transient_federates == LONG_MIN) { lf_print_error("--number_of_transient_federates needs a valid positive or null integer argument."); usage(argc, argv); return 0; @@ -294,8 +294,8 @@ int process_args(int argc, const char* argv[]) { usage(argc, argv); return 0; } - if (rti.number_of_transient_federates >= rti.base.number_of_scheduling_nodes) { - lf_print_error("--number_of_transient_federates cannot be higher or equal to the number of federates."); + if (rti.number_of_transient_federates > rti.base.number_of_scheduling_nodes) { + lf_print_error("--number_of_transient_federates cannot be higher than the number of federates."); usage(argc, argv); return 0; } @@ -339,7 +339,10 @@ int main(int argc, const char* argv[]) { } lf_print("Starting RTI for a total of %d federates, with %d being transient, in federation ID %s", - rti.base.number_of_scheduling_nodes, rti.number_of_transient_federates, rti.federation_id); + rti.base.number_of_scheduling_nodes, rti.number_of_transient_federates, + + rti.federation_id); + assert(rti.base.number_of_scheduling_nodes < UINT16_MAX); // Allocate memory for the federates diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 403c1b3f8..db85072e7 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -2215,23 +2215,7 @@ void initialize_federate(federate_info_t* fed, uint16_t id) { strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); fed->server_ip_addr.s_addr = 0; fed->server_port = -1; - fed->has_upstream_transient_federates = false; fed->is_transient = true; - fed->effective_start_tag = NEVER_TAG; -} - -void 
reset_transient_federate(federate_info_t* fed) { - // Reset of the federate-related attributes - fed->socket = -1; // No socket. - fed->clock_synchronization_enabled = true; - // FIXME: The following two lines can be improved? - pqueue_tag_free(fed->in_transit_message_tags); - fed->in_transit_message_tags = pqueue_tag_init(10); - strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); - fed->server_ip_addr.s_addr = 0; - fed->server_port = -1; - fed->requested_stop = false; - invalidate_min_delays_upstream(&(fed->enclave)); } int32_t start_rti_server(uint16_t port) { From 29ca14bbed9a127e3d61f692aace7a432533bcff Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 1 Feb 2024 13:37:09 +0100 Subject: [PATCH 079/148] Differentiate between connecting to persistent federates and transient federates --- core/federated/RTI/rti_remote.c | 1005 +++++++++++++++++-------------- core/federated/RTI/rti_remote.h | 2 +- 2 files changed, 545 insertions(+), 462 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index db85072e7..b713bd6a0 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1906,519 +1906,602 @@ static bool authenticate_federate(int* socket) { } #endif -// FIXME: The socket descriptor here (parameter) is not used. Should be removed? void lf_connect_to_persistent_federates(int socket_descriptor) { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates; i++) { - // Wait for an incoming connection request. - struct sockaddr client_fd; - uint32_t client_length = sizeof(client_fd); - // The following blocks until a federate connects. 
- int socket_id = -1; - while (1) { - socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); - if (socket_id >= 0) { - // Got a socket - break; - } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { - lf_print_error_system_failure("RTI failed to accept the socket."); - } else { - // Try again - lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); - continue; - } - } + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes - rti_remote.number_of_transient_federates; i++) { + // FIXME: The socket descriptor here (parameter) is not used. Should be removed? + void lf_connect_to_persistent_federates(int socket_descriptor) { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates; + i++) { + // Wait for an incoming connection request. + struct sockaddr client_fd; + uint32_t client_length = sizeof(client_fd); + // The following blocks until a federate connects. + int socket_id = -1; + while (1) { + socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); + if (socket_id >= 0) { + // Got a socket + break; + } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { + lf_print_error_system_failure("RTI failed to accept the socket."); + } else { + // Try again + lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); + continue; + } + } // Wait for the first message from the federate when RTI -a option is on. #ifdef __RTI_AUTH__ - if (rti_remote->authentication_enabled) { - if (!authenticate_federate(&socket_id)) { - lf_print_warning("RTI failed to authenticate the incoming federate."); - // Close the socket. - shutdown(socket_id, SHUT_RDWR); - close(socket_id); - socket_id = -1; - // Ignore the federate that failed authentication. 
- i--; - continue; - } - } + if (rti_remote->authentication_enabled) { + if (!authenticate_federate(&socket_id)) { + lf_print_warning("RTI failed to authenticate the incoming federate."); + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); + socket_id = -1; + // Ignore the federate that failed authentication. + i--; + continue; + } + } #endif - // The first message from the federate should contain its ID and the federation ID. - int32_t fed_id = receive_and_check_fed_id_message(&socket_id); - if (fed_id >= 0 && socket_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && - receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { - - // Create a thread to communicate with the federate. - // This has to be done after clock synchronization is finished - // or that thread may end up attempting to handle incoming clock - // synchronization messages. - federate_info_t* fed = GET_FED_INFO(fed_id); - lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); - - // If the federate is transient, then do not count it. - if (fed->is_transient) { - rti_remote->number_of_connected_transient_federates++; - assert(rti_remote->number_of_connected_transient_federates <= rti_remote->number_of_transient_federates); - i--; - lf_print("RTI: Transient federate %d joined.", fed->enclave.id); - } - } else { - // Received message was rejected. Try again. - i--; - } - } - // All federates have connected. - LF_PRINT_DEBUG("All persistent federates have connected to RTI."); - - if (rti_remote->clock_sync_global_status >= clock_sync_on) { - // Create the thread that performs periodic PTP clock synchronization sessions - // over the UDP channel, but only if the UDP channel is open and at least one - // federate is performing runtime clock synchronization. 
- bool clock_sync_enabled = false; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed_info = GET_FED_INFO(i); - if (fed_info->clock_synchronization_enabled) { - clock_sync_enabled = true; - break; + // The first message from the federate should contain its ID and the federation ID. + int32_t fed_id = receive_and_check_fed_id_message(&socket_id); + if (fed_id >= 0 && socket_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && + receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { + + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. + federate_info_t* fed = GET_FED_INFO(fed_id); + lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + + // If the federate is transient, then do not count it. + if (fed->is_transient) { + rti_remote.number_of_connected_transient_federates++; + assert(rti_remote.number_of_connected_transient_federates <= rti_remote.number_of_transient_federates); + i--; + lf_print("RTI: Transient federate %d joined.", fed->base.id); + } + } else { + // Received message was rejected. Try again. + i--; + } } - } - if (rti_remote->final_port_UDP != UINT16_MAX && clock_sync_enabled) { - lf_thread_create(&rti_remote->clock_thread, clock_synchronization_thread, NULL); - } - } -} + // All federates have connected. 
+ LF_PRINT_DEBUG("All persistent federates have connected to RTI."); -/** - * @brief A request for immediate stop to the federate - * - * @param fed: the deferate to stop - */ -void send_stop(federate_info_t* fed) { - // Reply with a stop granted to all federates - unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; - outgoing_buffer[0] = MSG_TYPE_STOP; - lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); - - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_STOP, fed->enclave.id, NULL); - } - write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, - "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); - - LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); -} - -void* lf_connect_to_transient_federates_thread(void* nothing) { - // This loop will continue to accept connections of transient federates, as - // soon as there is room, or enable hot swap - - while (!rti_remote->all_persistent_federates_exited) { - // Continue waiting for an incoming connection requests from transients - // to join, or for hot swap. - // Wait for an incoming connection request. - struct sockaddr client_fd; - uint32_t client_length = sizeof(client_fd); - // The following blocks until a federate connects. - int socket_id = -1; - while (1) { - if (rti_remote->all_persistent_federates_exited) { - return NULL; - } - socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); - if (socket_id >= 0) { - // Got a socket - break; - } else { - // Continue trying - lf_print("RTI failed to accept the socket. %s. Continue trying.", strerror(errno)); - continue; + if (rti_remote->clock_sync_global_status >= clock_sync_on) { + // Create the thread that performs periodic PTP clock synchronization sessions + // over the UDP channel, but only if the UDP channel is open and at least one + // federate is performing runtime clock synchronization. 
+ bool clock_sync_enabled = false; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed_info = GET_FED_INFO(i); + if (fed_info->clock_synchronization_enabled) { + clock_sync_enabled = true; + break; + } + } + if (rti_remote->final_port_UDP != UINT16_MAX && clock_sync_enabled) { + lf_thread_create(&rti_remote->clock_thread, clock_synchronization_thread, NULL); + } } } + void* lf_connect_to_transient_federates_thread(int socket_descriptor) { + // This loop will continue to accept connections of transient federates, as + // soon as there is room, or enable hot swap + + while (!rti_remote.all_persistent_federates_exited) { + // Continue waiting for an incoming connection requests from transients + // to join, or for hot swap. + // Wait for an incoming connection request. + struct sockaddr client_fd; + uint32_t client_length = sizeof(client_fd); + // The following blocks until a federate connects. + int socket_id = -1; + while (1) { + if (!rti_remote.all_persistent_federates_exited) { + return NULL; + } + socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); + if (socket_id >= 0) { + // Got a socket + break; + } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { + lf_print_error_system_failure("RTI failed to accept the socket."); + } else { + // Try again + lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); + continue; + } + } + // Wait for the first message from the federate when RTI -a option is on. #ifdef __RTI_AUTH__ - if (rti_remote->authentication_enabled) { - if (!authenticate_federate(&socket_id)) { - lf_print_warning("RTI failed to authenticate the incoming federate."); - // Close the socket. - shutdown(socket_id, SHUT_RDWR); - close(socket_id); - socket_id = -1; - // Ignore the federate that failed authentication. 
- continue; - } - } + if (rti_remote->authentication_enabled) { + if (!authenticate_federate(&socket_id)) { + lf_print_warning("RTI failed to authenticate the incoming federate."); + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); + socket_id = -1; + // Ignore the federate that failed authentication. + i--; + continue; + } + } #endif - // The first message from the federate should contain its ID and the federation ID. - // The function also detects if a hot swap request is initiated. - int32_t fed_id = receive_and_check_fed_id_message(&socket_id, (struct sockaddr_in*)&client_fd); + // The first message from the federate should contain its ID and the federation ID. + // The function also detects if a hot swap request is initiated. + int32_t fed_id = receive_and_check_fed_id_message(&socket_id, (struct sockaddr_in*)&client_fd); + if (fed_id >= 0 && socket_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && + receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { + } - if (fed_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && - receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { - LF_MUTEX_LOCK(&rti_mutex); - if (hot_swap_in_progress) { - lf_print("RTI: Hot swap confirmed for federate %d.", fed_id); - - // Then send STOP - federate_info_t* fed_old = GET_FED_INFO(fed_id); - LF_PRINT_LOG("RTI: Send MSG_TYPE_STOP to old federate %d.", fed_id); - send_stop(fed_old); - LF_MUTEX_UNLOCK(&rti_mutex); - - // Wait for the old federate to send MSG_TYPE_RESIGN - LF_PRINT_LOG("RTI: Waiting for old federate %d to send resign.", fed_id); - // FIXME: Should this have a timeout? - while (!hot_swap_old_resigned) - ; - - // The latest LTC is the tag at which the old federate resigned. This is useful - // for computing the effective_start_time of the new joining federate. 
- hot_swap_federate->enclave.completed = fed_old->enclave.completed; - - // Create a thread to communicate with the federate. - // This has to be done after clock synchronization is finished - // or that thread may end up attempting to handle incoming clock - // synchronization messages. - lf_thread_create(&(hot_swap_federate->thread_id), federate_info_thread_TCP, hot_swap_federate); - - LF_MUTEX_LOCK(&rti_mutex); - // Redirect the federate in rti_remote - rti_remote->base.scheduling_nodes[fed_id] = (scheduling_node_t*)hot_swap_federate; - - // Free the old federate memory and reset the Hot wap indicators - pqueue_tag_free(fed_old->in_transit_message_tags); - free(fed_old); - hot_swap_federate = NULL; - hot_swap_in_progress = false; - hot_swap_old_resigned = false; - LF_MUTEX_UNLOCK(&rti_mutex); - lf_print("RTI: Hot swap succeeded for federate %d.", fed_id); - } else { - lf_mutex_unlock(&rti_mutex); - - // Create a thread to communicate with the federate. - // This has to be done after clock synchronization is finished - // or that thread may end up attempting to handle incoming clock - // synchronization messages. - federate_info_t* fed = GET_FED_INFO(fed_id); - lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); - lf_print("RTI: Transient federate %d joined.", fed_id); + // // Create a thread to communicate with the federate. + // // This has to be done after clock synchronization is finished + // // or that thread may end up attempting to handle incoming clock + // // synchronization messages. + // federate_info_t *fed = GET_FED_INFO(fed_id); + // lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + + // // If the federate is transient, then do not count it. 
+ // if (fed->is_transient) { + // rti_remote.number_of_connected_transient_federates++; + // assert(rti_remote.number_of_connected_transient_federates <= + // rti_remote.number_of_transient_federates); i--; lf_print("RTI: Transient federate %d joined.", + // fed->base.id); + // } + // } else { + // // Received message was rejected. Try again. + // i--; + // } + + // FIXME: Check again if runtime clock synchronization should be lauched, + // only if the number of persistent threads is zero. This should be done + // only once, not at every transient connection. } - rti_remote->number_of_connected_transient_federates++; - } else { - // If a hot swap was initialed, but the connection information or/and clock - // synchronization fail, then reset hot_swap_in_profress, and free the memory - // allocated for hot_swap_federate - if (hot_swap_in_progress) { - lf_print("RTI: Hot swap canceled for federate %d.", fed_id); - lf_mutex_lock(&rti_mutex); - hot_swap_in_progress = false; - lf_mutex_unlock(&rti_mutex); + } + + /** + * @brief A request for immediate stop to the federate + * + * @param fed: the deferate to stop + */ + void send_stop(federate_info_t * fed) { + // Reply with a stop granted to all federates + unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; + outgoing_buffer[0] = MSG_TYPE_STOP; + lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); - // FIXME: Is this enough to free the memory of a federate_info_t data structure? - free(hot_swap_federate); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_STOP, fed->enclave.id, NULL); } + write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, + "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); + + LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); } - } - return NULL; -} -/** - * This thread is responsible for managing the priority queue of delayed grants to be issued. 
- * It waits until the current time matches the highest priority tag time in the queue. - * If reached, it notifies the grant immediately. If, however, the current time has not yet - * reached the highest priority tag and the queue has been updated (either by inserting or - * canceling an entry), the thread stops waiting and restarts the process again. - */ -void* lf_delayed_grants_thread(void* nothing) { - initialize_lf_thread_id(); + void* lf_connect_to_transient_federates_thread(void* nothing) { + // This loop will continue to accept connections of transient federates, as + // soon as there is room, or enable hot swap + + while (!rti_remote->all_persistent_federates_exited) { + // Continue waiting for an incoming connection requests from transients + // to join, or for hot swap. + // Wait for an incoming connection request. + struct sockaddr client_fd; + uint32_t client_length = sizeof(client_fd); + // The following blocks until a federate connects. + int socket_id = -1; + while (1) { + if (rti_remote->all_persistent_federates_exited) { + return NULL; + } + socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); + if (socket_id >= 0) { + // Got a socket + break; + } else { + // Continue trying + lf_print("RTI failed to accept the socket. %s. Continue trying.", strerror(errno)); + continue; + } + } - // Wait for the first condition signal - lf_cond_wait(&updated_delayed_grants); +// Wait for the first message from the federate when RTI -a option is on. +#ifdef __RTI_AUTH__ + if (rti_remote->authentication_enabled) { + if (!authenticate_federate(&socket_id)) { + lf_print_warning("RTI failed to authenticate the incoming federate."); + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); + socket_id = -1; + // Ignore the federate that failed authentication. 
+ continue; + } + } +#endif - while (true) { - if (rti_remote->all_federates_exited) { - break; - } - if (pqueue_delayed_grants_size(rti_remote->delayed_grants) != 0) { - pqueue_delayed_grant_element_t* next; - - // Do not pop, but rather read - next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); - instant_t next_time = next->base.tag.time; - // Wait for expiration, or a signal to stop or terminate. - if (lf_clock_cond_timedwait(&updated_delayed_grants, next_time)) { - // Time reached to send the grant. Do it for delayed grants with the same tag - LF_MUTEX_LOCK(&rti_mutex); - next = pqueue_delayed_grants_pop(rti_remote->delayed_grants); - federate_info_t* fed = GET_FED_INFO(next->fed_id); - if (next->is_provisional) { - notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + // The first message from the federate should contain its ID and the federation ID. + // The function also detects if a hot swap request is initiated. + int32_t fed_id = receive_and_check_fed_id_message(&socket_id, (struct sockaddr_in*)&client_fd); + + if (fed_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && + receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { + LF_MUTEX_LOCK(&rti_mutex); + if (hot_swap_in_progress) { + lf_print("RTI: Hot swap confirmed for federate %d.", fed_id); + + // Then send STOP + federate_info_t* fed_old = GET_FED_INFO(fed_id); + LF_PRINT_LOG("RTI: Send MSG_TYPE_STOP to old federate %d.", fed_id); + send_stop(fed_old); + LF_MUTEX_UNLOCK(&rti_mutex); + + // Wait for the old federate to send MSG_TYPE_RESIGN + LF_PRINT_LOG("RTI: Waiting for old federate %d to send resign.", fed_id); + // FIXME: Should this have a timeout? + while (!hot_swap_old_resigned) + ; + + // The latest LTC is the tag at which the old federate resigned. This is useful + // for computing the effective_start_time of the new joining federate. 
+ hot_swap_federate->enclave.completed = fed_old->enclave.completed; + + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. + lf_thread_create(&(hot_swap_federate->thread_id), federate_info_thread_TCP, hot_swap_federate); + + LF_MUTEX_LOCK(&rti_mutex); + // Redirect the federate in rti_remote + rti_remote->base.scheduling_nodes[fed_id] = (scheduling_node_t*)hot_swap_federate; + + // Free the old federate memory and reset the Hot wap indicators + pqueue_tag_free(fed_old->in_transit_message_tags); + free(fed_old); + hot_swap_federate = NULL; + hot_swap_in_progress = false; + hot_swap_old_resigned = false; + LF_MUTEX_UNLOCK(&rti_mutex); + lf_print("RTI: Hot swap succeeded for federate %d.", fed_id); + } else { + lf_mutex_unlock(&rti_mutex); + + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. + federate_info_t* fed = GET_FED_INFO(fed_id); + lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + lf_print("RTI: Transient federate %d joined.", fed_id); + } + rti_remote->number_of_connected_transient_federates++; } else { - notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + // If a hot swap was initialed, but the connection information or/and clock + // synchronization fail, then reset hot_swap_in_profress, and free the memory + // allocated for hot_swap_federate + if (hot_swap_in_progress) { + lf_print("RTI: Hot swap canceled for federate %d.", fed_id); + lf_mutex_lock(&rti_mutex); + hot_swap_in_progress = false; + lf_mutex_unlock(&rti_mutex); + + // FIXME: Is this enough to free the memory of a federate_info_t data structure? 
+ free(hot_swap_federate); + } } - LF_MUTEX_UNLOCK(&rti_mutex); - } else { - // Waiting was interrupted, because of an update in the queue, or - // because this thread needs to terminate - lf_print("RTI: lf_delayed_grants_thread() did not send grant to %d at " PRINTF_TIME ", but rather terminated!", - next->fed_id, next_time - start_time); } - } - } - // The federation is at the shutdown phase. All persistent federates exited. - // We can do a sanity check that the delayed_grants queue is empty. - // FIXME: If there are still pending grants, what does that mean? Maybe that the - // federation stopped after a request to stop (not a timeout). Therefore, we need - // cleanup, and free the memory... - // TODO: do it! - return NULL; -} - -void* respond_to_erroneous_connections(void* nothing) { - initialize_lf_thread_id(); - while (true) { - // Wait for an incoming connection request. - // The following will block until either a federate attempts to connect - // or close(rti->socket_descriptor_TCP) is called. - int socket_id = accept_socket(rti_remote->socket_descriptor_TCP, -1); - if (socket_id < 0) { - return NULL; - } - if (rti_remote->all_federates_exited) { return NULL; } - lf_print_error("RTI received an unexpected connection request. Federation is running."); - unsigned char response[2]; - response[0] = MSG_TYPE_REJECT; - response[1] = FEDERATION_ID_DOES_NOT_MATCH; - // Ignore errors on this response. - if (write_to_socket(socket_id, 2, response)) { - lf_print_warning("RTI failed to write FEDERATION_ID_DOES_NOT_MATCH to erroneous incoming connection."); + /** + * This thread is responsible for managing the priority queue of delayed grants to be issued. + * It waits until the current time matches the highest priority tag time in the queue. + * If reached, it notifies the grant immediately. 
If, however, the current time has not yet + * reached the highest priority tag and the queue has been updated (either by inserting or + * canceling an entry), the thread stops waiting and restarts the process again. + */ + void* lf_delayed_grants_thread(void* nothing) { + initialize_lf_thread_id(); + + // Wait for the first condition signal + lf_cond_wait(&updated_delayed_grants); + + while (true) { + if (rti_remote->all_federates_exited) { + break; + } + if (pqueue_delayed_grants_size(rti_remote->delayed_grants) != 0) { + pqueue_delayed_grant_element_t* next; + + // Do not pop, but rather read + next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); + instant_t next_time = next->base.tag.time; + // Wait for expiration, or a signal to stop or terminate. + if (lf_clock_cond_timedwait(&updated_delayed_grants, next_time)) { + // Time reached to send the grant. Do it for delayed grants with the same tag + LF_MUTEX_LOCK(&rti_mutex); + next = pqueue_delayed_grants_pop(rti_remote->delayed_grants); + federate_info_t* fed = GET_FED_INFO(next->fed_id); + if (next->is_provisional) { + notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } else { + notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } + LF_MUTEX_UNLOCK(&rti_mutex); + } else { + // Waiting was interrupted, because of an update in the queue, or + // because this thread needs to terminate + lf_print("RTI: lf_delayed_grants_thread() did not send grant to %d at " PRINTF_TIME + ", but rather terminated!", + next->fed_id, next_time - start_time); + } + } + } + // The federation is at the shutdown phase. All persistent federates exited. + // We can do a sanity check that the delayed_grants queue is empty. + // FIXME: If there are still pending grants, what does that mean? Maybe that the + // federation stopped after a request to stop (not a timeout). Therefore, we need + // cleanup, and free the memory... + // TODO: do it! + return NULL; } - // Close the socket. 
- shutdown(socket_id, SHUT_RDWR); - close(socket_id); - } - return NULL; -} - -void initialize_federate(federate_info_t* fed, uint16_t id) { - initialize_scheduling_node(&(fed->enclave), id); - fed->requested_stop = false; - fed->socket = -1; // No socket. - fed->clock_synchronization_enabled = true; - fed->in_transit_message_tags = pqueue_tag_init(10); - strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); - fed->server_ip_addr.s_addr = 0; - fed->server_port = -1; - fed->is_transient = true; -} -int32_t start_rti_server(uint16_t port) { - _lf_initialize_clock(); - // Create the TCP socket server - if (create_server(port, &rti_remote->socket_descriptor_TCP, &rti_remote->final_port_TCP, TCP, true)) { - lf_print_error_system_failure("RTI failed to create TCP server: %s.", strerror(errno)); - }; - lf_print("RTI: Listening for federates."); - // Create the UDP socket server - // Try to get the rti_remote->final_port_TCP + 1 port - if (rti_remote->clock_sync_global_status >= clock_sync_on) { - if (create_server(rti_remote->final_port_TCP + 1, &rti_remote->socket_descriptor_UDP, &rti_remote->final_port_UDP, - UDP, true)) { - lf_print_error_system_failure("RTI failed to create UDP server: %s.", strerror(errno)); - } - } - return rti_remote->socket_descriptor_TCP; -} + void* respond_to_erroneous_connections(void* nothing) { + initialize_lf_thread_id(); + while (true) { + // Wait for an incoming connection request. + // The following will block until either a federate attempts to connect + // or close(rti->socket_descriptor_TCP) is called. + int socket_id = accept_socket(rti_remote->socket_descriptor_TCP, -1); + if (socket_id < 0) { + return NULL; + } + if (rti_remote->all_federates_exited) { + return NULL; + } -/** - * Iterate over the federates and sets 'has_upstream_transient_federates'. 
- * Once done, check that no transient federate has an upstream transient federate, - * and compute the number of persistent federates that do have upstream transients, - * which is the maximun number of delayed grants that can be pending at the same time. - * This is useful for initialyzing the queue of delayed grants. - - * @return -1, if there is more than one level of transiency, else, the number of - * persistents that have an upstream transient - */ -static int set_has_upstream_transient_federates_parameter_and_check() { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - for (int j = 0; j < fed->enclave.num_upstream; j++) { - federate_info_t* upstream_fed = GET_FED_INFO(fed->enclave.upstream[j]); - if (upstream_fed->is_transient) { - fed->has_upstream_transient_federates = true; - break; + lf_print_error("RTI received an unexpected connection request. Federation is running."); + unsigned char response[2]; + response[0] = MSG_TYPE_REJECT; + response[1] = FEDERATION_ID_DOES_NOT_MATCH; + // Ignore errors on this response. + if (write_to_socket(socket_id, 2, response)) { + lf_print_warning("RTI failed to write FEDERATION_ID_DOES_NOT_MATCH to erroneous incoming connection."); + } + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); } + return NULL; } - } - // Now check that no transient has an upstream transient - // FIXME: Do we really need this? Or should it be the job of the validator? - int max_number_of_delayed_grants = 0; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (fed->is_transient && fed->has_upstream_transient_federates) { - return -1; + void initialize_federate(federate_info_t * fed, uint16_t id) { + initialize_scheduling_node(&(fed->enclave), id); + fed->requested_stop = false; + fed->socket = -1; // No socket. 
+ fed->clock_synchronization_enabled = true; + fed->in_transit_message_tags = pqueue_tag_init(10); + strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); + fed->server_ip_addr.s_addr = 0; + fed->server_port = -1; + fed->is_transient = true; } - if (!fed->is_transient && fed->has_upstream_transient_federates) { - max_number_of_delayed_grants++; - } - } - return max_number_of_delayed_grants; -} - -void wait_for_federates(int socket_descriptor) { - // Wait for connections from persistent federates and create a thread for each. - lf_connect_to_persistent_federates(socket_descriptor); - - // Set has_upstream_transient_federates parameter in all federates and check - // that there is no more than one level of transiency - if (rti_remote->number_of_transient_federates > 0) { - int max_number_of_pending_grants = set_has_upstream_transient_federates_parameter_and_check(); - if (max_number_of_pending_grants == -1) { - lf_print_error_and_exit("RTI: Transient federates cannot have transient upstreams!"); + int32_t start_rti_server(uint16_t port) { + _lf_initialize_clock(); + // Create the TCP socket server + if (create_server(port, &rti_remote->socket_descriptor_TCP, &rti_remote->final_port_TCP, TCP, true)) { + lf_print_error_system_failure("RTI failed to create TCP server: %s.", strerror(errno)); + }; + lf_print("RTI: Listening for federates."); + // Create the UDP socket server + // Try to get the rti_remote->final_port_TCP + 1 port + if (rti_remote->clock_sync_global_status >= clock_sync_on) { + if (create_server(rti_remote->final_port_TCP + 1, &rti_remote->socket_descriptor_UDP, + &rti_remote->final_port_UDP, UDP, true)) { + lf_print_error_system_failure("RTI failed to create UDP server: %s.", strerror(errno)); + } + } + return rti_remote->socket_descriptor_TCP; } - rti_remote->delayed_grants = pqueue_delayed_grants_init(max_number_of_pending_grants); - } - - // All persistent federates have connected. 
- lf_print("RTI: All expected persistent federates have connected. Starting execution."); - if (rti_remote->number_of_transient_federates > 0) { - lf_print("RTI: Transient Federates can join and leave the federation at anytime."); - } - // The socket server will only continue to accept connections from transient - // federates. - // In case some other federation's federates are trying to join the wrong - // federation, need to respond. Start a separate thread to do that. - lf_thread_t responder_thread; - lf_thread_t transient_thread; - lf_thread_t delayed_grants_thread; + /** + * Iterate over the federates and sets 'has_upstream_transient_federates'. + * Once done, check that no transient federate has an upstream transient federate, + * and compute the number of persistent federates that do have upstream transients, + * which is the maximun number of delayed grants that can be pending at the same time. + * This is useful for initialyzing the queue of delayed grants. + + * @return -1, if there is more than one level of transiency, else, the number of + * persistents that have an upstream transient + */ + static int set_has_upstream_transient_federates_parameter_and_check() { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + for (int j = 0; j < fed->enclave.num_upstream; j++) { + federate_info_t* upstream_fed = GET_FED_INFO(fed->enclave.upstream[j]); + if (upstream_fed->is_transient) { + fed->has_upstream_transient_federates = true; + break; + } + } + } - // If the federation does not include transient federates, then respond to - // erronous connections. Otherwise, continue to accept transients joining and - // respond to duplicate joing requests. 
- if (rti_remote->number_of_transient_federates == 0) { - lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); - } else if (rti_remote->number_of_transient_federates > 0) { - lf_thread_create(&transient_thread, lf_connect_to_transient_federates_thread, NULL); - lf_thread_create(&delayed_grants_thread, lf_delayed_grants_thread, NULL); - } + // Now check that no transient has an upstream transient + // FIXME: Do we really need this? Or should it be the job of the validator? + int max_number_of_delayed_grants = 0; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->is_transient && fed->has_upstream_transient_federates) { + return -1; + } + if (!fed->is_transient && fed->has_upstream_transient_federates) { + max_number_of_delayed_grants++; + } + } - // Wait for persistent federate threads to exit. - void* thread_exit_status; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (!fed->is_transient) { - lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); - lf_thread_join(fed->thread_id, &thread_exit_status); - pqueue_tag_free(fed->in_transit_message_tags); - lf_print("RTI: Persistent federate %d thread exited.", fed->enclave.id); + return max_number_of_delayed_grants; } - } - rti_remote->all_persistent_federates_exited = true; - rti_remote->phase = shutdown_phase; - lf_print("RTI: All persistent threads exited."); + void wait_for_federates(int socket_descriptor) { + // Wait for connections from persistent federates and create a thread for each. + lf_connect_to_persistent_federates(socket_descriptor); + // Wait for connections from persistent federates and create a thread for each. 
+ lf_connect_to_persistent_federates(socket_descriptor); + + // Set has_upstream_transient_federates parameter in all federates and check + // that there is no more than one level of transiency + if (rti_remote->number_of_transient_federates > 0) { + int max_number_of_pending_grants = set_has_upstream_transient_federates_parameter_and_check(); + if (max_number_of_pending_grants == -1) { + lf_print_error_and_exit("RTI: Transient federates cannot have transient upstreams!"); + } + rti_remote->delayed_grants = pqueue_delayed_grants_init(max_number_of_pending_grants); + } - // Wait for transient federate threads to exit, if any. - if (rti_remote->number_of_transient_federates > 0) { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (fed->is_transient) { - lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); - lf_thread_join(fed->thread_id, &thread_exit_status); - pqueue_tag_free(fed->in_transit_message_tags); - lf_print("RTI: Transient federate %d thread exited.", fed->enclave.id); + // All persistent federates have connected. + lf_print("RTI: All expected persistent federates have connected. Starting execution."); + if (rti_remote->number_of_transient_federates > 0) { + lf_print("RTI: Transient Federates can join and leave the federation at anytime."); } - } - } - rti_remote->all_federates_exited = true; + // The socket server will only continue to accept connections from transient + // federates. + // In case some other federation's federates are trying to join the wrong + // federation, need to respond. Start a separate thread to do that. + lf_thread_t responder_thread; + lf_thread_t transient_thread; + lf_thread_t delayed_grants_thread; + + // If the federation does not include transient federates, then respond to + // erronous connections. Otherwise, continue to accept transients joining and + // respond to duplicate joing requests. 
+ if (rti_remote->number_of_transient_federates == 0) { + lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); + } else if (rti_remote->number_of_transient_federates > 0) { + lf_thread_create(&transient_thread, lf_connect_to_transient_federates_thread, NULL); + lf_thread_create(&delayed_grants_thread, lf_delayed_grants_thread, NULL); + } - // Shutdown and close the socket that is listening for incoming connections - // so that the accept() call in respond_to_erroneous_connections returns. - // That thread should then check rti->all_federates_exited and it should exit. - if (shutdown(socket_descriptor, SHUT_RDWR)) { - LF_PRINT_LOG("On shut down TCP socket, received reply: %s", strerror(errno)); - } - // NOTE: In all common TCP/IP stacks, there is a time period, - // typically between 30 and 120 seconds, called the TIME_WAIT period, - // before the port is released after this close. This is because - // the OS is preventing another program from accidentally receiving - // duplicated packets intended for this program. - close(socket_descriptor); + // Wait for persistent federate threads to exit. 
+ void* thread_exit_status; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (!fed->is_transient) { + lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); + lf_thread_join(fed->thread_id, &thread_exit_status); + pqueue_tag_free(fed->in_transit_message_tags); + lf_print("RTI: Persistent federate %d thread exited.", fed->enclave.id); + } + } - if (rti_remote->socket_descriptor_UDP > 0) { - if (shutdown(rti_remote->socket_descriptor_UDP, SHUT_RDWR)) { - LF_PRINT_LOG("On shut down UDP socket, received reply: %s", strerror(errno)); - } - close(rti_remote->socket_descriptor_UDP); - } -} + rti_remote->all_persistent_federates_exited = true; + rti_remote->phase = shutdown_phase; + lf_print("RTI: All persistent threads exited."); + + // Wait for transient federate threads to exit, if any. + if (rti_remote->number_of_transient_federates > 0) { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->is_transient) { + lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); + lf_thread_join(fed->thread_id, &thread_exit_status); + pqueue_tag_free(fed->in_transit_message_tags); + lf_print("RTI: Transient federate %d thread exited.", fed->enclave.id); + } + } + } -void initialize_RTI(rti_remote_t* rti) { - rti_remote = rti; - - // Initialize thread synchronization primitives - LF_MUTEX_INIT(&rti_mutex); - LF_COND_INIT(&received_start_times, &rti_mutex); - LF_COND_INIT(&sent_start_time, &rti_mutex); - LF_COND_INIT(&updated_delayed_grants, &rti_mutex); - - initialize_rti_common(&rti_remote->base); - rti_remote->base.mutex = &rti_mutex; - - // federation_rti related initializations - rti_remote->max_start_time = 0LL; - rti_remote->num_feds_proposed_start = 0; - rti_remote->all_federates_exited = false; - rti_remote->all_persistent_federates_exited = false; - rti_remote->federation_id = "Unidentified 
Federation"; - rti_remote->user_specified_port = 0; - rti_remote->final_port_TCP = 0; - rti_remote->socket_descriptor_TCP = -1; - rti_remote->final_port_UDP = UINT16_MAX; - rti_remote->socket_descriptor_UDP = -1; - rti_remote->clock_sync_global_status = clock_sync_init; - rti_remote->clock_sync_period_ns = MSEC(10); - rti_remote->clock_sync_exchanges_per_interval = 10; - rti_remote->authentication_enabled = false; - rti_remote->base.tracing_enabled = false; - rti_remote->stop_in_progress = false; - rti_remote->number_of_transient_federates = 0; - rti_remote->phase = startup_phase; -} + rti_remote->all_federates_exited = true; -// The RTI includes clock.c, which requires the following functions that are defined -// in clock-sync.c. But clock-sync.c is not included in the standalone RTI. -// Provide empty implementations of these functions. -void clock_sync_add_offset(instant_t* t) { (void)t; } -void clock_sync_subtract_offset(instant_t* t) { (void)t; } - -void free_scheduling_nodes(scheduling_node_t** scheduling_nodes, uint16_t number_of_scheduling_nodes) { - for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) { - scheduling_node_t* node = scheduling_nodes[i]; - if (node->upstream != NULL) { - free(node->upstream); - free(node->upstream_delay); + // Shutdown and close the socket that is listening for incoming connections + // so that the accept() call in respond_to_erroneous_connections returns. + // That thread should then check rti->all_federates_exited and it should exit. + if (shutdown(socket_descriptor, SHUT_RDWR)) { + LF_PRINT_LOG("On shut down TCP socket, received reply: %s", strerror(errno)); + } + // NOTE: In all common TCP/IP stacks, there is a time period, + // typically between 30 and 120 seconds, called the TIME_WAIT period, + // before the port is released after this close. This is because + // the OS is preventing another program from accidentally receiving + // duplicated packets intended for this program. 
+ close(socket_descriptor); + + if (rti_remote->socket_descriptor_UDP > 0) { + if (shutdown(rti_remote->socket_descriptor_UDP, SHUT_RDWR)) { + LF_PRINT_LOG("On shut down UDP socket, received reply: %s", strerror(errno)); + } + close(rti_remote->socket_descriptor_UDP); + } } - if (node->min_delays != NULL) { - free(node->min_delays); + + void initialize_RTI(rti_remote_t * rti) { + rti_remote = rti; + + // Initialize thread synchronization primitives + LF_MUTEX_INIT(&rti_mutex); + LF_COND_INIT(&received_start_times, &rti_mutex); + LF_COND_INIT(&sent_start_time, &rti_mutex); + LF_COND_INIT(&updated_delayed_grants, &rti_mutex); + + initialize_rti_common(&rti_remote->base); + rti_remote->base.mutex = &rti_mutex; + + // federation_rti related initializations + rti_remote->max_start_time = 0LL; + rti_remote->num_feds_proposed_start = 0; + rti_remote->all_federates_exited = false; + rti_remote->all_persistent_federates_exited = false; + rti_remote->federation_id = "Unidentified Federation"; + rti_remote->user_specified_port = 0; + rti_remote->final_port_TCP = 0; + rti_remote->socket_descriptor_TCP = -1; + rti_remote->final_port_UDP = UINT16_MAX; + rti_remote->socket_descriptor_UDP = -1; + rti_remote->clock_sync_global_status = clock_sync_init; + rti_remote->clock_sync_period_ns = MSEC(10); + rti_remote->clock_sync_exchanges_per_interval = 10; + rti_remote->authentication_enabled = false; + rti_remote->base.tracing_enabled = false; + rti_remote->stop_in_progress = false; + rti_remote->number_of_transient_federates = 0; + rti_remote->phase = startup_phase; } - if (node->downstream != NULL) { - free(node->downstream); + + // The RTI includes clock.c, which requires the following functions that are defined + // in clock-sync.c. But clock-sync.c is not included in the standalone RTI. + // Provide empty implementations of these functions. 
+ void clock_sync_add_offset(instant_t * t) { (void)t; } + void clock_sync_subtract_offset(instant_t * t) { (void)t; } + + void free_scheduling_nodes(scheduling_node_t * *scheduling_nodes, uint16_t number_of_scheduling_nodes) { + for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) { + scheduling_node_t* node = scheduling_nodes[i]; + if (node->upstream != NULL) { + free(node->upstream); + free(node->upstream_delay); + } + if (node->min_delays != NULL) { + free(node->min_delays); + } + if (node->downstream != NULL) { + free(node->downstream); + } + free(node); + } + free(scheduling_nodes); } - free(node); - } - free(scheduling_nodes); -} #endif // STANDALONE_RTI diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index 0ea2fd5bd..da0dc1832 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -398,7 +398,7 @@ void send_reject(int* socket_id, unsigned char error_code); * Return when all persistent federates have connected. * @param socket_descriptor The socket on which to accept connections. */ -void lf_connect_to_persistent_federates(int socket_descriptor); +void* lf_connect_to_persistent_transient_federates_thread(int socket_descriptor); /** * Thread to wait for incoming connection request from transient federates. From fad4dcc98c3e7b50341bdeffa2f986c6cc1f95e4 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 1 Feb 2024 14:00:36 +0100 Subject: [PATCH 080/148] Add support of the effective_start_tag cont. --- core/threaded/reactor_threaded.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index 6d0759df9..ef6634b0b 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -606,7 +606,7 @@ void _lf_initialize_start_tag(environment_t* env) { // the required waiting time. 
Second, this call releases the mutex lock and allows // other threads (specifically, federate threads that handle incoming p2p messages // from other federates) to hold the lock and possibly raise a tag barrier. - while (!wait_until(effective_start_tag.time + _lf_fed_STA_offset, &env->event_q_changed)) { + while (!wait_until(effective_start_tag.time + lf_fed_STA_offset, &env->event_q_changed)) { }; LF_PRINT_DEBUG("Done waiting for effective start time + STA offset " PRINTF_TIME ".", effective_start_tag.time + lf_fed_STA_offset); From d28bd54e000e2ba9669374672c3380fcdaa23664 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 1 Feb 2024 14:21:36 +0100 Subject: [PATCH 081/148] Federate shares its type (persistent or transient) --- core/federated/federate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index cb6f0aa64..3435a5d6d 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2087,7 +2087,7 @@ void* lf_handle_p2p_connections_from_federates(void* env_arg) { // Extract the ID of the sending federate. 
uint16_t remote_fed_id = extract_uint16((unsigned char*)&(buffer[1])); - // bool remote_fed_is_transient = buffer[1 + sizeof(uint16_t)]; + bool remote_fed_is_transient = buffer[1 + sizeof(uint16_t)]; LF_PRINT_DEBUG("Received sending federate ID %d.", remote_fed_id); // Trace the event when tracing is enabled From f19ac22af114c5aba7ffb079c05f53203844d629 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 15 Feb 2024 11:58:39 +0100 Subject: [PATCH 082/148] Granting TAG or PTAGs can be delayed if one of the upstream federates is an absent transient --- core/federated/RTI/rti_remote.c | 345 +++++++++++++++++--------------- core/federated/RTI/rti_remote.h | 5 - 2 files changed, 189 insertions(+), 161 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index b713bd6a0..e56b5f3ad 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -267,6 +267,60 @@ static void notify_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) } } +/** + * @brief Thread that sleeps for a period of time, and then wakes up to check if + * a tag advance grant needs to be sent. That is, if the pending tag have not + * been reset to NEVER_TAG, the tag advance grant will be immediate. + * + * @param federate the fedarate whose tag advance grant needs to be delayed. 
+ */ +void* pending_grant_thread(void* federate) { + federate_info_t* fed = (federate_info_t*)federate; + + interval_t sleep_interval = fed->pending_grant.time - lf_time_physical(); + if (sleep_interval > 0) { + lf_sleep(sleep_interval); + } + + lf_mutex_lock(&rti_mutex); + + // If the pending grant becomes NEVER_TAG, then this means that it should + // not be sent + if (lf_tag_compare(fed->pending_grant, NEVER_TAG) != 0) { + notify_tag_advance_grant_immediate(&(fed->enclave), fed->pending_grant); + fed->pending_grant = NEVER_TAG; + } + lf_mutex_unlock(&rti_mutex); +} + +/** + * Notify a tag advance grant (TAG) message to the specified federate after + * the physical time reaches the tag. A thread is created to this end. + * + * If a provisionl tag advance grant is pending, cancel it. If there is another + * pending tag advance grant, do not proceed with the thread creation. + * + * @param e The enclave. + * @param tag The tag to grant. + */ +void notify_tag_advance_grant_delayed(scheduling_node_t* e, tag_t tag) { + federate_info_t* fed = GET_FED_INFO(e->id); + + // Check wether there is already a pending grant + // And check the pending provisional grant as well + lf_mutex_lock(&rti_mutex); + if (lf_tag_compare(fed->pending_grant, NEVER_TAG) == 0) { + // If a tag is issued, then stop any possible provisional tag grant + fed->pending_grant = tag; + fed->pending_provisional_grant = NEVER_TAG; + lf_thread_create(&(fed->pending_grant_thread_id), pending_grant_thread, fed); + } else { + // If there is already a pending tag grant, then let it be sent first + // FIXME: Is this correct? 
+ } + lf_mutex_unlock(&rti_mutex); +} + void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { @@ -279,21 +333,22 @@ void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { lf_cond_wait(&sent_start_time); } - // Check if sending the tag advance grant needs to be delayed or not. - // Delay is needed when a federate has at least one absent upstream transient. - // Check if sending the tag advance grant needs to be delayed or not // Delay is needed when a federate has, at least one, absent upstream transient - federate_info_t* fed = GET_FED_INFO(e->id); - if (!fed->has_upstream_transient_federates) { - notify_tag_advance_grant_immediate(e, tag); - } else { - if (get_num_absent_upstream_transients(fed) > 0) { - notify_grant_delayed(fed, tag, false); - } else { - notify_tag_advance_grant_immediate(e, tag); + int num_absent_upstram_transients = 0; + for (int j = 0; j < e->num_upstream; j++) { + federate_info_t* upstream = GET_FED_INFO(e->upstream[j]); + // Do Ignore this enclave if it no longer connected. + if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { + num_absent_upstram_transients++; + break; } } + if (num_absent_upstram_transients > 0) { + notify_tag_advance_grant_delayed(e, tag); + } else { + notify_tag_advance_grant_immediate(e, tag); + } } /** @@ -306,7 +361,7 @@ void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { * @param e The scheduling node. * @param tag The tag to grant. 
*/ -static void notify_provisional_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) { +void notify_provisional_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) { size_t message_length = 1 + sizeof(int64_t) + sizeof(uint32_t); unsigned char buffer[message_length]; buffer[0] = MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT; @@ -1908,167 +1963,145 @@ static bool authenticate_federate(int* socket) { void lf_connect_to_persistent_federates(int socket_descriptor) { for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes - rti_remote.number_of_transient_federates; i++) { - // FIXME: The socket descriptor here (parameter) is not used. Should be removed? - void lf_connect_to_persistent_federates(int socket_descriptor) { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates; - i++) { - // Wait for an incoming connection request. - struct sockaddr client_fd; - uint32_t client_length = sizeof(client_fd); - // The following blocks until a federate connects. - int socket_id = -1; - while (1) { - socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); - if (socket_id >= 0) { - // Got a socket - break; - } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { - lf_print_error_system_failure("RTI failed to accept the socket."); - } else { - // Try again - lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); - continue; - } - } + // Wait for an incoming connection request. + struct sockaddr client_fd; + uint32_t client_length = sizeof(client_fd); + // The following blocks until a federate connects. 
+ int socket_id = -1; + while (1) { + socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); + if (socket_id >= 0) { + // Got a socket + break; + } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { + lf_print_error_system_failure("RTI failed to accept the socket."); + } else { + // Try again + lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); + continue; + } + } // Wait for the first message from the federate when RTI -a option is on. #ifdef __RTI_AUTH__ - if (rti_remote->authentication_enabled) { - if (!authenticate_federate(&socket_id)) { - lf_print_warning("RTI failed to authenticate the incoming federate."); - // Close the socket. - shutdown(socket_id, SHUT_RDWR); - close(socket_id); - socket_id = -1; - // Ignore the federate that failed authentication. - i--; - continue; - } - } + if (rti_remote->authentication_enabled) { + if (!authenticate_federate(&socket_id)) { + lf_print_warning("RTI failed to authenticate the incoming federate."); + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); + socket_id = -1; + // Ignore the federate that failed authentication. + i--; + continue; + } + } #endif - // The first message from the federate should contain its ID and the federation ID. - int32_t fed_id = receive_and_check_fed_id_message(&socket_id); - if (fed_id >= 0 && socket_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && - receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { - - // Create a thread to communicate with the federate. - // This has to be done after clock synchronization is finished - // or that thread may end up attempting to handle incoming clock - // synchronization messages. - federate_info_t* fed = GET_FED_INFO(fed_id); - lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + // The first message from the federate should contain its ID and the federation ID. 
+ int32_t fed_id = receive_and_check_fed_id_message(&socket_id); + if (fed_id >= 0 && socket_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && + receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { - // If the federate is transient, then do not count it. - if (fed->is_transient) { - rti_remote.number_of_connected_transient_federates++; - assert(rti_remote.number_of_connected_transient_federates <= rti_remote.number_of_transient_federates); - i--; - lf_print("RTI: Transient federate %d joined.", fed->base.id); - } - } else { - // Received message was rejected. Try again. - i--; - } + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. + federate_info_t* fed = GET_FED_INFO(fed_id); + lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + + // If the federate is transient, then do not count it. + if (fed->is_transient) { + rti_remote->number_of_connected_transient_federates++; + assert(rti_remote->number_of_connected_transient_federates <= rti_remote->number_of_transient_federates); + i--; + lf_print("RTI: Transient federate %d joined.", fed->enclave.id); } - // All federates have connected. - LF_PRINT_DEBUG("All persistent federates have connected to RTI."); - - if (rti_remote->clock_sync_global_status >= clock_sync_on) { - // Create the thread that performs periodic PTP clock synchronization sessions - // over the UDP channel, but only if the UDP channel is open and at least one - // federate is performing runtime clock synchronization. 
- bool clock_sync_enabled = false; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed_info = GET_FED_INFO(i); - if (fed_info->clock_synchronization_enabled) { - clock_sync_enabled = true; - break; - } - } - if (rti_remote->final_port_UDP != UINT16_MAX && clock_sync_enabled) { - lf_thread_create(&rti_remote->clock_thread, clock_synchronization_thread, NULL); - } + } else { + // Received message was rejected. Try again. + i--; + } + } + // All federates have connected. + LF_PRINT_DEBUG("All persistent federates have connected to RTI."); + + if (rti_remote->clock_sync_global_status >= clock_sync_on) { + // Create the thread that performs periodic PTP clock synchronization sessions + // over the UDP channel, but only if the UDP channel is open and at least one + // federate is performing runtime clock synchronization. + bool clock_sync_enabled = false; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed_info = GET_FED_INFO(i); + if (fed_info->clock_synchronization_enabled) { + clock_sync_enabled = true; + break; } } + if (rti_remote->final_port_UDP != UINT16_MAX && clock_sync_enabled) { + lf_thread_create(&rti_remote->clock_thread, clock_synchronization_thread, NULL); + } + } +} - void* lf_connect_to_transient_federates_thread(int socket_descriptor) { - // This loop will continue to accept connections of transient federates, as - // soon as there is room, or enable hot swap - - while (!rti_remote.all_persistent_federates_exited) { - // Continue waiting for an incoming connection requests from transients - // to join, or for hot swap. - // Wait for an incoming connection request. - struct sockaddr client_fd; - uint32_t client_length = sizeof(client_fd); - // The following blocks until a federate connects. 
- int socket_id = -1; - while (1) { - if (!rti_remote.all_persistent_federates_exited) { - return NULL; - } - socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); - if (socket_id >= 0) { - // Got a socket - break; - } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { - lf_print_error_system_failure("RTI failed to accept the socket."); - } else { - // Try again - lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); - continue; - } - } +void* lf_connect_to_transient_federates_thread(int socket_descriptor) { + // This loop will continue to accept connections of transient federates, as + // soon as there is room, or enable hot swap + + while (!rti_remote->all_persistent_federates_exited) { + // Continue waiting for an incoming connection requests from transients + // to join, or for hot swap. + // Wait for an incoming connection request. + struct sockaddr client_fd; + uint32_t client_length = sizeof(client_fd); + // The following blocks until a federate connects. + int socket_id = -1; + while (1) { + if (!rti_remote->all_persistent_federates_exited) { + return NULL; + } + socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); + if (socket_id >= 0) { + // Got a socket + break; + } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { + lf_print_error_system_failure("RTI failed to accept the socket."); + } else { + // Try again + lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); + continue; + } + } // Wait for the first message from the federate when RTI -a option is on. #ifdef __RTI_AUTH__ - if (rti_remote->authentication_enabled) { - if (!authenticate_federate(&socket_id)) { - lf_print_warning("RTI failed to authenticate the incoming federate."); - // Close the socket. - shutdown(socket_id, SHUT_RDWR); - close(socket_id); - socket_id = -1; - // Ignore the federate that failed authentication. 
- i--; - continue; - } - } + if (rti_remote->authentication_enabled) { + if (!authenticate_federate(&socket_id)) { + lf_print_warning("RTI failed to authenticate the incoming federate."); + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); + socket_id = -1; + // Ignore the federate that failed authentication. + i--; + continue; + } + } #endif - // The first message from the federate should contain its ID and the federation ID. - // The function also detects if a hot swap request is initiated. - int32_t fed_id = receive_and_check_fed_id_message(&socket_id, (struct sockaddr_in*)&client_fd); - if (fed_id >= 0 && socket_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && - receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { - } - - // // Create a thread to communicate with the federate. - // // This has to be done after clock synchronization is finished - // // or that thread may end up attempting to handle incoming clock - // // synchronization messages. - // federate_info_t *fed = GET_FED_INFO(fed_id); - // lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); - - // // If the federate is transient, then do not count it. - // if (fed->is_transient) { - // rti_remote.number_of_connected_transient_federates++; - // assert(rti_remote.number_of_connected_transient_federates <= - // rti_remote.number_of_transient_federates); i--; lf_print("RTI: Transient federate %d joined.", - // fed->base.id); - // } - // } else { - // // Received message was rejected. Try again. - // i--; - // } - - // FIXME: Check again if runtime clock synchronization should be lauched, - // only if the number of persistent threads is zero. This should be done - // only once, not at every transient connection. - } + // The first message from the federate should contain its ID and the federation ID. + // The function also detects if a hot swap request is initiated. 
+ int32_t fed_id = receive_and_check_fed_id_message(&socket_id, (struct sockaddr_in*)&client_fd); + if (fed_id >= 0 && socket_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && + receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { } + // // Create a thread to communicate with the federate. + // // This has to be done after clock synchronization is finished + // // or that thread may end up attempting to handle incoming clock + // // synchronization messages. + // federate_info_t *fed = GET_FED_INFO(fed_id); + // lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + /** * @brief A request for immediate stop to the federate * diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index da0dc1832..62e75235b 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -408,11 +408,6 @@ void* lf_connect_to_persistent_transient_federates_thread(int socket_descriptor) */ void* lf_connect_to_transient_federates_thread(void* nothing); -/** - * Thread that manages the delayed grants using a priprity queue. - */ -void* lf_delayed_grants_thread(void* nothing); - /** * Thread to respond to new connections, which could be federates of other * federations who are attempting to join the wrong federation. 
From 97b38123860f3a7dd3cb0713928a14c08ab003db Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 15 Feb 2024 14:37:56 +0100 Subject: [PATCH 083/148] Start on enabling the hot swap mechanism --- core/federated/RTI/rti_remote.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index e56b5f3ad..eb8894bbb 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1447,10 +1447,11 @@ void* federate_info_thread_TCP(void* fed) { close(my_fed->socket); // from unistd.h // Manual clean, in case of a transient federate if (my_fed->is_transient) { + free_in_transit_message_q(my_fed->in_transit_message_tags); lf_print("RTI: Transient Federate %d thread exited.", my_fed->enclave.id); // Update the number of connected transient federates - rti_remote->number_of_connected_transient_federates--; + _f_rti->number_of_connected_transient_federates--; // Reset the status of the leaving federate reset_transient_federate(my_fed); @@ -1550,7 +1551,7 @@ static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_ LF_PRINT_DEBUG("RTI received federation ID: %s.", federation_id_received); if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_FED_ID, fed_id, NULL); + tracepoint_rti_from_federate(rti_remote->base.trace, receive_FED_ID, fed_id, NULL); } // Compare the received federation ID to mine. 
if (strncmp(rti_remote->federation_id, federation_id_received, federation_id_length) != 0) { @@ -1558,7 +1559,7 @@ static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_ lf_print_warning("Federate from another federation %s attempted to connect to RTI in federation %s.", federation_id_received, rti_remote->federation_id); if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); } send_reject(socket_id, FEDERATION_ID_DOES_NOT_MATCH); return -1; @@ -1567,11 +1568,17 @@ static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_ // Federate ID is out of range. lf_print_error("RTI received federate ID %d, which is out of range.", fed_id); if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); } send_reject(socket_id, FEDERATE_ID_OUT_OF_RANGE); return -1; } else { + // Find out if it is a new connection or a hot swap. + // Reject if: + // - duplicate of a connected persistent federate + // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that + // particular federate + // - or it is a hot swap, but it is not the execution phase yet // Find out if it is a new connection or a hot swap. 
// Reject if: // - duplicate of a connected persistent federate From 275a0c638243bd7fe6f5db809f312ee4b657851d Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 15 Feb 2024 17:42:09 +0100 Subject: [PATCH 084/148] More on the hot swap mechanism + various fixes --- core/federated/RTI/rti_remote.c | 713 +++++++++++++------------------- 1 file changed, 297 insertions(+), 416 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index eb8894bbb..eb7237601 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1447,11 +1447,12 @@ void* federate_info_thread_TCP(void* fed) { close(my_fed->socket); // from unistd.h // Manual clean, in case of a transient federate if (my_fed->is_transient) { - free_in_transit_message_q(my_fed->in_transit_message_tags); + // FIXME: Aren't there transit messages anymore??? + // free_in_transit_message_q(my_fed->in_transit_message_tags); lf_print("RTI: Transient Federate %d thread exited.", my_fed->enclave.id); // Update the number of connected transient federates - _f_rti->number_of_connected_transient_federates--; + rti_remote->number_of_connected_transient_federates--; // Reset the status of the leaving federate reset_transient_federate(my_fed); @@ -1968,6 +1969,7 @@ static bool authenticate_federate(int* socket) { } #endif +// FIXME: The socket descriptor here (parameter) is not used. Should be removed? void lf_connect_to_persistent_federates(int socket_descriptor) { for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes - rti_remote.number_of_transient_federates; i++) { // Wait for an incoming connection request. @@ -2098,450 +2100,329 @@ void* lf_connect_to_transient_federates_thread(int socket_descriptor) { // The first message from the federate should contain its ID and the federation ID. // The function also detects if a hot swap request is initiated. 
int32_t fed_id = receive_and_check_fed_id_message(&socket_id, (struct sockaddr_in*)&client_fd); - if (fed_id >= 0 && socket_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && - receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { - } - - // // Create a thread to communicate with the federate. - // // This has to be done after clock synchronization is finished - // // or that thread may end up attempting to handle incoming clock - // // synchronization messages. - // federate_info_t *fed = GET_FED_INFO(fed_id); - // lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); - - /** - * @brief A request for immediate stop to the federate - * - * @param fed: the deferate to stop - */ - void send_stop(federate_info_t * fed) { - // Reply with a stop granted to all federates - unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; - outgoing_buffer[0] = MSG_TYPE_STOP; - lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_STOP, fed->enclave.id, NULL); + if (fed_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && + receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { + lf_mutex_lock(&rti_mutex); + if (hot_swap_in_progress) { + lf_print("RTI: Hot swap confirmed for federate %d.", fed_id); + + // Then send STOP + federate_info_t* fed_old = GET_FED_INFO(fed_id); + hot_swap_federate->enclave.completed = fed_old->enclave.completed; + + LF_PRINT_LOG("RTI: Send MSG_TYPE_STOP to old federate %d.", fed_id); + send_stop(fed_old); + lf_mutex_unlock(&rti_mutex); + + // Wait for the old federate to send MSG_TYPE_RESIGN + LF_PRINT_LOG("RTI: Waiting for old federate %d to send resign.", fed_id); + // FIXME: Should this have a timeout? + while (!hot_swap_old_resigned) + ; + + // The latest LTC is the tag at which the old federate resigned. 
This is useful + // for computing the effective_start_time of the new joining federate. + hot_swap_federate->enclave.completed = fed_old->enclave.completed; + + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. + lf_thread_create(&(hot_swap_federate->thread_id), federate_info_thread_TCP, hot_swap_federate); + + // Redirect the federate in rti_remote + rti_remote->base.scheduling_nodes[fed_id] = (scheduling_node_t*)hot_swap_federate; + + // Free the old federate memory and reset the Hot wap indicators + // FIXME: Is this enough to free the memory allocated to the federate? + free(fed_old); + lf_mutex_lock(&rti_mutex); + hot_swap_in_progress = false; + lf_mutex_unlock(&rti_mutex); + + lf_print("RTI: Hot swap succeeded for federate %d.", fed_id); + } else { + lf_mutex_unlock(&rti_mutex); + + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. 
+ federate_info_t* fed = GET_FED_INFO(fed_id); + lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + lf_print("RTI: Transient federate %d joined.", fed_id); } - write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, - "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); + rti_remote->number_of_connected_transient_federates++; + } else { + // If a hot swap was initialed, but the connection information or/and clock + // synchronization fail, then reset hot_swap_in_profress, and free the memory + // allocated for hot_swap_federate + if (hot_swap_in_progress) { + lf_print("RTI: Hot swap canceled for federate %d.", fed_id); + lf_mutex_lock(&rti_mutex); + hot_swap_in_progress = false; + lf_mutex_unlock(&rti_mutex); - LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); + // FIXME: Is this enough to free the memory of a federate_info_t data structure? + free(hot_swap_federate); + } } + } +} - void* lf_connect_to_transient_federates_thread(void* nothing) { - // This loop will continue to accept connections of transient federates, as - // soon as there is room, or enable hot swap - - while (!rti_remote->all_persistent_federates_exited) { - // Continue waiting for an incoming connection requests from transients - // to join, or for hot swap. - // Wait for an incoming connection request. - struct sockaddr client_fd; - uint32_t client_length = sizeof(client_fd); - // The following blocks until a federate connects. - int socket_id = -1; - while (1) { - if (rti_remote->all_persistent_federates_exited) { - return NULL; - } - socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); - if (socket_id >= 0) { - // Got a socket - break; - } else { - // Continue trying - lf_print("RTI failed to accept the socket. %s. Continue trying.", strerror(errno)); - continue; - } - } - -// Wait for the first message from the federate when RTI -a option is on. 
-#ifdef __RTI_AUTH__ - if (rti_remote->authentication_enabled) { - if (!authenticate_federate(&socket_id)) { - lf_print_warning("RTI failed to authenticate the incoming federate."); - // Close the socket. - shutdown(socket_id, SHUT_RDWR); - close(socket_id); - socket_id = -1; - // Ignore the federate that failed authentication. - continue; - } - } -#endif - - // The first message from the federate should contain its ID and the federation ID. - // The function also detects if a hot swap request is initiated. - int32_t fed_id = receive_and_check_fed_id_message(&socket_id, (struct sockaddr_in*)&client_fd); - - if (fed_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && - receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { - LF_MUTEX_LOCK(&rti_mutex); - if (hot_swap_in_progress) { - lf_print("RTI: Hot swap confirmed for federate %d.", fed_id); - - // Then send STOP - federate_info_t* fed_old = GET_FED_INFO(fed_id); - LF_PRINT_LOG("RTI: Send MSG_TYPE_STOP to old federate %d.", fed_id); - send_stop(fed_old); - LF_MUTEX_UNLOCK(&rti_mutex); - - // Wait for the old federate to send MSG_TYPE_RESIGN - LF_PRINT_LOG("RTI: Waiting for old federate %d to send resign.", fed_id); - // FIXME: Should this have a timeout? - while (!hot_swap_old_resigned) - ; - - // The latest LTC is the tag at which the old federate resigned. This is useful - // for computing the effective_start_time of the new joining federate. - hot_swap_federate->enclave.completed = fed_old->enclave.completed; - - // Create a thread to communicate with the federate. - // This has to be done after clock synchronization is finished - // or that thread may end up attempting to handle incoming clock - // synchronization messages. 
- lf_thread_create(&(hot_swap_federate->thread_id), federate_info_thread_TCP, hot_swap_federate); - - LF_MUTEX_LOCK(&rti_mutex); - // Redirect the federate in rti_remote - rti_remote->base.scheduling_nodes[fed_id] = (scheduling_node_t*)hot_swap_federate; - - // Free the old federate memory and reset the Hot wap indicators - pqueue_tag_free(fed_old->in_transit_message_tags); - free(fed_old); - hot_swap_federate = NULL; - hot_swap_in_progress = false; - hot_swap_old_resigned = false; - LF_MUTEX_UNLOCK(&rti_mutex); - lf_print("RTI: Hot swap succeeded for federate %d.", fed_id); - } else { - lf_mutex_unlock(&rti_mutex); - - // Create a thread to communicate with the federate. - // This has to be done after clock synchronization is finished - // or that thread may end up attempting to handle incoming clock - // synchronization messages. - federate_info_t* fed = GET_FED_INFO(fed_id); - lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); - lf_print("RTI: Transient federate %d joined.", fed_id); - } - rti_remote->number_of_connected_transient_federates++; - } else { - // If a hot swap was initialed, but the connection information or/and clock - // synchronization fail, then reset hot_swap_in_profress, and free the memory - // allocated for hot_swap_federate - if (hot_swap_in_progress) { - lf_print("RTI: Hot swap canceled for federate %d.", fed_id); - lf_mutex_lock(&rti_mutex); - hot_swap_in_progress = false; - lf_mutex_unlock(&rti_mutex); - - // FIXME: Is this enough to free the memory of a federate_info_t data structure? - free(hot_swap_federate); - } - } - } +void* respond_to_erroneous_connections(void* nothing) { + initialize_lf_thread_id(); + while (true) { + // Wait for an incoming connection request. + // The following will block until either a federate attempts to connect + // or close(rti->socket_descriptor_TCP) is called. 
+ int socket_id = accept_socket(rti_remote->socket_descriptor_TCP, -1); + if (socket_id < 0) { return NULL; } - - /** - * This thread is responsible for managing the priority queue of delayed grants to be issued. - * It waits until the current time matches the highest priority tag time in the queue. - * If reached, it notifies the grant immediately. If, however, the current time has not yet - * reached the highest priority tag and the queue has been updated (either by inserting or - * canceling an entry), the thread stops waiting and restarts the process again. - */ - void* lf_delayed_grants_thread(void* nothing) { - initialize_lf_thread_id(); - - // Wait for the first condition signal - lf_cond_wait(&updated_delayed_grants); - - while (true) { - if (rti_remote->all_federates_exited) { - break; - } - if (pqueue_delayed_grants_size(rti_remote->delayed_grants) != 0) { - pqueue_delayed_grant_element_t* next; - - // Do not pop, but rather read - next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); - instant_t next_time = next->base.tag.time; - // Wait for expiration, or a signal to stop or terminate. - if (lf_clock_cond_timedwait(&updated_delayed_grants, next_time)) { - // Time reached to send the grant. Do it for delayed grants with the same tag - LF_MUTEX_LOCK(&rti_mutex); - next = pqueue_delayed_grants_pop(rti_remote->delayed_grants); - federate_info_t* fed = GET_FED_INFO(next->fed_id); - if (next->is_provisional) { - notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); - } else { - notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); - } - LF_MUTEX_UNLOCK(&rti_mutex); - } else { - // Waiting was interrupted, because of an update in the queue, or - // because this thread needs to terminate - lf_print("RTI: lf_delayed_grants_thread() did not send grant to %d at " PRINTF_TIME - ", but rather terminated!", - next->fed_id, next_time - start_time); - } - } - } - // The federation is at the shutdown phase. 
All persistent federates exited. - // We can do a sanity check that the delayed_grants queue is empty. - // FIXME: If there are still pending grants, what does that mean? Maybe that the - // federation stopped after a request to stop (not a timeout). Therefore, we need - // cleanup, and free the memory... - // TODO: do it! + if (rti_remote->all_federates_exited) { return NULL; } - void* respond_to_erroneous_connections(void* nothing) { - initialize_lf_thread_id(); - while (true) { - // Wait for an incoming connection request. - // The following will block until either a federate attempts to connect - // or close(rti->socket_descriptor_TCP) is called. - int socket_id = accept_socket(rti_remote->socket_descriptor_TCP, -1); - if (socket_id < 0) { - return NULL; - } - if (rti_remote->all_federates_exited) { - return NULL; - } - - lf_print_error("RTI received an unexpected connection request. Federation is running."); - unsigned char response[2]; - response[0] = MSG_TYPE_REJECT; - response[1] = FEDERATION_ID_DOES_NOT_MATCH; - // Ignore errors on this response. - if (write_to_socket(socket_id, 2, response)) { - lf_print_warning("RTI failed to write FEDERATION_ID_DOES_NOT_MATCH to erroneous incoming connection."); - } - // Close the socket. - shutdown(socket_id, SHUT_RDWR); - close(socket_id); - } - return NULL; + lf_print_error("RTI received an unexpected connection request. Federation is running."); + unsigned char response[2]; + response[0] = MSG_TYPE_REJECT; + response[1] = FEDERATION_ID_DOES_NOT_MATCH; + // Ignore errors on this response. + if (write_to_socket(socket_id, 2, response)) { + lf_print_warning("RTI failed to write FEDERATION_ID_DOES_NOT_MATCH to erroneous incoming connection."); } + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); + } + return NULL; +} + +void initialize_federate(federate_info_t* fed, uint16_t id) { + initialize_scheduling_node(&(fed->enclave), id); + fed->requested_stop = false; + fed->socket = -1; // No socket. 
+ fed->clock_synchronization_enabled = true; + fed->in_transit_message_tags = pqueue_tag_init(10); + strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); + fed->server_ip_addr.s_addr = 0; + fed->server_port = -1; + fed->is_transient = true; + fed->effective_start_tag = NEVER_TAG; + fed->pending_grant = NEVER_TAG; + fed->pending_provisional_grant = NEVER_TAG; +} - void initialize_federate(federate_info_t * fed, uint16_t id) { - initialize_scheduling_node(&(fed->enclave), id); - fed->requested_stop = false; - fed->socket = -1; // No socket. - fed->clock_synchronization_enabled = true; - fed->in_transit_message_tags = pqueue_tag_init(10); - strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); - fed->server_ip_addr.s_addr = 0; - fed->server_port = -1; - fed->is_transient = true; +void reset_transient_federate(federate_info_t* fed) { + fed->enclave.next_event = NEVER_TAG; + fed->enclave.state = NOT_CONNECTED; + // Reset of the federate-related attributes + fed->socket = -1; // No socket. + fed->clock_synchronization_enabled = true; + fed->in_transit_message_tags = pqueue_tag_init(10); + strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); + fed->server_ip_addr.s_addr = 0; + fed->server_port = -1; + fed->requested_stop = false; + fed->is_transient = true; + fed->effective_start_tag = NEVER_TAG; + fed->pending_grant = NEVER_TAG; + fed->pending_provisional_grant = NEVER_TAG; + // FIXME: There is room though to check if the interface has changed??? Do we allow this? 
+} + +int32_t start_rti_server(uint16_t port) { + _lf_initialize_clock(); + // Create the TCP socket server + if (create_server(port, &rti_remote->socket_descriptor_TCP, &rti_remote->final_port_TCP, TCP, true)) { + lf_print_error_system_failure("RTI failed to create TCP server: %s.", strerror(errno)); + }; + lf_print("RTI: Listening for federates."); + // Create the UDP socket server + // Try to get the rti_remote->final_port_TCP + 1 port + if (rti_remote->clock_sync_global_status >= clock_sync_on) { + if (create_server(rti_remote->final_port_TCP + 1, &rti_remote->socket_descriptor_UDP, &rti_remote->final_port_UDP, + UDP, true)) { + lf_print_error_system_failure("RTI failed to create UDP server: %s.", strerror(errno)); } + } + return rti_remote->socket_descriptor_TCP; +} - int32_t start_rti_server(uint16_t port) { - _lf_initialize_clock(); - // Create the TCP socket server - if (create_server(port, &rti_remote->socket_descriptor_TCP, &rti_remote->final_port_TCP, TCP, true)) { - lf_print_error_system_failure("RTI failed to create TCP server: %s.", strerror(errno)); - }; - lf_print("RTI: Listening for federates."); - // Create the UDP socket server - // Try to get the rti_remote->final_port_TCP + 1 port - if (rti_remote->clock_sync_global_status >= clock_sync_on) { - if (create_server(rti_remote->final_port_TCP + 1, &rti_remote->socket_descriptor_UDP, - &rti_remote->final_port_UDP, UDP, true)) { - lf_print_error_system_failure("RTI failed to create UDP server: %s.", strerror(errno)); - } +/** + * Iterate over the federates and sets 'has_upstream_transient_federates'. + * Once done, check that no transient federate has an upstream transient federate, + * and compute the number of persistent federates that do have upstream transients, + * which is the maximun number of delayed grants that can be pending at the same time. + * This is useful for initialyzing the queue of delayed grants. 
+ + * @return -1, if there is more than one level of transiency, else, the number of + * persistents that have an upstream transient + */ +static int set_has_upstream_transient_federates_parameter_and_check() { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + for (int j = 0; j < fed->enclave.num_upstream; j++) { + federate_info_t* upstream_fed = GET_FED_INFO(fed->enclave.upstream[j]); + if (upstream_fed->is_transient) { + fed->has_upstream_transient_federates = true; + break; } - return rti_remote->socket_descriptor_TCP; } + } - /** - * Iterate over the federates and sets 'has_upstream_transient_federates'. - * Once done, check that no transient federate has an upstream transient federate, - * and compute the number of persistent federates that do have upstream transients, - * which is the maximun number of delayed grants that can be pending at the same time. - * This is useful for initialyzing the queue of delayed grants. - - * @return -1, if there is more than one level of transiency, else, the number of - * persistents that have an upstream transient - */ - static int set_has_upstream_transient_federates_parameter_and_check() { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - for (int j = 0; j < fed->enclave.num_upstream; j++) { - federate_info_t* upstream_fed = GET_FED_INFO(fed->enclave.upstream[j]); - if (upstream_fed->is_transient) { - fed->has_upstream_transient_federates = true; - break; - } - } - } + // Now check that no transient has an upstream transient + // FIXME: Do we really need this? Or should it be the job of the validator? 
+ int max_number_of_delayed_grants = 0; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->is_transient && fed->has_upstream_transient_federates) { + return -1; + } + if (!fed->is_transient && fed->has_upstream_transient_federates) { + max_number_of_delayed_grants++; + } + } - // Now check that no transient has an upstream transient - // FIXME: Do we really need this? Or should it be the job of the validator? - int max_number_of_delayed_grants = 0; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (fed->is_transient && fed->has_upstream_transient_federates) { - return -1; - } - if (!fed->is_transient && fed->has_upstream_transient_federates) { - max_number_of_delayed_grants++; - } - } + return max_number_of_delayed_grants; +} - return max_number_of_delayed_grants; +void wait_for_federates(int socket_descriptor) { + // Wait for connections from persistent federates and create a thread for each. + lf_connect_to_persistent_federates(socket_descriptor); + // Wait for connections from persistent federates and create a thread for each. + lf_connect_to_persistent_federates(socket_descriptor); + + // Set has_upstream_transient_federates parameter in all federates and check + // that there is no more than one level of transiency + if (rti_remote->number_of_transient_federates > 0) { + int max_number_of_pending_grants = set_has_upstream_transient_federates_parameter_and_check(); + if (max_number_of_pending_grants == -1) { + lf_print_error_and_exit("RTI: Transient federates cannot have transient upstreams!"); } + rti_remote->delayed_grants = pqueue_delayed_grants_init(max_number_of_pending_grants); + } - void wait_for_federates(int socket_descriptor) { - // Wait for connections from persistent federates and create a thread for each. 
- lf_connect_to_persistent_federates(socket_descriptor); - // Wait for connections from persistent federates and create a thread for each. - lf_connect_to_persistent_federates(socket_descriptor); - - // Set has_upstream_transient_federates parameter in all federates and check - // that there is no more than one level of transiency - if (rti_remote->number_of_transient_federates > 0) { - int max_number_of_pending_grants = set_has_upstream_transient_federates_parameter_and_check(); - if (max_number_of_pending_grants == -1) { - lf_print_error_and_exit("RTI: Transient federates cannot have transient upstreams!"); - } - rti_remote->delayed_grants = pqueue_delayed_grants_init(max_number_of_pending_grants); - } - - // All persistent federates have connected. - lf_print("RTI: All expected persistent federates have connected. Starting execution."); - if (rti_remote->number_of_transient_federates > 0) { - lf_print("RTI: Transient Federates can join and leave the federation at anytime."); - } + // All persistent federates have connected. + lf_print("RTI: All expected persistent federates have connected. Starting execution."); + if (rti_remote->number_of_transient_federates > 0) { + lf_print("RTI: Transient Federates can join and leave the federation at anytime."); + } - // The socket server will only continue to accept connections from transient - // federates. - // In case some other federation's federates are trying to join the wrong - // federation, need to respond. Start a separate thread to do that. - lf_thread_t responder_thread; - lf_thread_t transient_thread; - lf_thread_t delayed_grants_thread; - - // If the federation does not include transient federates, then respond to - // erronous connections. Otherwise, continue to accept transients joining and - // respond to duplicate joing requests. 
- if (rti_remote->number_of_transient_federates == 0) { - lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); - } else if (rti_remote->number_of_transient_federates > 0) { - lf_thread_create(&transient_thread, lf_connect_to_transient_federates_thread, NULL); - lf_thread_create(&delayed_grants_thread, lf_delayed_grants_thread, NULL); - } + // The socket server will only continue to accept connections from transient + // federates. + // In case some other federation's federates are trying to join the wrong + // federation, need to respond. Start a separate thread to do that. + lf_thread_t responder_thread; + lf_thread_t transient_thread; + lf_thread_t delayed_grants_thread; - // Wait for persistent federate threads to exit. - void* thread_exit_status; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (!fed->is_transient) { - lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); - lf_thread_join(fed->thread_id, &thread_exit_status); - pqueue_tag_free(fed->in_transit_message_tags); - lf_print("RTI: Persistent federate %d thread exited.", fed->enclave.id); - } - } + // If the federation does not include transient federates, then respond to + // erronous connections. Otherwise, continue to accept transients joining and + // respond to duplicate joing requests. + if (rti_remote->number_of_transient_federates == 0) { + lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); + } else if (rti_remote->number_of_transient_federates > 0) { + lf_thread_create(&transient_thread, lf_connect_to_transient_federates_thread, NULL); + lf_thread_create(&delayed_grants_thread, lf_delayed_grants_thread, NULL); + } - rti_remote->all_persistent_federates_exited = true; - rti_remote->phase = shutdown_phase; - lf_print("RTI: All persistent threads exited."); - - // Wait for transient federate threads to exit, if any. 
- if (rti_remote->number_of_transient_federates > 0) { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (fed->is_transient) { - lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); - lf_thread_join(fed->thread_id, &thread_exit_status); - pqueue_tag_free(fed->in_transit_message_tags); - lf_print("RTI: Transient federate %d thread exited.", fed->enclave.id); - } - } - } + // Wait for persistent federate threads to exit. + void* thread_exit_status; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (!fed->is_transient) { + lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); + lf_thread_join(fed->thread_id, &thread_exit_status); + pqueue_tag_free(fed->in_transit_message_tags); + lf_print("RTI: Persistent federate %d thread exited.", fed->enclave.id); + } + } - rti_remote->all_federates_exited = true; + rti_remote->all_persistent_federates_exited = true; + rti_remote->phase = shutdown_phase; + lf_print("RTI: All persistent threads exited."); - // Shutdown and close the socket that is listening for incoming connections - // so that the accept() call in respond_to_erroneous_connections returns. - // That thread should then check rti->all_federates_exited and it should exit. - if (shutdown(socket_descriptor, SHUT_RDWR)) { - LF_PRINT_LOG("On shut down TCP socket, received reply: %s", strerror(errno)); - } - // NOTE: In all common TCP/IP stacks, there is a time period, - // typically between 30 and 120 seconds, called the TIME_WAIT period, - // before the port is released after this close. This is because - // the OS is preventing another program from accidentally receiving - // duplicated packets intended for this program. 
- close(socket_descriptor); - - if (rti_remote->socket_descriptor_UDP > 0) { - if (shutdown(rti_remote->socket_descriptor_UDP, SHUT_RDWR)) { - LF_PRINT_LOG("On shut down UDP socket, received reply: %s", strerror(errno)); - } - close(rti_remote->socket_descriptor_UDP); + // Wait for transient federate threads to exit, if any. + if (rti_remote->number_of_transient_federates > 0) { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->is_transient) { + lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); + lf_thread_join(fed->thread_id, &thread_exit_status); + pqueue_tag_free(fed->in_transit_message_tags); + lf_print("RTI: Transient federate %d thread exited.", fed->enclave.id); } } + } - void initialize_RTI(rti_remote_t * rti) { - rti_remote = rti; - - // Initialize thread synchronization primitives - LF_MUTEX_INIT(&rti_mutex); - LF_COND_INIT(&received_start_times, &rti_mutex); - LF_COND_INIT(&sent_start_time, &rti_mutex); - LF_COND_INIT(&updated_delayed_grants, &rti_mutex); - - initialize_rti_common(&rti_remote->base); - rti_remote->base.mutex = &rti_mutex; - - // federation_rti related initializations - rti_remote->max_start_time = 0LL; - rti_remote->num_feds_proposed_start = 0; - rti_remote->all_federates_exited = false; - rti_remote->all_persistent_federates_exited = false; - rti_remote->federation_id = "Unidentified Federation"; - rti_remote->user_specified_port = 0; - rti_remote->final_port_TCP = 0; - rti_remote->socket_descriptor_TCP = -1; - rti_remote->final_port_UDP = UINT16_MAX; - rti_remote->socket_descriptor_UDP = -1; - rti_remote->clock_sync_global_status = clock_sync_init; - rti_remote->clock_sync_period_ns = MSEC(10); - rti_remote->clock_sync_exchanges_per_interval = 10; - rti_remote->authentication_enabled = false; - rti_remote->base.tracing_enabled = false; - rti_remote->stop_in_progress = false; - rti_remote->number_of_transient_federates = 0; - 
rti_remote->phase = startup_phase; - } + rti_remote->all_federates_exited = true; - // The RTI includes clock.c, which requires the following functions that are defined - // in clock-sync.c. But clock-sync.c is not included in the standalone RTI. - // Provide empty implementations of these functions. - void clock_sync_add_offset(instant_t * t) { (void)t; } - void clock_sync_subtract_offset(instant_t * t) { (void)t; } - - void free_scheduling_nodes(scheduling_node_t * *scheduling_nodes, uint16_t number_of_scheduling_nodes) { - for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) { - scheduling_node_t* node = scheduling_nodes[i]; - if (node->upstream != NULL) { - free(node->upstream); - free(node->upstream_delay); - } - if (node->min_delays != NULL) { - free(node->min_delays); - } - if (node->downstream != NULL) { - free(node->downstream); - } - free(node); - } - free(scheduling_nodes); + // Shutdown and close the socket that is listening for incoming connections + // so that the accept() call in respond_to_erroneous_connections returns. + // That thread should then check rti->all_federates_exited and it should exit. + if (shutdown(socket_descriptor, SHUT_RDWR)) { + LF_PRINT_LOG("On shut down TCP socket, received reply: %s", strerror(errno)); + } + // NOTE: In all common TCP/IP stacks, there is a time period, + // typically between 30 and 120 seconds, called the TIME_WAIT period, + // before the port is released after this close. This is because + // the OS is preventing another program from accidentally receiving + // duplicated packets intended for this program. 
+ close(socket_descriptor); + + if (rti_remote->socket_descriptor_UDP > 0) { + if (shutdown(rti_remote->socket_descriptor_UDP, SHUT_RDWR)) { + LF_PRINT_LOG("On shut down UDP socket, received reply: %s", strerror(errno)); } + close(rti_remote->socket_descriptor_UDP); + } +} + +void initialize_RTI(rti_remote_t* rti) { + rti_remote = rti; + + // Initialize thread synchronization primitives + LF_MUTEX_INIT(&rti_mutex); + LF_COND_INIT(&received_start_times, &rti_mutex); + LF_COND_INIT(&sent_start_time, &rti_mutex); + LF_COND_INIT(&updated_delayed_grants, &rti_mutex); + + initialize_rti_common(&rti_remote->base); + rti_remote->base.mutex = &rti_mutex; + + // federation_rti related initializations + rti_remote->max_start_time = 0LL; + rti_remote->num_feds_proposed_start = 0; + rti_remote->all_federates_exited = false; + rti_remote->federation_id = "Unidentified Federation"; + rti_remote->user_specified_port = 0; + rti_remote->final_port_TCP = 0; + rti_remote->socket_descriptor_TCP = -1; + rti_remote->final_port_UDP = UINT16_MAX; + rti_remote->socket_descriptor_UDP = -1; + rti_remote->clock_sync_global_status = clock_sync_init; + rti_remote->clock_sync_period_ns = MSEC(10); + rti_remote->clock_sync_exchanges_per_interval = 10; + rti_remote->authentication_enabled = false; + rti_remote->base.tracing_enabled = false; + rti_remote->stop_in_progress = false; + rti_remote->number_of_transient_federates = 0; + rti_remote->phase = startup_phase; +} + +void free_scheduling_nodes(scheduling_node_t** scheduling_nodes, uint16_t number_of_scheduling_nodes) { + for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) { + // FIXME: Gives error freeing memory not allocated!!!! 
+ scheduling_node_t* node = scheduling_nodes[i]; + if (node->upstream != NULL) + free(node->upstream); + if (node->downstream != NULL) + free(node->downstream); + } + free(scheduling_nodes); +} #endif // STANDALONE_RTI From 148d84dc2386bdb9377700093266a3a1bed62b9e Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 15 Feb 2024 21:19:15 +0100 Subject: [PATCH 085/148] RTI can sent a request for immediate stop. This is required by the hot swap mechanism --- core/federated/RTI/rti_remote.c | 42 ++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index eb7237601..120e7bd9a 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -2052,7 +2052,47 @@ void lf_connect_to_persistent_federates(int socket_descriptor) { } } -void* lf_connect_to_transient_federates_thread(int socket_descriptor) { +/** + * @brief A request for immediate stop to the federate + * + * @param fed: the deferate to stop + */ +void send_stop(federate_info_t* fed) { + // Reply with a stop granted to all federates + unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; + outgoing_buffer[0] = MSG_TYPE_STOP; + lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); + + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP, fed->enclave.id, NULL); + } + write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, + "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); + + LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); +} + +/** + * @brief A request for immediate stop to the federate + * + * @param fed: the deferate to stop + */ +void send_stop(federate_info_t* fed) { + // Reply with a stop granted to all federates + unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; + outgoing_buffer[0] = MSG_TYPE_STOP; + lf_print("RTI sent 
MSG_TYPE_STOP to federate %d.", fed->enclave.id); + + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP, fed->enclave.id, NULL); + } + write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, + "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); + + LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); +} + +void* lf_connect_to_transient_federates_thread(void* nothing) { // This loop will continue to accept connections of transient federates, as // soon as there is room, or enable hot swap From 03705b3cd268079641e6733ec4f91d8780159172 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Fri, 16 Feb 2024 12:44:26 +0100 Subject: [PATCH 086/148] A federate can get its id, its effective_start_time, and the start_time of the federation. This is particularly useful for testing. --- include/core/federated/federate.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index 604a36637..ef5751322 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -545,4 +545,21 @@ bool lf_update_max_level(tag_t tag, bool is_provisional); instant_t lf_wait_until_time(tag_t tag); #endif // FEDERATED_DECENTRALIZED +/** + * @brief Returns the federation id. + * + * This function is useful for creating federates on runtime. + */ +char* lf_get_federation_id(); + +/** + * @brief Returns the effective start time of the federate. The start_time of persistent + * federates is equal to their effective_start_time. Transient federates, however, + * have their effective_start_time higher or equal to their start_time. + */ +instant_t lf_get_effective_start_time(); + +/** @brief Returns the start time of the federate. 
*/ +instant_t lf_get_start_time(); + #endif // FEDERATE_H From 1dafa302dcf21c791ad3a475a8e049c8c6a8001d Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 19 Feb 2024 10:33:20 +0100 Subject: [PATCH 087/148] Fix the use of lf_stop() --- core/threaded/reactor_threaded.c | 4 +++- include/core/federated/federate.h | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index ef6634b0b..c6a1beaee 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -588,7 +588,9 @@ void _lf_initialize_start_tag(environment_t* env) { // If we have a non-zero STA offset, then we need to allow messages to arrive // at the start time. To avoid spurious STP violations, we temporarily // set the current time back by the STA offset. - env->current_tag.time -= lf_fed_STA_offset; + env->current_tag = + (tag_t){.time = effective_start_tag.time - lf_fed_STA_offset, .microstep = effective_start_tag.microstep}; + LF_PRINT_LOG("Waiting for start time " PRINTF_TIME " plus STA " PRINTF_TIME ".", start_time, lf_fed_STA_offset); #else // For other than federated decentralized execution, there is no lf_fed_STA_offset variable defined. diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index ef5751322..d77a6731a 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -530,6 +530,16 @@ void lf_synchronize_with_other_federates(); */ bool lf_update_max_level(tag_t tag, bool is_provisional); +/** + * @brief Stop the execution of a federate. + * Every enclave within the federate will stop at one microstep later than its + * current tag. Unlike lf_request_stop(), this process does not require any + * involvement from the RTI, nor does it necessitate any consensus. + * + * This function is particularly useful for testing transient federates. 
+ */ +void lf_stop(); + #ifdef FEDERATED_DECENTRALIZED /** * @brief Return the physical time that we should wait until before advancing to the specified tag. @@ -547,14 +557,14 @@ instant_t lf_wait_until_time(tag_t tag); /** * @brief Returns the federation id. - * + * * This function is useful for creating federates on runtime. */ char* lf_get_federation_id(); /** * @brief Returns the effective start time of the federate. The start_time of persistent - * federates is equal to their effective_start_time. Transient federates, however, + * federates is equal to their effective_start_time. Transient federates, however, * have their effective_start_time higher or equal to their start_time. */ instant_t lf_get_effective_start_time(); From 070302b67637af17166c422abb2ca3fac81c3c1f Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Tue, 20 Feb 2024 13:42:10 +0100 Subject: [PATCH 088/148] Attemp to fix the hot swap mechanism --- core/federated/RTI/rti_remote.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 120e7bd9a..5afd0d8d2 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1481,6 +1481,7 @@ void send_reject(int* socket_id, unsigned char error_code) { *socket_id = -1; LF_MUTEX_UNLOCK(&rti_mutex); } +lf_print("handle_timestamp for transient 1157"); /** * Listen for a MSG_TYPE_FED_IDS message, which includes as a payload @@ -2143,7 +2144,7 @@ void* lf_connect_to_transient_federates_thread(void* nothing) { if (fed_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(&rti_mutex); if (hot_swap_in_progress) { lf_print("RTI: Hot swap confirmed for federate %d.", fed_id); @@ -2153,7 +2154,7 @@ void* lf_connect_to_transient_federates_thread(void* nothing) { LF_PRINT_LOG("RTI: Send MSG_TYPE_STOP to old federate 
%d.", fed_id); send_stop(fed_old); - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); // Wait for the old federate to send MSG_TYPE_RESIGN LF_PRINT_LOG("RTI: Waiting for old federate %d to send resign.", fed_id); From 9429cf7c272a4a94a7e870ec50ee93b91c213eb9 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 29 Feb 2024 15:57:49 +0100 Subject: [PATCH 089/148] Code review: remove redundancy and add has_transient_upstream_federats --- core/federated/RTI/rti_remote.c | 3904 ++++++++++++++++--------------- 1 file changed, 1988 insertions(+), 1916 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 5afd0d8d2..fbd79e0ca 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -235,6 +235,40 @@ static int get_num_absent_upstream_transients(federate_info_t* fed) { return num_absent_upstream_transients; } +/** + * Find the number of non connected upstream transients + * @param fed The federate + * @return the number of non connected upstream transients + */ +static int get_num_absent_upstream_transients(federate_info_t* fed) { + int num_absent_upstream_transients = 0; + for (int j = 0; j < fed->enclave.num_upstream; j++) { + federate_info_t* upstream = GET_FED_INFO(fed->enclave.upstream[j]); + // Do Ignore this enclave if it no longer connected. + if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { + num_absent_upstream_transients++; + } + } + return num_absent_upstream_transients; +} + +/** + * Find the number of non connected upstream transients + * @param fed The federate + * @return the number of non connected upstream transients + */ +static int get_num_absent_upstream_transients(federate_info_t* fed) { + int num_absent_upstream_transients = 0; + for (int j = 0; j < fed->enclave.num_upstream; j++) { + federate_info_t* upstream = GET_FED_INFO(fed->enclave.upstream[j]); + // Do Ignore this enclave if it no longer connected. 
+ if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { + num_absent_upstream_transients++; + } + } + return num_absent_upstream_transients; +} + /** * Notify a tag advance grant (TAG) message to the specified federate immediately. * @@ -300,2170 +334,2208 @@ void* pending_grant_thread(void* federate) { * If a provisionl tag advance grant is pending, cancel it. If there is another * pending tag advance grant, do not proceed with the thread creation. * - * @param e The enclave. + * @param fed The federate. * @param tag The tag to grant. */ void notify_tag_advance_grant_delayed(scheduling_node_t* e, tag_t tag) { federate_info_t* fed = GET_FED_INFO(e->id); - - // Check wether there is already a pending grant - // And check the pending provisional grant as well - lf_mutex_lock(&rti_mutex); - if (lf_tag_compare(fed->pending_grant, NEVER_TAG) == 0) { - // If a tag is issued, then stop any possible provisional tag grant - fed->pending_grant = tag; - fed->pending_provisional_grant = NEVER_TAG; - lf_thread_create(&(fed->pending_grant_thread_id), pending_grant_thread, fed); - } else { - // If there is already a pending tag grant, then let it be sent first - // FIXME: Is this correct? - } - lf_mutex_unlock(&rti_mutex); -} - -void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { - if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || - lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { - return; - } - // Need to make sure that the destination federate's thread has already - // sent the starting MSG_TYPE_TIMESTAMP message. - while (e->state == PENDING) { - // Need to wait here. 
- lf_cond_wait(&sent_start_time); + static void notify_tag_advance_grant_delayed(scheduling_node_t * e, tag_t tag) { + federate_info_t* fed = (federate_info_t*)GET_FED_INFO(e->id); + + // Check wether there is already a pending grant + // And check the pending provisional grant as well + lf_mutex_lock(&rti_mutex); + if (lf_tag_compare(fed->pending_grant, NEVER_TAG) == 0) { + // If a tag is issued, then stop any possible provisional tag grant + fed->pending_grant = tag; + fed->pending_provisional_grant = NEVER_TAG; + lf_thread_create(&(fed->pending_grant_thread_id), pending_grant_thread, fed); + } else { + // If there is already a pending tag grant, then let it be sent first + // FIXME: Is this correct? + } + lf_mutex_unlock(&rti_mutex); } - // Check if sending the tag advance grant needs to be delayed or not - // Delay is needed when a federate has, at least one, absent upstream transient - int num_absent_upstram_transients = 0; - for (int j = 0; j < e->num_upstream; j++) { - federate_info_t* upstream = GET_FED_INFO(e->upstream[j]); - // Do Ignore this enclave if it no longer connected. - if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { - num_absent_upstram_transients++; - break; + void notify_tag_advance_grant(scheduling_node_t * e, tag_t tag) { + if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || + lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { + return; + } + // Need to make sure that the destination federate's thread has already + // sent the starting MSG_TYPE_TIMESTAMP message. + while (e->state == PENDING) { + // Need to wait here. + lf_cond_wait(&sent_start_time); } - } - if (num_absent_upstram_transients > 0) { - notify_tag_advance_grant_delayed(e, tag); - } else { - notify_tag_advance_grant_immediate(e, tag); - } -} -/** - * Notify a provisional tag advance grant (PTAG) message to the specified federate - * immediately. 
- * - * This function will keep a record of this TAG in the enclave's last_provisionally_granted - * field. - * - * @param e The scheduling node. - * @param tag The tag to grant. - */ -void notify_provisional_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) { - size_t message_length = 1 + sizeof(int64_t) + sizeof(uint32_t); - unsigned char buffer[message_length]; - buffer[0] = MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT; - encode_int64(tag.time, &(buffer[1])); - encode_int32((int32_t)tag.microstep, &(buffer[1 + sizeof(int64_t)])); + // Check if sending the tag advance grant needs to be delayed or not. + // Delay is needed when a federate has at least one absent upstream transient. - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_PTAG, e->id, &tag); - } - // This function is called in notify_advance_grant_if_safe(), which is a long - // function. During this call, the socket might close, causing the following write_to_socket - // to fail. Consider a failure here a soft failure and update the federate's status. - if (write_to_socket(((federate_info_t*)e)->socket, message_length, buffer)) { - lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); - e->state = NOT_CONNECTED; - } else { - e->last_provisionally_granted = tag; - LF_PRINT_LOG("RTI sent to federate %d the Provisional Tag Advance Grant (PTAG) " PRINTF_TAG ".", e->id, - tag.time - start_time, tag.microstep); - - // Send PTAG to all upstream federates, if they have not had - // a later or equal PTAG or TAG sent previously and if their transitive - // NET is greater than or equal to the tag. - // This is needed to stimulate absent messages from upstream and break deadlocks. - // The scenario this deals with is illustrated in `test/C/src/federated/FeedbackDelay2.lf` - // and `test/C/src/federated/FeedbackDelay4.lf`. - // Note that this is transitive. - // NOTE: This is not needed for enclaves because zero-delay loops are prohibited. 
- // It's only needed for federates, which is why this is implemented here. - for (int j = 0; j < e->num_upstream; j++) { - scheduling_node_t* upstream = rti_remote->base.scheduling_nodes[e->upstream[j]]; - - // Ignore this federate if it has resigned. - if (upstream->state == NOT_CONNECTED) - continue; - - tag_t earliest = earliest_future_incoming_message_tag(upstream); - tag_t strict_earliest = eimt_strict(upstream); // Non-ZDC version. - - // If these tags are equal, then a TAG or PTAG should have already been granted, - // in which case, another will not be sent. But it may not have been already granted. - if (lf_tag_compare(earliest, tag) > 0) { - notify_tag_advance_grant(upstream, tag); - } else if (lf_tag_compare(earliest, tag) == 0 && lf_tag_compare(strict_earliest, tag) > 0) { - notify_provisional_tag_advance_grant(upstream, tag); + // Check if sending the tag advance grant needs to be delayed or not + // Delay is needed when a federate has, at least one, absent upstream transient + federate_info_t* fed = GET_FED_INFO(e->id); + if (!fed->has_upstream_transient_federates) { + notify_tag_advance_grant_immediate(e, tag); + } else { + if (get_num_absent_upstream_transients(fed) > 0) { + notify_tag_advance_grant_delayed(fed, tag); + } else { + notify_tag_advance_grant_immediate(e, tag); } } } -} -void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { - if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || - lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { - return; - } - // Need to make sure that the destination federate's thread has already - // sent the starting MSG_TYPE_TIMESTAMP message. - while (e->state == PENDING) { - // Need to wait here. - lf_cond_wait(&sent_start_time); - } + /** + * Notify a provisional tag advance grant (PTAG) message to the specified federate + * immediately. + * + * This function will keep a record of this TAG in the enclave's last_provisionally_granted + * field. 
+ * + * @param e The scheduling node. + * @param tag The tag to grant. + */ + void notify_provisional_tag_advance_grant_immediate(scheduling_node_t * e, tag_t tag) { + size_t message_length = 1 + sizeof(int64_t) + sizeof(uint32_t); + unsigned char buffer[message_length]; + buffer[0] = MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT; + encode_int64(tag.time, &(buffer[1])); + encode_int32((int32_t)tag.microstep, &(buffer[1 + sizeof(int64_t)])); - // Check if sending the tag advance grant needs to be delayed or not - // Delay is needed when a federate has, at least one, absent upstream transient - federate_info_t* fed = GET_FED_INFO(e->id); - if (!fed->has_upstream_transient_federates) { - notify_provisional_tag_advance_grant_immediate(e, tag); - } else { - if (get_num_absent_upstream_transients(fed) > 0) { - notify_grant_delayed(fed, tag, true); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_PTAG, e->id, &tag); + } + // This function is called in notify_advance_grant_if_safe(), which is a long + // function. During this call, the socket might close, causing the following write_to_socket + // to fail. Consider a failure here a soft failure and update the federate's status. + if (write_to_socket(((federate_info_t*)e)->socket, message_length, buffer)) { + lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); + e->state = NOT_CONNECTED; } else { - notify_provisional_tag_advance_grant_immediate(e, tag); + e->last_provisionally_granted = tag; + LF_PRINT_LOG("RTI sent to federate %d the Provisional Tag Advance Grant (PTAG) " PRINTF_TAG ".", e->id, + tag.time - start_time, tag.microstep); + + // Send PTAG to all upstream federates, if they have not had + // a later or equal PTAG or TAG sent previously and if their transitive + // NET is greater than or equal to the tag. + // This is needed to stimulate absent messages from upstream and break deadlocks. 
+ // The scenario this deals with is illustrated in `test/C/src/federated/FeedbackDelay2.lf` + // and `test/C/src/federated/FeedbackDelay4.lf`. + // Note that this is transitive. + // NOTE: This is not needed for enclaves because zero-delay loops are prohibited. + // It's only needed for federates, which is why this is implemented here. + for (int j = 0; j < e->num_upstream; j++) { + scheduling_node_t* upstream = rti_remote->base.scheduling_nodes[e->upstream[j]]; + + // Ignore this federate if it has resigned. + if (upstream->state == NOT_CONNECTED) + continue; + + tag_t earliest = earliest_future_incoming_message_tag(upstream); + tag_t strict_earliest = eimt_strict(upstream); // Non-ZDC version. + + // If these tags are equal, then a TAG or PTAG should have already been granted, + // in which case, another will not be sent. But it may not have been already granted. + if (lf_tag_compare(earliest, tag) > 0) { + notify_tag_advance_grant(upstream, tag); + } else if (lf_tag_compare(earliest, tag) == 0 && lf_tag_compare(strict_earliest, tag) > 0) { + notify_provisional_tag_advance_grant(upstream, tag); + } + } } } -} -void update_federate_next_event_tag_locked(uint16_t federate_id, tag_t next_event_tag) { - federate_info_t* fed = GET_FED_INFO(federate_id); - tag_t min_in_transit_tag = pqueue_tag_peek_tag(fed->in_transit_message_tags); - if (lf_tag_compare(min_in_transit_tag, next_event_tag) < 0) { - next_event_tag = min_in_transit_tag; + /** + * Notify a provisional tag advance grant (PTAG) message to the specified federate + * after the physical time reaches the tag. A thread is created to this end. + * + * If a tag advance grant or a provisional one is pending, then do not proceed + * with the thread creation. + * + * @param e The scheduling node. + * @param tag The provisional tag to grant. 
+ */ + static void notify_provisional_tag_advance_grant_delayed(scheduling_node_t * e, tag_t tag) { + federate_info_t* fed = (federate_info_t*)e; + + // Proceed with the delayed provisional tag grant notification only if + // there is no pending grant and no provisional pending grant + lf_mutex_lock(&rti_mutex); + if ((lf_tag_compare(fed->pending_grant, NEVER_TAG) == 0) && + (lf_tag_compare(fed->pending_provisional_grant, NEVER_TAG) >= 0)) { + fed->pending_provisional_grant = tag; + lf_thread_create(&(fed->pending_provisional_grant_thread_id), pending_provisional_grant_thread, fed); + } + lf_mutex_unlock(&rti_mutex); } - update_scheduling_node_next_event_tag_locked(&(fed->enclave), next_event_tag); -} -void handle_port_absent_message(federate_info_t* sending_federate, unsigned char* buffer) { - size_t message_size = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int64_t) + sizeof(uint32_t); + void notify_provisional_tag_advance_grant(scheduling_node_t * e, tag_t tag) { + if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || + lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { + return; + } + // Need to make sure that the destination federate's thread has already + // sent the starting MSG_TYPE_TIMESTAMP message. + while (e->state == PENDING) { + // Need to wait here. 
+ lf_cond_wait(&sent_start_time); + } - read_from_socket_fail_on_error(&sending_federate->socket, message_size, &(buffer[1]), NULL, - " RTI failed to read port absent message from federate %u.", - sending_federate->enclave.id); + // Check if sending the tag advance grant needs to be delayed or not + // Delay is needed when a federate has, at least one, absent upstream transient + federate_info_t* fed = GET_FED_INFO(e->id); + if (!fed->has_upstream_transient_federates) { + notify_provisional_tag_advance_grant_immediate(e, tag); + } else { + if (get_num_absent_upstream_transients(fed) > 0) { + notify_provisional_tag_advance_grant_delayed(fed, tag); + } else { + notify_provisional_tag_advance_grant_immediate(e, tag); + } + federate_info_t* fed = GET_FED_INFO(e->id); + if (!fed->has_upstream_transient_federates) { + notify_provisional_tag_advance_grant_immediate(e, tag); + } else { + if (get_num_absent_upstream_transients(fed) > 0) { + notify_grant_delayed(fed, tag, true); + } else { + notify_provisional_tag_advance_grant_immediate(e, tag); + } + } + } - uint16_t reactor_port_id = extract_uint16(&(buffer[1])); - uint16_t federate_id = extract_uint16(&(buffer[1 + sizeof(uint16_t)])); - tag_t tag = extract_tag(&(buffer[1 + 2 * sizeof(uint16_t)])); + void update_federate_next_event_tag_locked(uint16_t federate_id, tag_t next_event_tag) { + federate_info_t* fed = GET_FED_INFO(federate_id); + tag_t min_in_transit_tag = pqueue_tag_peek_tag(fed->in_transit_message_tags); + if (lf_tag_compare(min_in_transit_tag, next_event_tag) < 0) { + next_event_tag = min_in_transit_tag; + } + update_scheduling_node_next_event_tag_locked(&(fed->enclave), next_event_tag); + } - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_PORT_ABS, sending_federate->enclave.id, &tag); - } + void handle_port_absent_message(federate_info_t * sending_federate, unsigned char* buffer) { + size_t message_size = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int64_t) + 
sizeof(uint32_t); - // Need to acquire the mutex lock to ensure that the thread handling - // messages coming from the socket connected to the destination does not - // issue a TAG before this message has been forwarded. - LF_MUTEX_LOCK(&rti_mutex); + read_from_socket_fail_on_error(&sending_federate->socket, message_size, &(buffer[1]), NULL, + " RTI failed to read port absent message from federate %u.", + sending_federate->enclave.id); - // If the destination federate is no longer connected, issue a warning - // and return. - federate_info_t* fed = GET_FED_INFO(federate_id); - if (fed->enclave.state == NOT_CONNECTED) { - LF_MUTEX_UNLOCK(&rti_mutex); - lf_print_warning("RTI: Destination federate %d is no longer connected. Dropping message.", federate_id); - LF_PRINT_LOG("Fed status: next_event " PRINTF_TAG ", " - "completed " PRINTF_TAG ", " - "last_granted " PRINTF_TAG ", " - "last_provisionally_granted " PRINTF_TAG ".", - fed->enclave.next_event.time - start_time, fed->enclave.next_event.microstep, - fed->enclave.completed.time - start_time, fed->enclave.completed.microstep, - fed->enclave.last_granted.time - start_time, fed->enclave.last_granted.microstep, - fed->enclave.last_provisionally_granted.time - start_time, - fed->enclave.last_provisionally_granted.microstep); - return; - } + uint16_t reactor_port_id = extract_uint16(&(buffer[1])); + uint16_t federate_id = extract_uint16(&(buffer[1 + sizeof(uint16_t)])); + tag_t tag = extract_tag(&(buffer[1 + 2 * sizeof(uint16_t)])); - LF_PRINT_LOG("RTI forwarding port absent message for port %u to federate %u.", reactor_port_id, federate_id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_PORT_ABS, sending_federate->enclave.id, &tag); + } - // Need to make sure that the destination federate's thread has already - // sent the starting MSG_TYPE_TIMESTAMP message. - while (fed->enclave.state == PENDING) { - // Need to wait here. 
- lf_cond_wait(&sent_start_time); - } + // Need to acquire the mutex lock to ensure that the thread handling + // messages coming from the socket connected to the destination does not + // issue a TAG before this message has been forwarded. + LF_MUTEX_LOCK(&rti_mutex); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_PORT_ABS, federate_id, &tag); - } + // If the destination federate is no longer connected, issue a warning + // and return. + federate_info_t* fed = GET_FED_INFO(federate_id); + if (fed->enclave.state == NOT_CONNECTED) { + LF_MUTEX_UNLOCK(&rti_mutex); + lf_print_warning("RTI: Destination federate %d is no longer connected. Dropping message.", federate_id); + LF_PRINT_LOG("Fed status: next_event " PRINTF_TAG ", " + "completed " PRINTF_TAG ", " + "last_granted " PRINTF_TAG ", " + "last_provisionally_granted " PRINTF_TAG ".", + fed->enclave.next_event.time - start_time, fed->enclave.next_event.microstep, + fed->enclave.completed.time - start_time, fed->enclave.completed.microstep, + fed->enclave.last_granted.time - start_time, fed->enclave.last_granted.microstep, + fed->enclave.last_provisionally_granted.time - start_time, + fed->enclave.last_provisionally_granted.microstep); + return; + } - // Forward the message. - write_to_socket_fail_on_error(&fed->socket, message_size + 1, buffer, &rti_mutex, - "RTI failed to forward message to federate %d.", federate_id); + LF_PRINT_LOG("RTI forwarding port absent message for port %u to federate %u.", reactor_port_id, federate_id); - LF_MUTEX_UNLOCK(&rti_mutex); -} + // Need to make sure that the destination federate's thread has already + // sent the starting MSG_TYPE_TIMESTAMP message. + while (fed->enclave.state == PENDING) { + // Need to wait here. 
+ lf_cond_wait(&sent_start_time); + } -void handle_timed_message(federate_info_t* sending_federate, unsigned char* buffer) { - size_t header_size = 1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint32_t) + sizeof(int64_t) + sizeof(uint32_t); - // Read the header, minus the first byte which has already been read. - read_from_socket_fail_on_error(&sending_federate->socket, header_size - 1, &(buffer[1]), NULL, - "RTI failed to read the timed message header from remote federate."); - // Extract the header information. of the sender - uint16_t reactor_port_id; - uint16_t federate_id; - size_t length; - tag_t intended_tag; - // Extract information from the header. - extract_timed_header(&(buffer[1]), &reactor_port_id, &federate_id, &length, &intended_tag); - - size_t total_bytes_to_read = length + header_size; - size_t bytes_to_read = length; - - if (FED_COM_BUFFER_SIZE < header_size + 1) { - lf_print_error_and_exit("Buffer size (%d) is not large enough to " - "read the header plus one byte.", - FED_COM_BUFFER_SIZE); - } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_PORT_ABS, federate_id, &tag); + } - // Cut up the payload in chunks. - if (bytes_to_read > FED_COM_BUFFER_SIZE - header_size) { - bytes_to_read = FED_COM_BUFFER_SIZE - header_size; - } + // Forward the message. + write_to_socket_fail_on_error(&fed->socket, message_size + 1, buffer, &rti_mutex, + "RTI failed to forward message to federate %d.", federate_id); - LF_PRINT_LOG("RTI received message from federate %d for federate %u port %u with intended tag " PRINTF_TAG - ". 
Forwarding.", - sending_federate->enclave.id, federate_id, reactor_port_id, intended_tag.time - lf_time_start(), - intended_tag.microstep); + LF_MUTEX_UNLOCK(&rti_mutex); + } - read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, &(buffer[header_size]), NULL, - "RTI failed to read timed message from federate %d.", federate_id); - size_t bytes_read = bytes_to_read + header_size; - // Following only works for string messages. - // LF_PRINT_DEBUG("Message received by RTI: %s.", buffer + header_size); + void handle_timed_message(federate_info_t * sending_federate, unsigned char* buffer) { + size_t header_size = + 1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint32_t) + sizeof(int64_t) + sizeof(uint32_t); + // Read the header, minus the first byte which has already been read. + read_from_socket_fail_on_error(&sending_federate->socket, header_size - 1, &(buffer[1]), NULL, + "RTI failed to read the timed message header from remote federate."); + // Extract the header information. of the sender + uint16_t reactor_port_id; + uint16_t federate_id; + size_t length; + tag_t intended_tag; + // Extract information from the header. + extract_timed_header(&(buffer[1]), &reactor_port_id, &federate_id, &length, &intended_tag); + + size_t total_bytes_to_read = length + header_size; + size_t bytes_to_read = length; + + if (FED_COM_BUFFER_SIZE < header_size + 1) { + lf_print_error_and_exit("Buffer size (%d) is not large enough to " + "read the header plus one byte.", + FED_COM_BUFFER_SIZE); + } - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_TAGGED_MSG, sending_federate->enclave.id, &intended_tag); - } + // Cut up the payload in chunks. 
+ if (bytes_to_read > FED_COM_BUFFER_SIZE - header_size) { + bytes_to_read = FED_COM_BUFFER_SIZE - header_size; + } - // Need to acquire the mutex lock to ensure that the thread handling - // messages coming from the socket connected to the destination does not - // issue a TAG before this message has been forwarded. - LF_MUTEX_LOCK(&rti_mutex); + LF_PRINT_LOG("RTI received message from federate %d for federate %u port %u with intended tag " PRINTF_TAG + ". Forwarding.", + sending_federate->enclave.id, federate_id, reactor_port_id, intended_tag.time - lf_time_start(), + intended_tag.microstep); - // If the destination federate is no longer connected, issue a warning, - // remove the message from the socket and return. - federate_info_t* fed = GET_FED_INFO(federate_id); - if (fed->enclave.state == NOT_CONNECTED) { - lf_print_warning("RTI: Destination federate %d is no longer connected. Dropping message.", federate_id); - LF_PRINT_LOG("Fed status: next_event " PRINTF_TAG ", " - "completed " PRINTF_TAG ", " - "last_granted " PRINTF_TAG ", " - "last_provisionally_granted " PRINTF_TAG ".", - fed->enclave.next_event.time - start_time, fed->enclave.next_event.microstep, - fed->enclave.completed.time - start_time, fed->enclave.completed.microstep, - fed->enclave.last_granted.time - start_time, fed->enclave.last_granted.microstep, - fed->enclave.last_provisionally_granted.time - start_time, - fed->enclave.last_provisionally_granted.microstep); - // If the message was larger than the buffer, we must empty out the remainder also. 
- size_t total_bytes_read = bytes_read; - while (total_bytes_read < total_bytes_to_read) { - bytes_to_read = total_bytes_to_read - total_bytes_read; - if (bytes_to_read > FED_COM_BUFFER_SIZE) { - bytes_to_read = FED_COM_BUFFER_SIZE; - } - read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, - "RTI failed to clear message chunks."); - total_bytes_read += bytes_to_read; - } - LF_MUTEX_UNLOCK(&rti_mutex); - return; - } else { - if (lf_tag_compare(intended_tag, fed->effective_start_tag) < 0) { - // Do not forward the message if the federate is connected, but its - // start_time is not reached yet - lf_mutex_unlock(&rti_mutex); - return; - } - } + read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, &(buffer[header_size]), NULL, + "RTI failed to read timed message from federate %d.", federate_id); + size_t bytes_read = bytes_to_read + header_size; + // Following only works for string messages. + // LF_PRINT_DEBUG("Message received by RTI: %s.", buffer + header_size); - LF_PRINT_DEBUG("RTI forwarding message to port %d of federate %hu of length %zu.", reactor_port_id, federate_id, - length); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_TAGGED_MSG, sending_federate->enclave.id, &intended_tag); + } - // Need to make sure that the destination federate's thread has already - // sent the starting MSG_TYPE_TIMESTAMP message. - while (fed->enclave.state == PENDING) { - // Need to wait here. - lf_cond_wait(&sent_start_time); - } + // Need to acquire the mutex lock to ensure that the thread handling + // messages coming from the socket connected to the destination does not + // issue a TAG before this message has been forwarded. 
+ LF_MUTEX_LOCK(&rti_mutex); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_TAGGED_MSG, federate_id, &intended_tag); - } + // If the destination federate is no longer connected, issue a warning, + // remove the message from the socket and return. + federate_info_t* fed = GET_FED_INFO(federate_id); + if (fed->enclave.state == NOT_CONNECTED) { + lf_print_warning("RTI: Destination federate %d is no longer connected. Dropping message.", federate_id); + LF_PRINT_LOG("Fed status: next_event " PRINTF_TAG ", " + "completed " PRINTF_TAG ", " + "last_granted " PRINTF_TAG ", " + "last_provisionally_granted " PRINTF_TAG ".", + fed->enclave.next_event.time - start_time, fed->enclave.next_event.microstep, + fed->enclave.completed.time - start_time, fed->enclave.completed.microstep, + fed->enclave.last_granted.time - start_time, fed->enclave.last_granted.microstep, + fed->enclave.last_provisionally_granted.time - start_time, + fed->enclave.last_provisionally_granted.microstep); + // If the message was larger than the buffer, we must empty out the remainder also. 
+ size_t total_bytes_read = bytes_read; + while (total_bytes_read < total_bytes_to_read) { + bytes_to_read = total_bytes_to_read - total_bytes_read; + if (bytes_to_read > FED_COM_BUFFER_SIZE) { + bytes_to_read = FED_COM_BUFFER_SIZE; + } + read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, + "RTI failed to clear message chunks."); + total_bytes_read += bytes_to_read; + } + LF_MUTEX_UNLOCK(&rti_mutex); + return; + } else { + if (lf_tag_compare(intended_tag, fed->effective_start_tag) < 0) { + // Do not forward the message if the federate is connected, but its + // start_time is not reached yet + lf_mutex_unlock(&rti_mutex); + return; + } + } - write_to_socket_fail_on_error(&fed->socket, bytes_read, buffer, &rti_mutex, - "RTI failed to forward message to federate %d.", federate_id); - - // The message length may be longer than the buffer, - // in which case we have to handle it in chunks. - size_t total_bytes_read = bytes_read; - while (total_bytes_read < total_bytes_to_read) { - LF_PRINT_DEBUG("Forwarding message in chunks."); - bytes_to_read = total_bytes_to_read - total_bytes_read; - if (bytes_to_read > FED_COM_BUFFER_SIZE) { - bytes_to_read = FED_COM_BUFFER_SIZE; - } - read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, - "RTI failed to read message chunks."); - total_bytes_read += bytes_to_read; - - // FIXME: a mutex needs to be held for this so that other threads - // do not write to destination_socket and cause interleaving. However, - // holding the rti_mutex might be very expensive. Instead, each outgoing - // socket should probably have its own mutex. - write_to_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, &rti_mutex, - "RTI failed to send message chunks."); - } + LF_PRINT_DEBUG("RTI forwarding message to port %d of federate %hu of length %zu.", reactor_port_id, federate_id, + length); - // Record this in-transit message in federate's in-transit message queue. 
- if (lf_tag_compare(fed->enclave.completed, intended_tag) < 0) { - // Add a record of this message to the list of in-transit messages to this federate. - pqueue_tag_insert_if_no_match(fed->in_transit_message_tags, intended_tag); - LF_PRINT_DEBUG("RTI: Adding a message with tag " PRINTF_TAG " to the list of in-transit messages for federate %d.", - intended_tag.time - lf_time_start(), intended_tag.microstep, federate_id); - } else { - lf_print_error("RTI: Federate %d has already completed tag " PRINTF_TAG - ", but there is an in-transit message with tag " PRINTF_TAG " from federate %hu. " - "This is going to cause an STP violation under centralized coordination.", - federate_id, fed->enclave.completed.time - lf_time_start(), fed->enclave.completed.microstep, - intended_tag.time - lf_time_start(), intended_tag.microstep, sending_federate->enclave.id); - // FIXME: Drop the federate? - } + // Need to make sure that the destination federate's thread has already + // sent the starting MSG_TYPE_TIMESTAMP message. + while (fed->enclave.state == PENDING) { + // Need to wait here. + lf_cond_wait(&sent_start_time); + } - // If the message tag is less than the most recently received NET from the federate, - // then update the federate's next event tag to match the message tag. - if (lf_tag_compare(intended_tag, fed->enclave.next_event) < 0) { - update_federate_next_event_tag_locked(federate_id, intended_tag); - } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_TAGGED_MSG, federate_id, &intended_tag); + } - LF_MUTEX_UNLOCK(&rti_mutex); -} + write_to_socket_fail_on_error(&fed->socket, bytes_read, buffer, &rti_mutex, + "RTI failed to forward message to federate %d.", federate_id); + + // The message length may be longer than the buffer, + // in which case we have to handle it in chunks. 
+ size_t total_bytes_read = bytes_read; + while (total_bytes_read < total_bytes_to_read) { + LF_PRINT_DEBUG("Forwarding message in chunks."); + bytes_to_read = total_bytes_to_read - total_bytes_read; + if (bytes_to_read > FED_COM_BUFFER_SIZE) { + bytes_to_read = FED_COM_BUFFER_SIZE; + } + read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, + "RTI failed to read message chunks."); + total_bytes_read += bytes_to_read; + + // FIXME: a mutex needs to be held for this so that other threads + // do not write to destination_socket and cause interleaving. However, + // holding the rti_mutex might be very expensive. Instead, each outgoing + // socket should probably have its own mutex. + write_to_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, &rti_mutex, + "RTI failed to send message chunks."); + } -void handle_latest_tag_confirmed(federate_info_t* fed) { - unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; - read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, - "RTI failed to read the content of the logical tag complete from federate %d.", - fed->enclave.id); - tag_t completed = extract_tag(buffer); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_LTC, fed->enclave.id, &completed); - } - _logical_tag_complete(&(fed->enclave), completed); + // Record this in-transit message in federate's in-transit message queue. + if (lf_tag_compare(fed->enclave.completed, intended_tag) < 0) { + // Add a record of this message to the list of in-transit messages to this federate. 
+ pqueue_tag_insert_if_no_match(fed->in_transit_message_tags, intended_tag); + LF_PRINT_DEBUG("RTI: Adding a message with tag " PRINTF_TAG + " to the list of in-transit messages for federate %d.", + intended_tag.time - lf_time_start(), intended_tag.microstep, federate_id); + } else { + lf_print_error("RTI: Federate %d has already completed tag " PRINTF_TAG + ", but there is an in-transit message with tag " PRINTF_TAG " from federate %hu. " + "This is going to cause an STP violation under centralized coordination.", + federate_id, fed->enclave.completed.time - lf_time_start(), fed->enclave.completed.microstep, + intended_tag.time - lf_time_start(), intended_tag.microstep, sending_federate->enclave.id); + // FIXME: Drop the federate? + } - // FIXME: Should this function be in the enclave version? - LF_MUTEX_LOCK(&rti_mutex); - // See if we can remove any of the recorded in-transit messages for this. - pqueue_tag_remove_up_to(fed->in_transit_message_tags, completed); - LF_MUTEX_UNLOCK(&rti_mutex); -} + // If the message tag is less than the most recently received NET from the federate, + // then update the federate's next event tag to match the message tag. + if (lf_tag_compare(intended_tag, fed->enclave.next_event) < 0) { + update_federate_next_event_tag_locked(federate_id, intended_tag); + } -void handle_next_event_tag(federate_info_t* fed) { - unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; - read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, - "RTI failed to read the content of the next event tag from federate %d.", - fed->enclave.id); + LF_MUTEX_UNLOCK(&rti_mutex); + } - // Acquire a mutex lock to ensure that this state does not change while a - // message is in transport or being used to determine a TAG. - LF_MUTEX_LOCK(&rti_mutex); // FIXME: Instead of using a mutex, it might be more efficient to use a - // select() mechanism to read and process federates' buffers in an orderly fashion. 
+ void handle_latest_tag_confirmed(federate_info_t * fed) { + unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; + read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, + "RTI failed to read the content of the logical tag complete from federate %d.", + fed->enclave.id); + tag_t completed = extract_tag(buffer); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_LTC, fed->enclave.id, &completed); + } + _logical_tag_complete(&(fed->enclave), completed); - tag_t intended_tag = extract_tag(buffer); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_NET, fed->enclave.id, &intended_tag); - } - LF_PRINT_LOG("RTI received from federate %d the Next Event Tag (NET) " PRINTF_TAG, fed->enclave.id, - intended_tag.time - start_time, intended_tag.microstep); - update_federate_next_event_tag_locked(fed->enclave.id, intended_tag); - LF_MUTEX_UNLOCK(&rti_mutex); -} + // FIXME: Should this function be in the enclave version? + LF_MUTEX_LOCK(&rti_mutex); + // See if we can remove any of the recorded in-transit messages for this. + pqueue_tag_remove_up_to(fed->in_transit_message_tags, completed); + LF_MUTEX_UNLOCK(&rti_mutex); + } -/////////////////// STOP functions //////////////////// + void handle_next_event_tag(federate_info_t * fed) { + unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; + read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, + "RTI failed to read the content of the next event tag from federate %d.", + fed->enclave.id); -/** - * Boolean used to prevent the RTI from sending the - * MSG_TYPE_STOP_GRANTED message multiple times. - */ -bool stop_granted_already_sent_to_federates = false; + // Acquire a mutex lock to ensure that this state does not change while a + // message is in transport or being used to determine a TAG. 
+ LF_MUTEX_LOCK(&rti_mutex); // FIXME: Instead of using a mutex, it might be more efficient to use a + // select() mechanism to read and process federates' buffers in an orderly fashion. -/** - * Once the RTI has seen proposed tags from all connected federates, - * it will broadcast a MSG_TYPE_STOP_GRANTED carrying the _RTI.max_stop_tag. - * This function also checks the most recently received NET from - * each federate and resets that be no greater than the _RTI.max_stop_tag. - * - * This function assumes the caller holds the rti_mutex lock. - */ -static void broadcast_stop_time_to_federates_locked() { - if (stop_granted_already_sent_to_federates == true) { - return; - } - stop_granted_already_sent_to_federates = true; + tag_t intended_tag = extract_tag(buffer); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_NET, fed->enclave.id, &intended_tag); + } + LF_PRINT_LOG("RTI received from federate %d the Next Event Tag (NET) " PRINTF_TAG, fed->enclave.id, + intended_tag.time - start_time, intended_tag.microstep); + update_federate_next_event_tag_locked(fed->enclave.id, intended_tag); + LF_MUTEX_UNLOCK(&rti_mutex); + } - // Reply with a stop granted to all federates - unsigned char outgoing_buffer[MSG_TYPE_STOP_GRANTED_LENGTH]; - ENCODE_STOP_GRANTED(outgoing_buffer, rti_remote->base.max_stop_tag.time, rti_remote->base.max_stop_tag.microstep); + /////////////////// STOP functions //////////////////// + + /** + * Boolean used to prevent the RTI from sending the + * MSG_TYPE_STOP_GRANTED message multiple times. + */ + bool stop_granted_already_sent_to_federates = false; + + /** + * Once the RTI has seen proposed tags from all connected federates, + * it will broadcast a MSG_TYPE_STOP_GRANTED carrying the _RTI.max_stop_tag. + * This function also checks the most recently received NET from + * each federate and resets that to be no greater than the _RTI.max_stop_tag. + * + * This function assumes the caller holds the rti_mutex lock.
+ */ + static void broadcast_stop_time_to_federates_locked() { + if (stop_granted_already_sent_to_federates == true) { + return; + } + stop_granted_already_sent_to_federates = true; + + // Reply with a stop granted to all federates + unsigned char outgoing_buffer[MSG_TYPE_STOP_GRANTED_LENGTH]; + ENCODE_STOP_GRANTED(outgoing_buffer, rti_remote->base.max_stop_tag.time, rti_remote->base.max_stop_tag.microstep); + + // Iterate over federates and send each the message. + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->enclave.state == NOT_CONNECTED) { + continue; + } + if (lf_tag_compare(fed->enclave.next_event, rti_remote->base.max_stop_tag) >= 0) { + // Need the next_event to be no greater than the stop tag. + fed->enclave.next_event = rti_remote->base.max_stop_tag; + } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_STOP_GRN, fed->enclave.id, &rti_remote->base.max_stop_tag); + } + write_to_socket_fail_on_error(&fed->socket, MSG_TYPE_STOP_GRANTED_LENGTH, outgoing_buffer, &rti_mutex, + "RTI failed to send MSG_TYPE_STOP_GRANTED message to federate %d.", + fed->enclave.id); + } - // Iterate over federates and send each the message. - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (fed->enclave.state == NOT_CONNECTED) { - continue; + LF_PRINT_LOG("RTI sent to federates MSG_TYPE_STOP_GRANTED with tag " PRINTF_TAG, + rti_remote->base.max_stop_tag.time - start_time, rti_remote->base.max_stop_tag.microstep); } - if (lf_tag_compare(fed->enclave.next_event, rti_remote->base.max_stop_tag) >= 0) { - // Need the next_event to be no greater than the stop tag. - fed->enclave.next_event = rti_remote->base.max_stop_tag; + + /** + * Mark a federate requesting stop. If the number of federates handling stop reaches + * the number of persistent federates, broadcast MSG_TYPE_STOP_GRANTED to every federate. 
+ * This function assumes the _RTI.mutex is already locked. + * @param fed The federate that has requested a stop. + * @return 1 if stop time has been sent to all federates and 0 otherwise. + */ + static int mark_federate_requesting_stop(federate_info_t * fed) { + if (!fed->requested_stop) { + // Increment the number of federates handling stop only if it is persistent + if (!fed->is_transient) + rti_remote->base.num_scheduling_nodes_handling_stop++; + fed->requested_stop = true; + } + if (rti_remote->base.num_scheduling_nodes_handling_stop == + (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { + // We now have information about the stop time of all + // federates. + broadcast_stop_time_to_federates_locked(); + return 1; + } + return 0; } - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_STOP_GRN, fed->enclave.id, &rti_remote->base.max_stop_tag); + + /** + * Thread to time out if federates do not reply to stop request. + */ + static void* wait_for_stop_request_reply(void* args) { + initialize_lf_thread_id(); + // Divide the time into small chunks and check periodically. + interval_t chunk = MAX_TIME_FOR_REPLY_TO_STOP_REQUEST / 30; + int count = 0; + while (count++ < 30) { + if (stop_granted_already_sent_to_federates) + return NULL; + lf_sleep(chunk); + } + // If we reach here, then error out. + lf_print_error_and_exit("Received only %d stop request replies within timeout " PRINTF_TIME "ns. 
RTI is exiting.", + rti_remote->base.num_scheduling_nodes_handling_stop, MAX_TIME_FOR_REPLY_TO_STOP_REQUEST); + return NULL; } - write_to_socket_fail_on_error(&fed->socket, MSG_TYPE_STOP_GRANTED_LENGTH, outgoing_buffer, &rti_mutex, - "RTI failed to send MSG_TYPE_STOP_GRANTED message to federate %d.", fed->enclave.id); - } - LF_PRINT_LOG("RTI sent to federates MSG_TYPE_STOP_GRANTED with tag " PRINTF_TAG, - rti_remote->base.max_stop_tag.time - start_time, rti_remote->base.max_stop_tag.microstep); -} + void handle_stop_request_message(federate_info_t * fed) { + LF_PRINT_DEBUG("RTI handling stop_request from federate %d.", fed->enclave.id); -/** - * Mark a federate requesting stop. If the number of federates handling stop reaches - * the number of persistent federates, broadcast MSG_TYPE_STOP_GRANTED to every federate. - * This function assumes the _RTI.mutex is already locked. - * @param fed The federate that has requested a stop. - * @return 1 if stop time has been sent to all federates and 0 otherwise. - */ -static int mark_federate_requesting_stop(federate_info_t* fed) { - if (!fed->requested_stop) { - // Increment the number of federates handling stop only if it is persistent - if (!fed->is_transient) - rti_remote->base.num_scheduling_nodes_handling_stop++; - fed->requested_stop = true; - } - if (rti_remote->base.num_scheduling_nodes_handling_stop == - (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { - // We now have information about the stop time of all - // federates. - broadcast_stop_time_to_federates_locked(); - return 1; - } - return 0; -} + size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_LENGTH - 1; + unsigned char buffer[bytes_to_read]; + read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, NULL, + "RTI failed to read the MSG_TYPE_STOP_REQUEST payload from federate %d.", + fed->enclave.id); -/** - * Thread to time out if federates do not reply to stop request. 
- */ -static void* wait_for_stop_request_reply(void* args) { - initialize_lf_thread_id(); - // Divide the time into small chunks and check periodically. - interval_t chunk = MAX_TIME_FOR_REPLY_TO_STOP_REQUEST / 30; - int count = 0; - while (count++ < 30) { - if (stop_granted_already_sent_to_federates) - return NULL; - lf_sleep(chunk); - } - // If we reach here, then error out. - lf_print_error_and_exit("Received only %d stop request replies within timeout " PRINTF_TIME "ns. RTI is exiting.", - rti_remote->base.num_scheduling_nodes_handling_stop, MAX_TIME_FOR_REPLY_TO_STOP_REQUEST); - return NULL; -} + // Extract the proposed stop tag for the federate + tag_t proposed_stop_tag = extract_tag(buffer); -void handle_stop_request_message(federate_info_t* fed) { - LF_PRINT_DEBUG("RTI handling stop_request from federate %d.", fed->enclave.id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_STOP_REQ, fed->enclave.id, &proposed_stop_tag); + } - size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_LENGTH - 1; - unsigned char buffer[bytes_to_read]; - read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, NULL, - "RTI failed to read the MSG_TYPE_STOP_REQUEST payload from federate %d.", - fed->enclave.id); + LF_PRINT_LOG("RTI received from federate %d a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", + fed->enclave.id, proposed_stop_tag.time - start_time, proposed_stop_tag.microstep); - // Extract the proposed stop tag for the federate - tag_t proposed_stop_tag = extract_tag(buffer); + // Acquire a mutex lock to ensure that this state does not change while a + // message is in transport or being used to determine a TAG.
+ LF_MUTEX_LOCK(&rti_mutex); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_STOP_REQ, fed->enclave.id, &proposed_stop_tag); - } + // Check whether we have already received a stop_tag + // from this federate + if (fed->requested_stop) { + // If stop request messages have already been broadcast, treat this as if it were a reply. + if (rti_remote->stop_in_progress) { + mark_federate_requesting_stop(fed); + } + LF_MUTEX_UNLOCK(&rti_mutex); + return; + } - LF_PRINT_LOG("RTI received from federate %d a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", - fed->enclave.id, proposed_stop_tag.time - start_time, proposed_stop_tag.microstep); + // Update the maximum stop tag received from federates + if (lf_tag_compare(proposed_stop_tag, rti_remote->base.max_stop_tag) > 0) { + rti_remote->base.max_stop_tag = proposed_stop_tag; + } - // Acquire a mutex lock to ensure that this state does change while a - // message is in transport or being used to determine a TAG. - LF_MUTEX_LOCK(&rti_mutex); + // If all federates have replied, send stop request granted. + if (mark_federate_requesting_stop(fed)) { + // Have sent stop request granted to all federates. Nothing more to do. + LF_MUTEX_UNLOCK(&rti_mutex); + return; + } - // Check whether we have already received a stop_tag - // from this federate - if (fed->requested_stop) { - // If stop request messages have already been broadcast, treat this as if it were a reply. - if (rti_remote->stop_in_progress) { - mark_federate_requesting_stop(fed); - } - LF_MUTEX_UNLOCK(&rti_mutex); - return; - } + // Forward the stop request to all other federates that have not + // also issued a stop request.
+ unsigned char stop_request_buffer[MSG_TYPE_STOP_REQUEST_LENGTH]; + ENCODE_STOP_REQUEST(stop_request_buffer, rti_remote->base.max_stop_tag.time, + rti_remote->base.max_stop_tag.microstep); - // Update the maximum stop tag received from federates - if (lf_tag_compare(proposed_stop_tag, rti_remote->base.max_stop_tag) > 0) { - rti_remote->base.max_stop_tag = proposed_stop_tag; - } + // Iterate over federates and send each the MSG_TYPE_STOP_REQUEST message + // if we do not have a stop_time already for them. Do not do this more than once. + if (rti_remote->stop_in_progress) { + LF_MUTEX_UNLOCK(&rti_mutex); + return; + } + rti_remote->stop_in_progress = true; + // Need a timeout here in case a federate never replies. + lf_thread_t timeout_thread; + lf_thread_create(&timeout_thread, wait_for_stop_request_reply, NULL); + + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* f = GET_FED_INFO(i); + if (f->enclave.id != fed->enclave.id && f->requested_stop == false) { + if (f->enclave.state == NOT_CONNECTED) { + mark_federate_requesting_stop(f); + continue; + } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_STOP_REQ, f->enclave.id, &rti_remote->base.max_stop_tag); + } + write_to_socket_fail_on_error(&f->socket, MSG_TYPE_STOP_REQUEST_LENGTH, stop_request_buffer, &rti_mutex, + "RTI failed to forward MSG_TYPE_STOP_REQUEST message to federate %d.", + f->enclave.id); + } + } + LF_PRINT_LOG("RTI forwarded to federates MSG_TYPE_STOP_REQUEST with tag (" PRINTF_TIME ", %u).", + rti_remote->base.max_stop_tag.time - start_time, rti_remote->base.max_stop_tag.microstep); + LF_MUTEX_UNLOCK(&rti_mutex); + } - // If all federates have replied, send stop request granted. - if (mark_federate_requesting_stop(fed)) { - // Have send stop request granted to all federates. Nothing more to do. 
- LF_MUTEX_UNLOCK(&rti_mutex); - return; - } + void handle_stop_request_reply(federate_info_t * fed) { + size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_REPLY_LENGTH - 1; + unsigned char buffer_stop_time[bytes_to_read]; + read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer_stop_time, NULL, + "RTI failed to read the reply to MSG_TYPE_STOP_REQUEST message from federate %d.", + fed->enclave.id); - // Forward the stop request to all other federates that have not - // also issued a stop request. - unsigned char stop_request_buffer[MSG_TYPE_STOP_REQUEST_LENGTH]; - ENCODE_STOP_REQUEST(stop_request_buffer, rti_remote->base.max_stop_tag.time, rti_remote->base.max_stop_tag.microstep); + tag_t federate_stop_tag = extract_tag(buffer_stop_time); - // Iterate over federates and send each the MSG_TYPE_STOP_REQUEST message - // if we do not have a stop_time already for them. Do not do this more than once. - if (rti_remote->stop_in_progress) { - LF_MUTEX_UNLOCK(&rti_mutex); - return; - } - rti_remote->stop_in_progress = true; - // Need a timeout here in case a federate never replies. 
- lf_thread_t timeout_thread; - lf_thread_create(&timeout_thread, wait_for_stop_request_reply, NULL); - - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* f = GET_FED_INFO(i); - if (f->enclave.id != fed->enclave.id && f->requested_stop == false) { - if (f->enclave.state == NOT_CONNECTED) { - mark_federate_requesting_stop(f); - continue; - } if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_STOP_REQ, f->enclave.id, &rti_remote->base.max_stop_tag); + tracepoint_rti_from_federate(receive_STOP_REQ_REP, fed->enclave.id, &federate_stop_tag); + } + + LF_PRINT_LOG("RTI received from federate %d STOP reply tag " PRINTF_TAG ".", fed->enclave.id, + federate_stop_tag.time - start_time, federate_stop_tag.microstep); + + // Acquire the mutex lock so that we can change the state of the RTI + LF_MUTEX_LOCK(&rti_mutex); + // If the federate has not requested stop before, count the reply + if (lf_tag_compare(federate_stop_tag, rti_remote->base.max_stop_tag) > 0) { + rti_remote->base.max_stop_tag = federate_stop_tag; } - write_to_socket_fail_on_error(&f->socket, MSG_TYPE_STOP_REQUEST_LENGTH, stop_request_buffer, &rti_mutex, - "RTI failed to forward MSG_TYPE_STOP_REQUEST message to federate %d.", - f->enclave.id); + mark_federate_requesting_stop(fed); + LF_MUTEX_UNLOCK(&rti_mutex); } - } - LF_PRINT_LOG("RTI forwarded to federates MSG_TYPE_STOP_REQUEST with tag (" PRINTF_TIME ", %u).", - rti_remote->base.max_stop_tag.time - start_time, rti_remote->base.max_stop_tag.microstep); - LF_MUTEX_UNLOCK(&rti_mutex); -} -void handle_stop_request_reply(federate_info_t* fed) { - size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_REPLY_LENGTH - 1; - unsigned char buffer_stop_time[bytes_to_read]; - read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer_stop_time, NULL, - "RTI failed to read the reply to MSG_TYPE_STOP_REQUEST message from federate %d.", - fed->enclave.id); + ////////////////////////////////////////////////// - 
tag_t federate_stop_tag = extract_tag(buffer_stop_time); + void handle_address_query(uint16_t fed_id) { + federate_info_t* fed = GET_FED_INFO(fed_id); + // Use buffer both for reading and constructing the reply. + // The length is what is needed for the reply. + unsigned char buffer[1 + sizeof(int32_t)]; + read_from_socket_fail_on_error(&fed->socket, sizeof(uint16_t), (unsigned char*)buffer, NULL, + "Failed to read address query."); + uint16_t remote_fed_id = extract_uint16(buffer); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_STOP_REQ_REP, fed->enclave.id, &federate_stop_tag); - } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_ADR_QR, fed_id, NULL); + } - LF_PRINT_LOG("RTI received from federate %d STOP reply tag " PRINTF_TAG ".", fed->enclave.id, - federate_stop_tag.time - start_time, federate_stop_tag.microstep); + LF_PRINT_DEBUG("RTI received address query from %d for %d.", fed_id, remote_fed_id); - // Acquire the mutex lock so that we can change the state of the RTI - LF_MUTEX_LOCK(&rti_mutex); - // If the federate has not requested stop before, count the reply - if (lf_tag_compare(federate_stop_tag, rti_remote->base.max_stop_tag) > 0) { - rti_remote->base.max_stop_tag = federate_stop_tag; - } - mark_federate_requesting_stop(fed); - LF_MUTEX_UNLOCK(&rti_mutex); -} + // NOTE: server_port initializes to -1, which means the RTI does not know + // the port number because it has not yet received an MSG_TYPE_ADDRESS_ADVERTISEMENT message + // from this federate. In that case, it will respond by sending -1. -////////////////////////////////////////////////// + // Response message is MSG_TYPE_ADDRESS_QUERY_REPLY. + buffer[0] = MSG_TYPE_ADDRESS_QUERY_REPLY; -void handle_address_query(uint16_t fed_id) { - federate_info_t* fed = GET_FED_INFO(fed_id); - // Use buffer both for reading and constructing the reply. - // The length is what is needed for the reply. 
- unsigned char buffer[1 + sizeof(int32_t)]; - read_from_socket_fail_on_error(&fed->socket, sizeof(uint16_t), (unsigned char*)buffer, NULL, - "Failed to read address query."); - uint16_t remote_fed_id = extract_uint16(buffer); + // Encode the port number. + federate_info_t* remote_fed = GET_FED_INFO(remote_fed_id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_ADR_QR, fed_id, NULL); - } + // Send the port number (which could be -1). + LF_MUTEX_LOCK(&rti_mutex); + encode_int32(remote_fed->server_port, (unsigned char*)&buffer[1]); + write_to_socket_fail_on_error(&fed->socket, sizeof(int32_t) + 1, (unsigned char*)buffer, &rti_mutex, + "Failed to write port number to socket of federate %d.", fed_id); + + // Send the server IP address to federate. + write_to_socket_fail_on_error(&fed->socket, sizeof(remote_fed->server_ip_addr), + (unsigned char*)&remote_fed->server_ip_addr, &rti_mutex, + "Failed to write ip address to socket of federate %d.", fed_id); + LF_MUTEX_UNLOCK(&rti_mutex); + + LF_PRINT_DEBUG("Replied to address query from federate %d with address %s:%d.", fed_id, + remote_fed->server_hostname, remote_fed->server_port); + } - LF_PRINT_DEBUG("RTI received address query from %d for %d.", fed_id, remote_fed_id); + void handle_address_ad(uint16_t federate_id) { + federate_info_t* fed = GET_FED_INFO(federate_id); + // Read the port number of the federate that can be used for physical + // connections to other federates + int32_t server_port = -1; + unsigned char buffer[sizeof(int32_t)]; + read_from_socket_fail_on_error(&fed->socket, sizeof(int32_t), (unsigned char*)buffer, NULL, + "Error reading port data from federate %d.", federate_id); - // NOTE: server_port initializes to -1, which means the RTI does not know - // the port number because it has not yet received an MSG_TYPE_ADDRESS_ADVERTISEMENT message - // from this federate. In that case, it will respond by sending -1. 
+ server_port = extract_int32(buffer); - // Response message is MSG_TYPE_ADDRESS_QUERY_REPLY. - buffer[0] = MSG_TYPE_ADDRESS_QUERY_REPLY; + assert(server_port < 65536); - // Encode the port number. - federate_info_t* remote_fed = GET_FED_INFO(remote_fed_id); + LF_MUTEX_LOCK(&rti_mutex); + fed->server_port = server_port; + LF_MUTEX_UNLOCK(&rti_mutex); - // Send the port number (which could be -1). - LF_MUTEX_LOCK(&rti_mutex); - encode_int32(remote_fed->server_port, (unsigned char*)&buffer[1]); - write_to_socket_fail_on_error(&fed->socket, sizeof(int32_t) + 1, (unsigned char*)buffer, &rti_mutex, - "Failed to write port number to socket of federate %d.", fed_id); - - // Send the server IP address to federate. - write_to_socket_fail_on_error(&fed->socket, sizeof(remote_fed->server_ip_addr), - (unsigned char*)&remote_fed->server_ip_addr, &rti_mutex, - "Failed to write ip address to socket of federate %d.", fed_id); - LF_MUTEX_UNLOCK(&rti_mutex); + LF_PRINT_LOG("Received address advertisement with port %d from federate %d.", server_port, federate_id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_ADR_AD, federate_id, NULL); + } + } - LF_PRINT_DEBUG("Replied to address query from federate %d with address %s:%d.", fed_id, remote_fed->server_hostname, - remote_fed->server_port); -} + /** + * Send the start time to the federate my_fed. + * This function assumes the caller does not hold the mutex. + * + * If it is the startup phase, the start_time will be the maximum received timestamps + * plus an offset. The federate will then receive identical federation_start_time + * and federate_start_tag.time (the federate_start_tag.microstep will be 0). + * If, however, the startup phase is passed, the federate will receive different + * values than stated above. + * + * @param my_fed the federate to send the start time to.
+ * @param federation_start_time the federation start_time + * @param federate_start_tag the federate effective start tag + */ + void send_start_tag(federate_info_t * my_fed, instant_t federation_start_time, tag_t federate_start_tag) { + // Send back to the federate the maximum time plus an offset on a TIMESTAMP_START + // message. + // In the startup phase, federates will receive identical start_time and + // effective_start_tag + unsigned char start_time_buffer[MSG_TYPE_TIMESTAMP_START_LENGTH]; + start_time_buffer[0] = MSG_TYPE_TIMESTAMP_START; + encode_int64(swap_bytes_if_big_endian_int64(start_time), &start_time_buffer[1]); + encode_tag(&(start_time_buffer[1 + sizeof(instant_t)]), federate_start_tag); -void handle_address_ad(uint16_t federate_id) { - federate_info_t* fed = GET_FED_INFO(federate_id); - // Read the port number of the federate that can be used for physical - // connections to other federates - int32_t server_port = -1; - unsigned char buffer[sizeof(int32_t)]; - read_from_socket_fail_on_error(&fed->socket, sizeof(int32_t), (unsigned char*)buffer, NULL, - "Error reading port data from federate %d.", federate_id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_TIMESTAMP, my_fed->enclave.id, &federate_start_tag); + } + if (write_to_socket(my_fed->socket, MSG_TYPE_TIMESTAMP_START_LENGTH, start_time_buffer)) { + lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); + } - server_port = extract_int32(buffer); + LF_MUTEX_LOCK(&rti_mutex); + // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP + // message has been sent. That MSG_TYPE_TIMESTAMP message grants time advance to + // the federate to the start time. 
+ my_fed->enclave.state = GRANTED; + lf_cond_broadcast(&sent_start_time); + LF_PRINT_LOG("RTI sent start time " PRINTF_TIME " to federate %d.", start_time, my_fed->enclave.id); + LF_MUTEX_UNLOCK(&rti_mutex); + } - assert(server_port < 65536); + void handle_timestamp(federate_info_t * my_fed) { + unsigned char buffer[sizeof(int64_t)]; + // Read bytes from the socket. We need 8 bytes. + read_from_socket_fail_on_error(&my_fed->socket, sizeof(int64_t), (unsigned char*)&buffer, NULL, + "ERROR reading timestamp from federate %d.\n", my_fed->enclave.id); - LF_MUTEX_LOCK(&rti_mutex); - fed->server_port = server_port; - LF_MUTEX_UNLOCK(&rti_mutex); + int64_t timestamp = swap_bytes_if_big_endian_int64(*((int64_t*)(&buffer))); + if (rti_remote->base.tracing_enabled) { + tag_t tag = {.time = timestamp, .microstep = 0}; + tracepoint_rti_from_federate(receive_TIMESTAMP, my_fed->enclave.id, &tag); + } + LF_PRINT_DEBUG("RTI received timestamp message with time: " PRINTF_TIME ".", timestamp); - LF_PRINT_LOG("Received address advertisement with port %d from federate %d.", server_port, federate_id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_ADR_AD, federate_id, NULL); - } -} + LF_MUTEX_LOCK(&rti_mutex); -/** - * Send to the start time to the federate my_fed. - * This function assumes the caller does not hold the mutex. - * - * If it is the startup phase, the start_time will be the maximum received timestamps - * plus an offset. The federate will then receive identical federation_start_time - * and federate_start_tag.time (the federate_start_tag.microstep will be 0). - * If, however, the startup phase is passed, the federate will receive different - * values than sateted above. - * - * @param my_fed the federate to send the start time to. 
- * @param federation_start_time the federation start_time - * @param federate_start_tag the federate effective start tag - */ -void send_start_tag(federate_info_t* my_fed, instant_t federation_start_time, tag_t federate_start_tag) { - // Send back to the federate the maximum time plus an offset on a TIMESTAMP_START - // message. - // In the startup phase, federates will receive identical start_time and - // effective_start_tag - unsigned char start_time_buffer[MSG_TYPE_TIMESTAMP_START_LENGTH]; - start_time_buffer[0] = MSG_TYPE_TIMESTAMP_START; - encode_int64(swap_bytes_if_big_endian_int64(start_time), &start_time_buffer[1]); - encode_tag(&(start_time_buffer[1 + sizeof(instant_t)]), federate_start_tag); + // Processing the TIMESTAMP depends on whether it is the startup phase (all + // persistent federates joined) or not. + if (rti_remote->phase == + startup_phase) { // This is equivalent to: rti_remote->num_feds_proposed_start < + // (rti_remote->number_of_enclaves - rti_remote->number_of_transient_federates) + if (timestamp > rti_remote->max_start_time) { + rti_remote->max_start_time = timestamp; + } + // Check that persistent federates did propose a start_time + if (!my_fed->is_transient) { + rti_remote->num_feds_proposed_start++; + } + if (rti_remote->num_feds_proposed_start == + (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { + // All federates have proposed a start time. + lf_cond_broadcast(&received_start_times); + rti_remote->phase = execution_phase; + } else { + // Some federates have not yet proposed a start time. + // wait for a notification. + while (rti_remote->num_feds_proposed_start < + (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { + // FIXME: Should have a timeout here? 
+ lf_cond_wait(&received_start_times); + } + } - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_TIMESTAMP, my_fed->enclave.id, &federate_start_tag); - } - if (write_to_socket(my_fed->socket, MSG_TYPE_TIMESTAMP_START_LENGTH, start_time_buffer)) { - lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); - } + LF_MUTEX_UNLOCK(&rti_mutex); - LF_MUTEX_LOCK(&rti_mutex); - // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP - // message has been sent. That MSG_TYPE_TIMESTAMP message grants time advance to - // the federate to the start time. - my_fed->enclave.state = GRANTED; - lf_cond_broadcast(&sent_start_time); - LF_PRINT_LOG("RTI sent start time " PRINTF_TIME " to federate %d.", start_time, my_fed->enclave.id); - LF_MUTEX_UNLOCK(&rti_mutex); -} + // Send back to the federate the maximum time plus an offset on a TIMESTAMP + // message. + // Add an offset to this start time to get everyone starting together. + start_time = rti_remote->max_start_time + DELAY_START; + my_fed->effective_start_tag = (tag_t){.time = start_time, .microstep = 0u}; + send_start_tag(my_fed, start_time, my_fed->effective_start_tag); + } else if (rti_remote->phase == shutdown_phase) { + // Do not answer the federate if the federation is in hsutdown phase + // Or maybe send and error message? + LF_MUTEX_UNLOCK(&rti_mutex); + return; + } else { // The federation is the execution phase + // A transient has joined after the startup phase + // At this point, we already hold the mutex + + // This is rather a possible extreme corner case, where a transient sends its timestamp, and only + // enters the if section after all persistents have joined. + if (timestamp < start_time) { + timestamp = start_time; + } -void handle_timestamp(federate_info_t* my_fed) { - unsigned char buffer[sizeof(int64_t)]; - // Read bytes from the socket. We need 8 bytes. 
- read_from_socket_fail_on_error(&my_fed->socket, sizeof(int64_t), (unsigned char*)&buffer, NULL, - "ERROR reading timestamp from federate %d.\n", my_fed->enclave.id); + //// Algorithm for computing the effective_start_time of a joining transient + // The effective_start_time will be the max among all the following tags: + // 1. At tag: (joining time, 0 microstep) + // 2. The latest completed logical tag + 1 microstep + // 3. The latest granted (P)TAG + 1 microstep, of every downstream federate + // 4. The maximun tag of messages from the upstream federates + 1 microstep - int64_t timestamp = swap_bytes_if_big_endian_int64(*((int64_t*)(&buffer))); - if (rti_remote->base.tracing_enabled) { - tag_t tag = {.time = timestamp, .microstep = 0}; - tracepoint_rti_from_federate(receive_TIMESTAMP, my_fed->enclave.id, &tag); - } - LF_PRINT_DEBUG("RTI received timestamp message with time: " PRINTF_TIME ".", timestamp); + // Condition 1. + my_fed->effective_start_tag = (tag_t){.time = timestamp, .microstep = 0u}; - LF_MUTEX_LOCK(&rti_mutex); + // Condition 2. + // FIXME: Not sure if this corner case can happen, but better to be on the safe side. + if (lf_tag_compare(my_fed->enclave.completed, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = my_fed->enclave.completed; + my_fed->effective_start_tag.microstep++; + } - // Processing the TIMESTAMP depends on whether it is the startup phase (all - // persistent federates joined) or not. 
- if (rti_remote->phase == - startup_phase) { // This is equivalent to: rti_remote->num_feds_proposed_start < (rti_remote->number_of_enclaves - - // rti_remote->number_of_transient_federates) - if (timestamp > rti_remote->max_start_time) { - rti_remote->max_start_time = timestamp; - } - // Check that persistent federates did propose a start_time - if (!my_fed->is_transient) { - rti_remote->num_feds_proposed_start++; - } - if (rti_remote->num_feds_proposed_start == - (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { - // All federates have proposed a start time. - lf_cond_broadcast(&received_start_times); - rti_remote->phase = execution_phase; - } else { - // Some federates have not yet proposed a start time. - // wait for a notification. - while (rti_remote->num_feds_proposed_start < - (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { - // FIXME: Should have a timeout here? - lf_cond_wait(&received_start_times); - } - } + // Condition 3. Iterate over the downstream federates + for (int j = 0; j < my_fed->enclave.num_downstream; j++) { + federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); - LF_MUTEX_UNLOCK(&rti_mutex); - - // Send back to the federate the maximum time plus an offset on a TIMESTAMP - // message. - // Add an offset to this start time to get everyone starting together. - start_time = rti_remote->max_start_time + DELAY_START; - my_fed->effective_start_tag = (tag_t){.time = start_time, .microstep = 0u}; - send_start_tag(my_fed, start_time, my_fed->effective_start_tag); - } else if (rti_remote->phase == shutdown_phase) { - // Do not answer the federate if the federation is in hsutdown phase - // Or maybe send and error message? 
- LF_MUTEX_UNLOCK(&rti_mutex); - return; - } else { // The federation is the execution phase - // A transient has joined after the startup phase - // At this point, we already hold the mutex - - // This is rather a possible extreme corner case, where a transient sends its timestamp, and only - // enters the if section after all persistents have joined. - if (timestamp < start_time) { - timestamp = start_time; - } + // Get the max over the TAG of the downstreams + if (lf_tag_compare(downstream->enclave.last_granted, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = downstream->enclave.last_granted; + my_fed->effective_start_tag.microstep++; + } - //// Algorithm for computing the effective_start_time of a joining transient - // The effective_start_time will be the max among all the following tags: - // 1. At tag: (joining time, 0 microstep) - // 2. The latest completed logical tag + 1 microstep - // 3. The latest granted (P)TAG + 1 microstep, of every downstream federate - // 4. The maximun tag of messages from the upstream federates + 1 microstep - - // Condition 1. - my_fed->effective_start_tag = (tag_t){.time = timestamp, .microstep = 0u}; - - // Condition 2. - // FIXME: Not sure if this corner case can happen, but better to be on the safe side. - if (lf_tag_compare(my_fed->enclave.completed, my_fed->effective_start_tag) >= 0) { - my_fed->effective_start_tag = my_fed->enclave.completed; - my_fed->effective_start_tag.microstep++; - } + // Get the max over the PTAG of the downstreams + if (lf_tag_compare(downstream->enclave.last_provisionally_granted, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = downstream->enclave.last_provisionally_granted; + my_fed->effective_start_tag.microstep++; + } + } - // Condition 3. Iterate over the downstream federates - for (int j = 0; j < my_fed->enclave.num_downstream; j++) { - federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); + // Condition 4. 
Iterate over the messages from the upstream federates + for (int j = 0; j < my_fed->enclave.num_upstream; j++) { + federate_info_t* upstream = GET_FED_INFO(my_fed->enclave.upstream[j]); - // Get the max over the TAG of the downstreams - if (lf_tag_compare(downstream->enclave.last_granted, my_fed->effective_start_tag) >= 0) { - my_fed->effective_start_tag = downstream->enclave.last_granted; - my_fed->effective_start_tag.microstep++; - } + // Get the max over the TAG of the upstreams + size_t queue_size = pqueue_tag_size(upstream->in_transit_message_tags); + if (queue_size != 0) { + pqueue_t* pq = (pqueue_t*)(upstream->in_transit_message_tags); + pqueue_tag_element_t* message_with_max_tag = (pqueue_tag_element_t*)(pq->d[queue_size]); + tag_t max_tag = message_with_max_tag->tag; - // Get the max over the PTAG of the downstreams - if (lf_tag_compare(downstream->enclave.last_provisionally_granted, my_fed->effective_start_tag) >= 0) { - my_fed->effective_start_tag = downstream->enclave.last_provisionally_granted; - my_fed->effective_start_tag.microstep++; - } - } + if (lf_tag_compare(max_tag, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = max_tag; + my_fed->effective_start_tag.microstep++; + } + } + } - // Condition 4. 
Iterate over the messages from the upstream federates - for (int j = 0; j < my_fed->enclave.num_upstream; j++) { - federate_info_t* upstream = GET_FED_INFO(my_fed->enclave.upstream[j]); + // For every downstream that has a pending grant that is higher then the + // effective_start_time of the federate, cancel it + for (int j = 0; j < my_fed->enclave.num_downstream; j++) { + federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); - // Get the max over the TAG of the upstreams - size_t queue_size = pqueue_tag_size(upstream->in_transit_message_tags); - if (queue_size != 0) { - pqueue_t* pq = (pqueue_t*)(upstream->in_transit_message_tags); - pqueue_tag_element_t* message_with_max_tag = (pqueue_tag_element_t*)(pq->d[queue_size]); - tag_t max_tag = message_with_max_tag->tag; + // Ignore this federate if it has resigned. + if (downstream->enclave.state == NOT_CONNECTED) { + continue; + } - if (lf_tag_compare(max_tag, my_fed->effective_start_tag) >= 0) { - my_fed->effective_start_tag = max_tag; - my_fed->effective_start_tag.microstep++; + // Check the pending grants, if any, and keep it only if it is + // sonner than the effective start tag + pqueue_delayed_grant_element_t* dge = + pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, downstream->enclave.id); + if (dge != NULL && lf_tag_compare(dge->base.tag, my_fed->effective_start_tag) > 0) { + pqueue_delayed_grants_remove(rti_remote->delayed_grants, dge); + } } - } - } - // For every downstream that has a pending grant that is higher then the - // effective_start_time of the federate, cancel it - for (int j = 0; j < my_fed->enclave.num_downstream; j++) { - federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); + LF_MUTEX_UNLOCK(&rti_mutex); - // Ignore this federate if it has resigned. 
- if (downstream->enclave.state == NOT_CONNECTED) { - continue; - } + // Once the effective start time set, sent it to the joining transient, + // together with the start time of the federation. - // Check the pending grants, if any, and keep it only if it is - // sonner than the effective start tag - pqueue_delayed_grant_element_t* dge = - pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, downstream->enclave.id); - if (dge != NULL && lf_tag_compare(dge->base.tag, my_fed->effective_start_tag) > 0) { - pqueue_delayed_grants_remove(rti_remote->delayed_grants, dge); + // Send the start time + send_start_tag(my_fed, start_time, my_fed->effective_start_tag); } } - LF_MUTEX_UNLOCK(&rti_mutex); - - // Once the effective start time set, sent it to the joining transient, - // together with the start time of the federation. - - // Send the start time - send_start_tag(my_fed, start_time, my_fed->effective_start_tag); - } -} + void send_physical_clock(unsigned char message_type, federate_info_t* fed, socket_type_t socket_type) { + if (fed->enclave.state == NOT_CONNECTED) { + lf_print_warning("Clock sync: RTI failed to send physical time to federate %d. Socket not connected.\n", + fed->enclave.id); + return; + } + unsigned char buffer[sizeof(int64_t) + 1]; + buffer[0] = message_type; + int64_t current_physical_time = lf_time_physical(); + encode_int64(current_physical_time, &(buffer[1])); + + // Send the message + if (socket_type == UDP) { + // FIXME: UDP_addr is never initialized. 
+ LF_PRINT_DEBUG("Clock sync: RTI sending UDP message type %u.", buffer[0]); + ssize_t bytes_written = sendto(rti_remote->socket_descriptor_UDP, buffer, 1 + sizeof(int64_t), 0, + (struct sockaddr*)&fed->UDP_addr, sizeof(fed->UDP_addr)); + if (bytes_written < (ssize_t)sizeof(int64_t) + 1) { + lf_print_warning("Clock sync: RTI failed to send physical time to federate %d: %s\n", fed->enclave.id, + strerror(errno)); + return; + } + } else if (socket_type == TCP) { + LF_PRINT_DEBUG("Clock sync: RTI sending TCP message type %u.", buffer[0]); + LF_MUTEX_LOCK(&rti_mutex); + write_to_socket_fail_on_error(&fed->socket, 1 + sizeof(int64_t), buffer, &rti_mutex, + "Clock sync: RTI failed to send physical time to federate %d.", fed->enclave.id); + LF_MUTEX_UNLOCK(&rti_mutex); + } + LF_PRINT_DEBUG("Clock sync: RTI sent PHYSICAL_TIME_SYNC_MESSAGE with timestamp " PRINTF_TIME " to federate %d.", + current_physical_time, fed->enclave.id); + } -void send_physical_clock(unsigned char message_type, federate_info_t* fed, socket_type_t socket_type) { - if (fed->enclave.state == NOT_CONNECTED) { - lf_print_warning("Clock sync: RTI failed to send physical time to federate %d. Socket not connected.\n", - fed->enclave.id); - return; - } - unsigned char buffer[sizeof(int64_t) + 1]; - buffer[0] = message_type; - int64_t current_physical_time = lf_time_physical(); - encode_int64(current_physical_time, &(buffer[1])); - - // Send the message - if (socket_type == UDP) { - // FIXME: UDP_addr is never initialized. 
- LF_PRINT_DEBUG("Clock sync: RTI sending UDP message type %u.", buffer[0]); - ssize_t bytes_written = sendto(rti_remote->socket_descriptor_UDP, buffer, 1 + sizeof(int64_t), 0, - (struct sockaddr*)&fed->UDP_addr, sizeof(fed->UDP_addr)); - if (bytes_written < (ssize_t)sizeof(int64_t) + 1) { - lf_print_warning("Clock sync: RTI failed to send physical time to federate %d: %s\n", fed->enclave.id, - strerror(errno)); - return; + void handle_physical_clock_sync_message(federate_info_t * my_fed, socket_type_t socket_type) { + // Lock the mutex to prevent interference between sending the two + // coded probe messages. + LF_MUTEX_LOCK(&rti_mutex); + // Reply with a T4 type message + send_physical_clock(MSG_TYPE_CLOCK_SYNC_T4, my_fed, socket_type); + // Send the corresponding coded probe immediately after, + // but only if this is a UDP channel. + if (socket_type == UDP) { + send_physical_clock(MSG_TYPE_CLOCK_SYNC_CODED_PROBE, my_fed, socket_type); + } + LF_MUTEX_UNLOCK(&rti_mutex); } - } else if (socket_type == TCP) { - LF_PRINT_DEBUG("Clock sync: RTI sending TCP message type %u.", buffer[0]); - LF_MUTEX_LOCK(&rti_mutex); - write_to_socket_fail_on_error(&fed->socket, 1 + sizeof(int64_t), buffer, &rti_mutex, - "Clock sync: RTI failed to send physical time to federate %d.", fed->enclave.id); - LF_MUTEX_UNLOCK(&rti_mutex); - } - LF_PRINT_DEBUG("Clock sync: RTI sent PHYSICAL_TIME_SYNC_MESSAGE with timestamp " PRINTF_TIME " to federate %d.", - current_physical_time, fed->enclave.id); -} -void handle_physical_clock_sync_message(federate_info_t* my_fed, socket_type_t socket_type) { - // Lock the mutex to prevent interference between sending the two - // coded probe messages. - LF_MUTEX_LOCK(&rti_mutex); - // Reply with a T4 type message - send_physical_clock(MSG_TYPE_CLOCK_SYNC_T4, my_fed, socket_type); - // Send the corresponding coded probe immediately after, - // but only if this is a UDP channel. 
- if (socket_type == UDP) { - send_physical_clock(MSG_TYPE_CLOCK_SYNC_CODED_PROBE, my_fed, socket_type); - } - LF_MUTEX_UNLOCK(&rti_mutex); -} + void* clock_synchronization_thread(void* noargs) { + initialize_lf_thread_id(); + // Wait until all federates have been notified of the start time. + // FIXME: Use lf_ version of this when merged with master. + LF_MUTEX_LOCK(&rti_mutex); + while (rti_remote->num_feds_proposed_start < rti_remote->base.number_of_scheduling_nodes) { + lf_cond_wait(&received_start_times); + } + LF_MUTEX_UNLOCK(&rti_mutex); -void* clock_synchronization_thread(void* noargs) { - initialize_lf_thread_id(); - // Wait until all federates have been notified of the start time. - // FIXME: Use lf_ version of this when merged with master. - LF_MUTEX_LOCK(&rti_mutex); - while (rti_remote->num_feds_proposed_start < rti_remote->base.number_of_scheduling_nodes) { - lf_cond_wait(&received_start_times); - } - LF_MUTEX_UNLOCK(&rti_mutex); + // Wait until the start time before starting clock synchronization. + // The above wait ensures that start_time has been set. + interval_t ns_to_wait = start_time - lf_time_physical(); - // Wait until the start time before starting clock synchronization. - // The above wait ensures that start_time has been set. - interval_t ns_to_wait = start_time - lf_time_physical(); + if (ns_to_wait > 0LL) { + lf_sleep(ns_to_wait); + } - if (ns_to_wait > 0LL) { - lf_sleep(ns_to_wait); - } + // Initiate a clock synchronization every rti->clock_sync_period_ns + bool any_federates_connected = true; + while (any_federates_connected) { + // Sleep + lf_sleep(rti_remote->clock_sync_period_ns); // Can be interrupted + any_federates_connected = false; + for (int fed_id = 0; fed_id < rti_remote->base.number_of_scheduling_nodes; fed_id++) { + federate_info_t* fed = GET_FED_INFO(fed_id); + if (fed->enclave.state == NOT_CONNECTED) { + // FIXME: We need better error handling here, but clock sync failure + // should not stop execution. 
+ lf_print_error("Clock sync failed with federate %d. Not connected.", fed_id); + continue; + } else if (!fed->clock_synchronization_enabled) { + continue; + } + // Send the RTI's current physical time to the federate + // Send on UDP. + LF_PRINT_DEBUG("RTI sending T1 message to initiate clock sync round."); + send_physical_clock(MSG_TYPE_CLOCK_SYNC_T1, fed, UDP); - // Initiate a clock synchronization every rti->clock_sync_period_ns - bool any_federates_connected = true; - while (any_federates_connected) { - // Sleep - lf_sleep(rti_remote->clock_sync_period_ns); // Can be interrupted - any_federates_connected = false; - for (int fed_id = 0; fed_id < rti_remote->base.number_of_scheduling_nodes; fed_id++) { - federate_info_t* fed = GET_FED_INFO(fed_id); - if (fed->enclave.state == NOT_CONNECTED) { - // FIXME: We need better error handling here, but clock sync failure - // should not stop execution. - lf_print_error("Clock sync failed with federate %d. Not connected.", fed_id); - continue; - } else if (!fed->clock_synchronization_enabled) { - continue; - } - // Send the RTI's current physical time to the federate - // Send on UDP. - LF_PRINT_DEBUG("RTI sending T1 message to initiate clock sync round."); - send_physical_clock(MSG_TYPE_CLOCK_SYNC_T1, fed, UDP); - - // Listen for reply message, which should be T3. - size_t message_size = 1 + sizeof(uint16_t); - unsigned char buffer[message_size]; - // Maximum number of messages that we discard before giving up on this cycle. - // If the T3 message from this federate does not arrive and we keep receiving - // other messages, then give up on this federate and move to the next federate. - int remaining_attempts = 5; - while (remaining_attempts > 0) { - remaining_attempts--; - int read_failed = read_from_socket(rti_remote->socket_descriptor_UDP, message_size, buffer); - // If any errors occur, either discard the message or the clock sync round. 
- if (!read_failed) { - if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { - uint16_t fed_id_2 = extract_uint16(&(buffer[1])); - // Check that this message came from the correct federate. - if (fed_id_2 != fed->enclave.id) { - // Message is from the wrong federate. Discard the message. - lf_print_warning("Clock sync: Received T3 message from federate %d, " - "but expected one from %d. Discarding message.", - fed_id_2, fed->enclave.id); - continue; + // Listen for reply message, which should be T3. + size_t message_size = 1 + sizeof(uint16_t); + unsigned char buffer[message_size]; + // Maximum number of messages that we discard before giving up on this cycle. + // If the T3 message from this federate does not arrive and we keep receiving + // other messages, then give up on this federate and move to the next federate. + int remaining_attempts = 5; + while (remaining_attempts > 0) { + remaining_attempts--; + int read_failed = read_from_socket(rti_remote->socket_descriptor_UDP, message_size, buffer); + // If any errors occur, either discard the message or the clock sync round. + if (!read_failed) { + if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { + uint16_t fed_id_2 = extract_uint16(&(buffer[1])); + // Check that this message came from the correct federate. + if (fed_id_2 != fed->enclave.id) { + // Message is from the wrong federate. Discard the message. + lf_print_warning("Clock sync: Received T3 message from federate %d, " + "but expected one from %d. Discarding message.", + fed_id_2, fed->enclave.id); + continue; + } + LF_PRINT_DEBUG("Clock sync: RTI received T3 message from federate %d.", fed_id_2); + handle_physical_clock_sync_message(GET_FED_INFO(fed_id_2), UDP); + break; + } else { + // The message is not a T3 message. Discard the message and + // continue waiting for the T3 message. This is possibly a message + // from a previous cycle that was discarded. + lf_print_warning("Clock sync: Unexpected UDP message %u. Expected %u from federate %d. 
" + "Discarding message.", + buffer[0], MSG_TYPE_CLOCK_SYNC_T3, fed->enclave.id); + continue; + } + } else { + lf_print_warning("Clock sync: Read from UDP socket failed: %s. " + "Skipping clock sync round for federate %d.", + strerror(errno), fed->enclave.id); + remaining_attempts = -1; } - LF_PRINT_DEBUG("Clock sync: RTI received T3 message from federate %d.", fed_id_2); - handle_physical_clock_sync_message(GET_FED_INFO(fed_id_2), UDP); - break; - } else { - // The message is not a T3 message. Discard the message and - // continue waiting for the T3 message. This is possibly a message - // from a previous cycle that was discarded. - lf_print_warning("Clock sync: Unexpected UDP message %u. Expected %u from federate %d. " - "Discarding message.", - buffer[0], MSG_TYPE_CLOCK_SYNC_T3, fed->enclave.id); - continue; } - } else { - lf_print_warning("Clock sync: Read from UDP socket failed: %s. " - "Skipping clock sync round for federate %d.", - strerror(errno), fed->enclave.id); - remaining_attempts = -1; + if (remaining_attempts > 0) { + any_federates_connected = true; + } } } - if (remaining_attempts > 0) { - any_federates_connected = true; - } + return NULL; } - } - return NULL; -} - -/** - * Handle MSG_TYPE_FAILED sent by a federate. This message is sent by a federate - * that is exiting in failure. In this case, the RTI will - * also terminate abnormally, returning a non-zero exit code when it exits. - * - * This function assumes the caller does not hold the mutex. - * - * @param my_fed The federate sending a MSG_TYPE_FAILED message. - */ -static void handle_federate_failed(federate_info_t* my_fed) { - // Nothing more to do. Close the socket and exit. - LF_MUTEX_LOCK(&rti_mutex); - - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_FAILED, my_fed->enclave.id, NULL); - } - - // Set the flag telling the RTI to exit with an error code when it exits. 
- _lf_federate_reports_error = true; - lf_print_error("RTI: Federate %d reports an error and has exited.", my_fed->enclave.id); - - my_fed->enclave.state = NOT_CONNECTED; - - // Indicate that there will no further events from this federate. - my_fed->enclave.next_event = FOREVER_TAG; - - // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, - // the close should happen when receiving a 0 length message from the other end. - // Here, we just signal the other side that no further writes to the socket are - // forthcoming, which should result in the other end getting a zero-length reception. - shutdown(my_fed->socket, SHUT_RDWR); - - // We can now safely close the socket. - close(my_fed->socket); // from unistd.h - // Check downstream federates to see whether they should now be granted a TAG. - // To handle cycles, need to create a boolean array to keep - // track of which upstream federates have been visited. - bool* visited = (bool*)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. - notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); - free(visited); - - LF_MUTEX_UNLOCK(&rti_mutex); -} - -/** - * Handle MSG_TYPE_RESIGN sent by a federate. This message is sent at the time of termination - * after all shutdown events are processed on the federate. - * - * This function assumes the caller does not hold the mutex. - * - * @note At this point, the RTI might have outgoing messages to the federate. This - * function thus first performs a shutdown on the socket, which sends an EOF. It then - * waits for the remote socket to be closed before closing the socket itself. - * - * @param my_fed The federate sending a MSG_TYPE_RESIGN message. - */ -static void handle_federate_resign(federate_info_t* my_fed) { - // Nothing more to do. Close the socket and exit. - LF_MUTEX_LOCK(&rti_mutex); + /** + * Handle MSG_TYPE_FAILED sent by a federate. 
This message is sent by a federate + * that is exiting in failure. In this case, the RTI will + * also terminate abnormally, returning a non-zero exit code when it exits. + * + * This function assumes the caller does not hold the mutex. + * + * @param my_fed The federate sending a MSG_TYPE_FAILED message. + */ + static void handle_federate_failed(federate_info_t * my_fed) { + // Nothing more to do. Close the socket and exit. + LF_MUTEX_LOCK(&rti_mutex); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_RESIGN, my_fed->enclave.id, NULL); - } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_FAILED, my_fed->enclave.id, NULL); + } - lf_print("RTI: Federate %d has resigned.", my_fed->enclave.id); + // Set the flag telling the RTI to exit with an error code when it exits. + _lf_federate_reports_error = true; + lf_print_error("RTI: Federate %d reports an error and has exited.", my_fed->enclave.id); - my_fed->enclave.state = NOT_CONNECTED; + my_fed->enclave.state = NOT_CONNECTED; - // Indicate that there will no further events from this federate. - my_fed->enclave.next_event = FOREVER_TAG; + // Indicate that there will no further events from this federate. + my_fed->enclave.next_event = FOREVER_TAG; - // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, - // the close should happen when receiving a 0 length message from the other end. - // Here, we just signal the other side that no further writes to the socket are - // forthcoming, which should result in the other end getting a zero-length reception. - shutdown(my_fed->socket, SHUT_WR); + // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, + // the close should happen when receiving a 0 length message from the other end. 
+ // Here, we just signal the other side that no further writes to the socket are + // forthcoming, which should result in the other end getting a zero-length reception. + shutdown(my_fed->socket, SHUT_RDWR); - // Wait for the federate to send an EOF or a socket error to occur. - // Discard any incoming bytes. Normally, this read should return 0 because - // the federate is resigning and should itself invoke shutdown. - unsigned char buffer[10]; - while (read(my_fed->socket, buffer, 10) > 0) - ; + // We can now safely close the socket. + close(my_fed->socket); // from unistd.h - // We can now safely close the socket. - close(my_fed->socket); // from unistd.h + // Check downstream federates to see whether they should now be granted a TAG. + // To handle cycles, need to create a boolean array to keep + // track of which upstream federates have been visited. + bool* visited = (bool*)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. + notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); + free(visited); - // Check downstream federates to see whether they should now be granted a TAG. - // To handle cycles, need to create a boolean array to keep - // track of which upstream federates have been visited. - bool* visited = (bool*)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. - notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); - free(visited); + LF_MUTEX_UNLOCK(&rti_mutex); + } - LF_MUTEX_UNLOCK(&rti_mutex); -} + /** + * Handle MSG_TYPE_RESIGN sent by a federate. This message is sent at the time of termination + * after all shutdown events are processed on the federate. + * + * This function assumes the caller does not hold the mutex. + * + * @note At this point, the RTI might have outgoing messages to the federate. This + * function thus first performs a shutdown on the socket, which sends an EOF. 
It then + * waits for the remote socket to be closed before closing the socket itself. + * + * @param my_fed The federate sending a MSG_TYPE_RESIGN message. + */ + static void handle_federate_resign(federate_info_t * my_fed) { + // Nothing more to do. Close the socket and exit. + LF_MUTEX_LOCK(&rti_mutex); -void* federate_info_thread_TCP(void* fed) { - initialize_lf_thread_id(); - federate_info_t* my_fed = (federate_info_t*)fed; - - // Buffer for incoming messages. - // This does not constrain the message size because messages - // are forwarded piece by piece. - unsigned char buffer[FED_COM_BUFFER_SIZE]; - - // Listen for messages from the federate. - while (my_fed->enclave.state != NOT_CONNECTED) { - // Read no more than one byte to get the message type. - int read_failed = read_from_socket(my_fed->socket, 1, buffer); - if (read_failed) { - // Socket is closed - lf_print_error("RTI: Socket to federate %d is closed. Exiting the thread.", my_fed->enclave.id); - my_fed->enclave.state = NOT_CONNECTED; - my_fed->socket = -1; - // FIXME: We need better error handling here, but do not stop execution here. - break; - } - LF_PRINT_DEBUG("RTI: Received message type %u from federate %d.", buffer[0], my_fed->enclave.id); - switch (buffer[0]) { - case MSG_TYPE_TIMESTAMP: - handle_timestamp(my_fed); - break; - case MSG_TYPE_ADDRESS_QUERY: - handle_address_query(my_fed->enclave.id); - break; - case MSG_TYPE_ADDRESS_ADVERTISEMENT: - handle_address_ad(my_fed->enclave.id); - break; - case MSG_TYPE_TAGGED_MESSAGE: - handle_timed_message(my_fed, buffer); - break; - case MSG_TYPE_RESIGN: - handle_federate_resign(my_fed); - break; - case MSG_TYPE_NEXT_EVENT_TAG: - handle_next_event_tag(my_fed); - break; - case MSG_TYPE_LATEST_TAG_CONFIRMED: - handle_latest_tag_confirmed(my_fed); - break; - case MSG_TYPE_STOP_REQUEST: - handle_stop_request_message(my_fed); // FIXME: Reviewed until here. 
- // Need to also look at - // notify_advance_grant_if_safe() - // and notify_downstream_advance_grant_if_safe() - break; - case MSG_TYPE_STOP_REQUEST_REPLY: - handle_stop_request_reply(my_fed); - break; - case MSG_TYPE_PORT_ABSENT: - handle_port_absent_message(my_fed, buffer); - break; - case MSG_TYPE_FAILED: - handle_federate_failed(my_fed); - return NULL; - default: - lf_print_error("RTI received from federate %d an unrecognized TCP message type: %u.", my_fed->enclave.id, - buffer[0]); if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_UNIDENTIFIED, my_fed->enclave.id, NULL); + tracepoint_rti_from_federate(receive_RESIGN, my_fed->enclave.id, NULL); } - } - } - // Nothing more to do. Close the socket and exit. - // Prevent multiple threads from closing the same socket at the same time. - LF_MUTEX_LOCK(&rti_mutex); - close(my_fed->socket); // from unistd.h - // Manual clean, in case of a transient federate - if (my_fed->is_transient) { - // FIXME: Aren't there transit messages anymore??? - // free_in_transit_message_q(my_fed->in_transit_message_tags); - lf_print("RTI: Transient Federate %d thread exited.", my_fed->enclave.id); - - // Update the number of connected transient federates - rti_remote->number_of_connected_transient_federates--; - - // Reset the status of the leaving federate - reset_transient_federate(my_fed); - } - // Signal the hot swap mechanism, if needed - if (hot_swap_in_progress && hot_swap_federate->enclave.id == my_fed->enclave.id) { - hot_swap_old_resigned = true; - } - LF_MUTEX_UNLOCK(&rti_mutex); - return NULL; -} + lf_print("RTI: Federate %d has resigned.", my_fed->enclave.id); -void send_reject(int* socket_id, unsigned char error_code) { - LF_PRINT_DEBUG("RTI sending MSG_TYPE_REJECT."); - unsigned char response[2]; - response[0] = MSG_TYPE_REJECT; - response[1] = error_code; - LF_MUTEX_LOCK(&rti_mutex); - // NOTE: Ignore errors on this response. 
- if (write_to_socket(*socket_id, 2, response)) { - lf_print_warning("RTI failed to write MSG_TYPE_REJECT message on the socket."); - } - // Close the socket. - shutdown(*socket_id, SHUT_RDWR); - close(*socket_id); - *socket_id = -1; - LF_MUTEX_UNLOCK(&rti_mutex); -} -lf_print("handle_timestamp for transient 1157"); + my_fed->enclave.state = NOT_CONNECTED; -/** - * Listen for a MSG_TYPE_FED_IDS message, which includes as a payload - * a federate ID and a federation ID. If the federation ID - * matches this federation, send an MSG_TYPE_ACK and otherwise send - * a MSG_TYPE_REJECT message. - * @param socket_id Pointer to the socket on which to listen. - * @param client_fd The socket address. - * @return The federate ID for success or -1 for failure. - */ -static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_in* client_fd) { - // Buffer for message ID, federate ID, type (persistent or transient), and federation ID length. - size_t length = 1 + sizeof(uint16_t) + 1 + 1; // Message ID, federate ID, length of fedration ID, type. - unsigned char buffer[length]; - - // Read bytes from the socket. We need 4 bytes. - if (read_from_socket_close_on_error(socket_id, length, buffer)) { - lf_print_error("RTI failed to read from accepted socket."); - return -1; - } + // Indicate that there will no further events from this federate. + my_fed->enclave.next_event = FOREVER_TAG; + + // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, + // the close should happen when receiving a 0 length message from the other end. + // Here, we just signal the other side that no further writes to the socket are + // forthcoming, which should result in the other end getting a zero-length reception. + shutdown(my_fed->socket, SHUT_WR); + + // Wait for the federate to send an EOF or a socket error to occur. + // Discard any incoming bytes. 
Normally, this read should return 0 because + // the federate is resigning and should itself invoke shutdown. + unsigned char buffer[10]; + while (read(my_fed->socket, buffer, 10) > 0) + ; + + // We can now safely close the socket. + close(my_fed->socket); // from unistd.h + + // Check downstream federates to see whether they should now be granted a TAG. + // To handle cycles, need to create a boolean array to keep + // track of which upstream federates have been visited. + bool* visited = (bool*)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. + notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); + free(visited); + + LF_MUTEX_UNLOCK(&rti_mutex); + } - uint16_t fed_id = rti_remote->base.number_of_scheduling_nodes; // Initialize to an invalid value. - bool is_transient = false; + void* federate_info_thread_TCP(void* fed) { + initialize_lf_thread_id(); + federate_info_t* my_fed = (federate_info_t*)fed; + + // Buffer for incoming messages. + // This does not constrain the message size because messages + // are forwarded piece by piece. + unsigned char buffer[FED_COM_BUFFER_SIZE]; + + // Listen for messages from the federate. + while (my_fed->enclave.state != NOT_CONNECTED) { + // Read no more than one byte to get the message type. + int read_failed = read_from_socket(my_fed->socket, 1, buffer); + if (read_failed) { + // Socket is closed + lf_print_error("RTI: Socket to federate %d is closed. Exiting the thread.", my_fed->enclave.id); + my_fed->enclave.state = NOT_CONNECTED; + my_fed->socket = -1; + // FIXME: We need better error handling here, but do not stop execution here. 
+ break; + } + LF_PRINT_DEBUG("RTI: Received message type %u from federate %d.", buffer[0], my_fed->enclave.id); + switch (buffer[0]) { + case MSG_TYPE_TIMESTAMP: + handle_timestamp(my_fed); + break; + case MSG_TYPE_ADDRESS_QUERY: + handle_address_query(my_fed->enclave.id); + break; + case MSG_TYPE_ADDRESS_ADVERTISEMENT: + handle_address_ad(my_fed->enclave.id); + break; + case MSG_TYPE_TAGGED_MESSAGE: + handle_timed_message(my_fed, buffer); + break; + case MSG_TYPE_RESIGN: + handle_federate_resign(my_fed); + break; + case MSG_TYPE_NEXT_EVENT_TAG: + handle_next_event_tag(my_fed); + break; + case MSG_TYPE_LATEST_TAG_CONFIRMED: + handle_latest_tag_confirmed(my_fed); + break; + case MSG_TYPE_STOP_REQUEST: + handle_stop_request_message(my_fed); // FIXME: Reviewed until here. + // Need to also look at + // notify_advance_grant_if_safe() + // and notify_downstream_advance_grant_if_safe() + break; + case MSG_TYPE_STOP_REQUEST_REPLY: + handle_stop_request_reply(my_fed); + break; + case MSG_TYPE_PORT_ABSENT: + handle_port_absent_message(my_fed, buffer); + break; + case MSG_TYPE_FAILED: + handle_federate_failed(my_fed); + return NULL; + default: + lf_print_error("RTI received from federate %d an unrecognized TCP message type: %u.", my_fed->enclave.id, + buffer[0]); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_UNIDENTIFIED, my_fed->enclave.id, NULL); + } + } + } - // First byte received is the message type. - if (buffer[0] != MSG_TYPE_FED_IDS) { - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); - } - if (buffer[0] == MSG_TYPE_P2P_SENDING_FED_ID || buffer[0] == MSG_TYPE_P2P_TAGGED_MESSAGE) { - // The federate is trying to connect to a peer, not to the RTI. - // It has connected to the RTI instead. - // FIXME: This should not happen, but apparently has been observed. 
- // It should not happen because the peers get the port and IP address - // of the peer they want to connect to from the RTI. - // If the connection is a peer-to-peer connection between two - // federates, reject the connection with the WRONG_SERVER error. - send_reject(socket_id, WRONG_SERVER); - } else if (buffer[0] == MSG_TYPE_FED_NONCE) { - send_reject(socket_id, RTI_NOT_EXECUTED_WITH_AUTH); - lf_print_error("RTI not executed with HMAC authentication option using -a or --auth."); - } else { - send_reject(socket_id, UNEXPECTED_MESSAGE); - } - lf_print_error("RTI expected a MSG_TYPE_FED_IDS message. Got %u (see net_common.h).", buffer[0]); - return -1; - } else { - // Received federate ID. - fed_id = extract_uint16(buffer + 1); - is_transient = (buffer[sizeof(uint16_t) + 1] == 1) ? true : false; - if (is_transient) { - LF_PRINT_LOG("RTI received federate ID: %d, which is transient.", fed_id); - } else { - LF_PRINT_LOG("RTI received federate ID: %d, which is persistent.", fed_id); + // Nothing more to do. Close the socket and exit. + // Prevent multiple threads from closing the same socket at the same time. + LF_MUTEX_LOCK(&rti_mutex); + close(my_fed->socket); // from unistd.h + // Manual clean, in case of a transient federate + if (my_fed->is_transient) { + // FIXME: Aren't there transit messages anymore??? + // free_in_transit_message_q(my_fed->in_transit_message_tags); + lf_print("RTI: Transient Federate %d thread exited.", my_fed->enclave.id); + + // Update the number of connected transient federates + rti_remote->number_of_connected_transient_federates--; + + // Reset the status of the leaving federate + reset_transient_federate(my_fed); + } + // Signal the hot swap mechanism, if needed + if (hot_swap_in_progress && hot_swap_federate->enclave.id == my_fed->enclave.id) { + hot_swap_old_resigned = true; + } + LF_MUTEX_UNLOCK(&rti_mutex); + return NULL; } - // Read the federation ID. First read the length, which is one byte. 
- size_t federation_id_length = (size_t)buffer[sizeof(uint16_t) + 2]; - char federation_id_received[federation_id_length + 1]; // One extra for null terminator. - // Next read the actual federation ID. - if (read_from_socket_close_on_error(socket_id, federation_id_length, (unsigned char*)federation_id_received)) { - lf_print_error("RTI failed to read federation id from federate %d.", fed_id); - return -1; + void send_reject(int* socket_id, unsigned char error_code) { + LF_PRINT_DEBUG("RTI sending MSG_TYPE_REJECT."); + unsigned char response[2]; + response[0] = MSG_TYPE_REJECT; + response[1] = error_code; + LF_MUTEX_LOCK(&rti_mutex); + // NOTE: Ignore errors on this response. + if (write_to_socket(*socket_id, 2, response)) { + lf_print_warning("RTI failed to write MSG_TYPE_REJECT message on the socket."); + } + // Close the socket. + shutdown(*socket_id, SHUT_RDWR); + close(*socket_id); + *socket_id = -1; + LF_MUTEX_UNLOCK(&rti_mutex); } + lf_print("handle_timestamp for transient 1157"); + + /** + * Listen for a MSG_TYPE_FED_IDS message, which includes as a payload + * a federate ID and a federation ID. If the federation ID + * matches this federation, send an MSG_TYPE_ACK and otherwise send + * a MSG_TYPE_REJECT message. + * @param socket_id Pointer to the socket on which to listen. + * @param client_fd The socket address. + * @return The federate ID for success or -1 for failure. + */ + static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_in* client_fd) { + // Buffer for message ID, federate ID, type (persistent or transient), and federation ID length. + size_t length = 1 + sizeof(uint16_t) + 1 + 1; // Message ID, federate ID, length of fedration ID, type. + unsigned char buffer[length]; + + // Read bytes from the socket. We need 4 bytes. + if (read_from_socket_close_on_error(socket_id, length, buffer)) { + lf_print_error("RTI failed to read from accepted socket."); + return -1; + } - // Terminate the string with a null. 
- federation_id_received[federation_id_length] = 0; - - LF_PRINT_DEBUG("RTI received federation ID: %s.", federation_id_received); + uint16_t fed_id = rti_remote->base.number_of_scheduling_nodes; // Initialize to an invalid value. + bool is_transient = false; - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(rti_remote->base.trace, receive_FED_ID, fed_id, NULL); - } - // Compare the received federation ID to mine. - if (strncmp(rti_remote->federation_id, federation_id_received, federation_id_length) != 0) { - // Federation IDs do not match. Send back a MSG_TYPE_REJECT message. - lf_print_warning("Federate from another federation %s attempted to connect to RTI in federation %s.", - federation_id_received, rti_remote->federation_id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); - } - send_reject(socket_id, FEDERATION_ID_DOES_NOT_MATCH); - return -1; - } else { - if (fed_id >= rti_remote->base.number_of_scheduling_nodes) { - // Federate ID is out of range. - lf_print_error("RTI received federate ID %d, which is out of range.", fed_id); + // First byte received is the message type. + if (buffer[0] != MSG_TYPE_FED_IDS) { if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); } - send_reject(socket_id, FEDERATE_ID_OUT_OF_RANGE); + if (buffer[0] == MSG_TYPE_P2P_SENDING_FED_ID || buffer[0] == MSG_TYPE_P2P_TAGGED_MESSAGE) { + // The federate is trying to connect to a peer, not to the RTI. + // It has connected to the RTI instead. + // FIXME: This should not happen, but apparently has been observed. + // It should not happen because the peers get the port and IP address + // of the peer they want to connect to from the RTI. + // If the connection is a peer-to-peer connection between two + // federates, reject the connection with the WRONG_SERVER error. 
+ send_reject(socket_id, WRONG_SERVER); + } else if (buffer[0] == MSG_TYPE_FED_NONCE) { + send_reject(socket_id, RTI_NOT_EXECUTED_WITH_AUTH); + lf_print_error("RTI not executed with HMAC authentication option using -a or --auth."); + } else { + send_reject(socket_id, UNEXPECTED_MESSAGE); + } + lf_print_error("RTI expected a MSG_TYPE_FED_IDS message. Got %u (see net_common.h).", buffer[0]); return -1; } else { - // Find out if it is a new connection or a hot swap. - // Reject if: - // - duplicate of a connected persistent federate - // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that - // particular federate - // - or it is a hot swap, but it is not the execution phase yet - // Find out if it is a new connection or a hot swap. - // Reject if: - // - duplicate of a connected persistent federate - // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that - // particular federate - // - or it is a hot swap, but it is not the execution phase yet - if ((rti_remote->base.scheduling_nodes[fed_id])->state != NOT_CONNECTED) { - if (!is_transient) { - lf_print_error("RTI received duplicate federate ID: %d.", fed_id); + // Received federate ID. + fed_id = extract_uint16(buffer + 1); + is_transient = (buffer[sizeof(uint16_t) + 1] == 1) ? true : false; + if (is_transient) { + LF_PRINT_LOG("RTI received federate ID: %d, which is transient.", fed_id); + } else { + LF_PRINT_LOG("RTI received federate ID: %d, which is persistent.", fed_id); + } + + // Read the federation ID. First read the length, which is one byte. + size_t federation_id_length = (size_t)buffer[sizeof(uint16_t) + 2]; + char federation_id_received[federation_id_length + 1]; // One extra for null terminator. + // Next read the actual federation ID. 
+ if (read_from_socket_close_on_error(socket_id, federation_id_length, (unsigned char*)federation_id_received)) { + lf_print_error("RTI failed to read federation id from federate %d.", fed_id); + return -1; + } + + // Terminate the string with a null. + federation_id_received[federation_id_length] = 0; + + LF_PRINT_DEBUG("RTI received federation ID: %s.", federation_id_received); + + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(rti_remote->base.trace, receive_FED_ID, fed_id, NULL); + } + // Compare the received federation ID to mine. + if (strncmp(rti_remote->federation_id, federation_id_received, federation_id_length) != 0) { + // Federation IDs do not match. Send back a MSG_TYPE_REJECT message. + lf_print_warning("Federate from another federation %s attempted to connect to RTI in federation %s.", + federation_id_received, rti_remote->federation_id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); + } + send_reject(socket_id, FEDERATION_ID_DOES_NOT_MATCH); + return -1; + } else { + if (fed_id >= rti_remote->base.number_of_scheduling_nodes) { + // Federate ID is out of range. + lf_print_error("RTI received federate ID %d, which is out of range.", fed_id); if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); } - send_reject(socket_id, FEDERATE_ID_IN_USE); + send_reject(socket_id, FEDERATE_ID_OUT_OF_RANGE); return -1; - } else if (hot_swap_in_progress || rti_remote->phase != execution_phase) { - lf_print_warning("RTI rejects the connection of transient federate %d, \ + } else { + // Find out if it is a new connection or a hot swap. 
+ // Reject if: + // - duplicate of a connected persistent federate + // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that + // particular federate + // - or it is a hot swap, but it is not the execution phase yet + // Find out if it is a new connection or a hot swap. + // Reject if: + // - duplicate of a connected persistent federate + // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that + // particular federate + // - or it is a hot swap, but it is not the execution phase yet + if ((rti_remote->base.scheduling_nodes[fed_id])->state != NOT_CONNECTED) { + if (!is_transient) { + lf_print_error("RTI received duplicate federate ID: %d.", fed_id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + } + send_reject(socket_id, FEDERATE_ID_IN_USE); + return -1; + } else if (hot_swap_in_progress || rti_remote->phase != execution_phase) { + lf_print_warning("RTI rejects the connection of transient federate %d, \ because a hot swap is already in progress for federate %d. \n\ Only one hot swap operation is allowed at a time.", - fed_id, hot_swap_federate->enclave.id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + fed_id, hot_swap_federate->enclave.id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + } + send_reject(socket_id, FEDERATE_ID_IN_USE); + return -1; + } } - send_reject(socket_id, FEDERATE_ID_IN_USE); - return -1; } } } - } - } - federate_info_t* fed_twin = GET_FED_INFO(fed_id); - federate_info_t* fed; - // If the federate is already connected (making the request a duplicate), and that - // the federate is transient, and it is the execution phase, then mark that a hot - // swap is in progreass and initialize the hot_swap_federate. 
- // Otherwise, proceed with a normal transinet connection - if (fed_twin->enclave.state != NOT_CONNECTED && is_transient && fed_twin->is_transient && - rti_remote->phase == execution_phase && !hot_swap_in_progress) { - // Allocate memory for the new federate and initilize it - hot_swap_federate = (federate_info_t*)malloc(sizeof(federate_info_t)); - initialize_federate(hot_swap_federate, fed_id); - - // Set that hot swap is in progress - hot_swap_in_progress = true; - // free(fed); // Free the old memory to prevent memory leak - fed = hot_swap_federate; - lf_print("RTI: Hot Swap starting for federate %d.", fed_id); - } else { - fed = fed_twin; - fed->is_transient = is_transient; - } + federate_info_t* fed_twin = GET_FED_INFO(fed_id); + federate_info_t* fed; + // If the federate is already connected (making the request a duplicate), and that + // the federate is transient, and it is the execution phase, then mark that a hot + // swap is in progreass and initialize the hot_swap_federate. + // Otherwise, proceed with a normal transinet connection + if (fed_twin->enclave.state != NOT_CONNECTED && is_transient && fed_twin->is_transient && + rti_remote->phase == execution_phase && !hot_swap_in_progress) { + // Allocate memory for the new federate and initilize it + hot_swap_federate = (federate_info_t*)malloc(sizeof(federate_info_t)); + initialize_federate(hot_swap_federate, fed_id); + + // Set that hot swap is in progress + hot_swap_in_progress = true; + // free(fed); // Free the old memory to prevent memory leak + fed = hot_swap_federate; + lf_print("RTI: Hot Swap starting for federate %d.", fed_id); + } else { + fed = fed_twin; + fed->is_transient = is_transient; + } - // The MSG_TYPE_FED_IDS message has the right federation ID. + // The MSG_TYPE_FED_IDS message has the right federation ID. - // Get the peer address from the connected socket_id. Then assign it as the federate's socket server. 
- struct sockaddr_in peer_addr; - socklen_t addr_len = sizeof(peer_addr); - if (getpeername(*socket_id, (struct sockaddr*)&peer_addr, &addr_len) != 0) { - lf_print_error("RTI failed to get peer address."); - } - fed->server_ip_addr = peer_addr.sin_addr; + // Get the peer address from the connected socket_id. Then assign it as the federate's socket server. + struct sockaddr_in peer_addr; + socklen_t addr_len = sizeof(peer_addr); + if (getpeername(*socket_id, (struct sockaddr*)&peer_addr, &addr_len) != 0) { + lf_print_error("RTI failed to get peer address."); + } + fed->server_ip_addr = peer_addr.sin_addr; #if LOG_LEVEL >= LOG_LEVEL_DEBUG - // Create the human readable format and copy that into - // the .server_hostname field of the federate. - char str[INET_ADDRSTRLEN + 1]; - inet_ntop(AF_INET, &fed->server_ip_addr, str, INET_ADDRSTRLEN); - strncpy(fed->server_hostname, str, INET_ADDRSTRLEN); + // Create the human readable format and copy that into + // the .server_hostname field of the federate. + char str[INET_ADDRSTRLEN + 1]; + inet_ntop(AF_INET, &fed->server_ip_addr, str, INET_ADDRSTRLEN); + strncpy(fed->server_hostname, str, INET_ADDRSTRLEN); - LF_PRINT_DEBUG("RTI got address %s from federate %d.", fed->server_hostname, fed_id); + LF_PRINT_DEBUG("RTI got address %s from federate %d.", fed->server_hostname, fed_id); #endif - fed->socket = *socket_id; - - // Set the federate's state as pending - // because it is waiting for the start time to be - // sent by the RTI before beginning its execution. - fed->enclave.state = PENDING; - - LF_PRINT_DEBUG("RTI responding with MSG_TYPE_ACK to federate %d.", fed_id); - // Send an MSG_TYPE_ACK message. 
- unsigned char ack_message = MSG_TYPE_ACK; - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_ACK, fed_id, NULL); - } - LF_MUTEX_LOCK(&rti_mutex); - if (write_to_socket_close_on_error(&fed->socket, 1, &ack_message)) { - LF_MUTEX_UNLOCK(&rti_mutex); - lf_print_error("RTI failed to write MSG_TYPE_ACK message to federate %d.", fed_id); - return -1; - } - LF_MUTEX_UNLOCK(&rti_mutex); - - LF_PRINT_DEBUG("RTI sent MSG_TYPE_ACK to federate %d.", fed_id); + fed->socket = *socket_id; - return (int32_t)fed_id; -} + // Set the federate's state as pending + // because it is waiting for the start time to be + // sent by the RTI before beginning its execution. + fed->enclave.state = PENDING; -/** - * Listen for a MSG_TYPE_NEIGHBOR_STRUCTURE message, and upon receiving it, fill - * out the relevant information in the federate's struct. - * - * In case of a hot swap, check that no changes were made to the connections, compared - * to the first instance that joigned. This means that the first instance to join - * __is__ the reference. - * - * @return 1 on success and 0 on failure. - */ -static int receive_connection_information(int* socket_id, uint16_t fed_id) { - LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_NEIGHBOR_STRUCTURE from federate %d.", fed_id); - unsigned char connection_info_header[MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE]; - read_from_socket_fail_on_error(socket_id, MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE, connection_info_header, NULL, - "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message header from federate %d.", - fed_id); - - if (connection_info_header[0] != MSG_TYPE_NEIGHBOR_STRUCTURE) { - lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. 
" - "Rejecting federate.", - fed_id, connection_info_header[0]); - send_reject(socket_id, UNEXPECTED_MESSAGE); - return 0; - } else { - // In case of a transient federate that is joining again, or a hot swap, then - // check that the connection information did not change. - federate_info_t* fed = GET_FED_INFO(fed_id); - federate_info_t* temp_fed = NULL; - if (lf_tag_compare(fed->effective_start_tag, NEVER_TAG) != 0) { - if (hot_swap_in_progress) { - fed = hot_swap_federate; - } else { - temp_fed = (federate_info_t*)calloc(1, sizeof(federate_info_t)); - initialize_federate(temp_fed, fed_id); - fed = temp_fed; + LF_PRINT_DEBUG("RTI responding with MSG_TYPE_ACK to federate %d.", fed_id); + // Send an MSG_TYPE_ACK message. + unsigned char ack_message = MSG_TYPE_ACK; + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_ACK, fed_id, NULL); } - } - // Read the number of upstream and downstream connections - fed->enclave.num_upstream = extract_int32(&(connection_info_header[1])); - fed->enclave.num_downstream = extract_int32(&(connection_info_header[1 + sizeof(int32_t)])); - LF_PRINT_DEBUG("RTI got %d upstreams and %d downstreams from federate %d.", fed->enclave.num_upstream, - fed->enclave.num_downstream, fed_id); - - // Allocate memory for the upstream and downstream pointers - if (fed->enclave.num_upstream > 0) { - fed->enclave.upstream = (uint16_t*)malloc(sizeof(uint16_t) * fed->enclave.num_upstream); - LF_ASSERT_NON_NULL(fed->enclave.upstream); - // Allocate memory for the upstream delay pointers - fed->enclave.upstream_delay = (interval_t*)malloc(sizeof(interval_t) * fed->enclave.num_upstream); - LF_ASSERT_NON_NULL(fed->enclave.upstream_delay); - } else { - fed->enclave.upstream = (uint16_t*)NULL; - fed->enclave.upstream_delay = (interval_t*)NULL; - } - if (fed->enclave.num_downstream > 0) { - fed->enclave.downstream = (uint16_t*)malloc(sizeof(uint16_t) * fed->enclave.num_downstream); - LF_ASSERT_NON_NULL(fed->enclave.downstream); - } else { - 
fed->enclave.downstream = (uint16_t*)NULL; - } - - size_t connections_info_body_size = ((sizeof(uint16_t) + sizeof(int64_t)) * fed->enclave.num_upstream) + - (sizeof(uint16_t) * fed->enclave.num_downstream); - unsigned char* connections_info_body = NULL; - if (connections_info_body_size > 0) { - connections_info_body = (unsigned char*)malloc(connections_info_body_size); - LF_ASSERT_NON_NULL(connections_info_body); - read_from_socket_fail_on_error(socket_id, connections_info_body_size, connections_info_body, NULL, - "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message body from federate %d.", - fed_id); - // Keep track of where we are in the buffer - size_t message_head = 0; - // First, read the info about upstream federates - for (int i = 0; i < fed->enclave.num_upstream; i++) { - fed->enclave.upstream[i] = extract_uint16(&(connections_info_body[message_head])); - message_head += sizeof(uint16_t); - fed->enclave.upstream_delay[i] = extract_int64(&(connections_info_body[message_head])); - message_head += sizeof(int64_t); + LF_MUTEX_LOCK(&rti_mutex); + if (write_to_socket_close_on_error(&fed->socket, 1, &ack_message)) { + LF_MUTEX_UNLOCK(&rti_mutex); + lf_print_error("RTI failed to write MSG_TYPE_ACK message to federate %d.", fed_id); + return -1; } + LF_MUTEX_UNLOCK(&rti_mutex); - // Next, read the info about downstream federates - for (int i = 0; i < fed->enclave.num_downstream; i++) { - fed->enclave.downstream[i] = extract_uint16(&(connections_info_body[message_head])); - message_head += sizeof(uint16_t); - } + LF_PRINT_DEBUG("RTI sent MSG_TYPE_ACK to federate %d.", fed_id); - free(connections_info_body); + return (int32_t)fed_id; } - // NOTE: In this design, changes in the connections are not allowed. This means that the first - // instance to join __is__ the reference. If this policy is to be changed, then it is in - // the following lines will be updated accordingly. 
- if (hot_swap_in_progress || temp_fed != NULL) { - if (temp_fed == NULL) { - temp_fed = hot_swap_federate; - } - // Now, compare the previous and the new neighberhood structure - // Start with the number of upstreams and downstreams - bool reject = false; - if ((fed->enclave.num_upstream != temp_fed->enclave.num_upstream) || - (fed->enclave.num_downstream != temp_fed->enclave.num_downstream)) { - reject = true; + /** + * Listen for a MSG_TYPE_NEIGHBOR_STRUCTURE message, and upon receiving it, fill + * out the relevant information in the federate's struct. + * + * In case of a hot swap, check that no changes were made to the connections, compared + * to the first instance that joigned. This means that the first instance to join + * __is__ the reference. + * + * @return 1 on success and 0 on failure. + */ + static int receive_connection_information(int* socket_id, uint16_t fed_id) { + LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_NEIGHBOR_STRUCTURE from federate %d.", fed_id); + unsigned char connection_info_header[MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE]; + read_from_socket_fail_on_error(socket_id, MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE, connection_info_header, NULL, + "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message header from federate %d.", + fed_id); + + if (connection_info_header[0] != MSG_TYPE_NEIGHBOR_STRUCTURE) { + lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " + "Rejecting federate.", + fed_id, connection_info_header[0]); + send_reject(socket_id, UNEXPECTED_MESSAGE); + return 0; } else { - // Then check all upstreams and their delays - for (int i = 0; i < fed->enclave.num_upstream; i++) { - if ((fed->enclave.upstream[i] != temp_fed->enclave.upstream[i]) || - (fed->enclave.upstream_delay[i] != temp_fed->enclave.upstream_delay[i])) { - reject = true; - break; + // In case of a transient federate that is joining again, or a hot swap, then + // check that the connection information did not change. 
+ federate_info_t* fed = GET_FED_INFO(fed_id); + federate_info_t* temp_fed = NULL; + if (lf_tag_compare(fed->effective_start_tag, NEVER_TAG) != 0) { + if (hot_swap_in_progress) { + fed = hot_swap_federate; + } else { + temp_fed = (federate_info_t*)calloc(1, sizeof(federate_info_t)); + initialize_federate(temp_fed, fed_id); + fed = temp_fed; } } - if (!reject) { - // Finally, check all downstream federates - for (int i = 0; i < fed->enclave.num_downstream; i++) { - if (fed->enclave.downstream[i] != temp_fed->enclave.downstream[i]) { - reject = true; - break; - } - } + // Read the number of upstream and downstream connections + fed->enclave.num_upstream = extract_int32(&(connection_info_header[1])); + fed->enclave.num_downstream = extract_int32(&(connection_info_header[1 + sizeof(int32_t)])); + LF_PRINT_DEBUG("RTI got %d upstreams and %d downstreams from federate %d.", fed->enclave.num_upstream, + fed->enclave.num_downstream, fed_id); + + // Allocate memory for the upstream and downstream pointers + if (fed->enclave.num_upstream > 0) { + fed->enclave.upstream = (uint16_t*)malloc(sizeof(uint16_t) * fed->enclave.num_upstream); + LF_ASSERT_NON_NULL(fed->enclave.upstream); + // Allocate memory for the upstream delay pointers + fed->enclave.upstream_delay = (interval_t*)malloc(sizeof(interval_t) * fed->enclave.num_upstream); + LF_ASSERT_NON_NULL(fed->enclave.upstream_delay); + } else { + fed->enclave.upstream = (uint16_t*)NULL; + fed->enclave.upstream_delay = (interval_t*)NULL; } - } - if (reject) { - if (temp_fed != hot_swap_federate) { - free(temp_fed); + if (fed->enclave.num_downstream > 0) { + fed->enclave.downstream = (uint16_t*)malloc(sizeof(uint16_t) * fed->enclave.num_downstream); + LF_ASSERT_NON_NULL(fed->enclave.downstream); + } else { + fed->enclave.downstream = (uint16_t*)NULL; } - return 0; - } - } - } - LF_PRINT_DEBUG("RTI received neighbor structure from federate %d.", fed_id); - return 1; -} -/** - * Listen for a MSG_TYPE_UDP_PORT message, and upon 
receiving it, set up - * clock synchronization and perform the initial clock synchronization. - * Initial clock synchronization is performed only if the MSG_TYPE_UDP_PORT message - * payload is not UINT16_MAX. If it is also not 0, then this function sets - * up to perform runtime clock synchronization using the UDP port number - * specified in the payload to communicate with the federate's clock - * synchronization logic. - * @param socket_id The socket on which to listen. - * @param fed_id The federate ID. - * @return 1 for success, 0 for failure. - */ -static int receive_udp_message_and_set_up_clock_sync(int* socket_id, uint16_t fed_id) { - // Read the MSG_TYPE_UDP_PORT message from the federate regardless of the status of - // clock synchronization. This message will tell the RTI whether the federate - // is doing clock synchronization, and if it is, what port to use for UDP. - LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_UDP_PORT from federate %d.", fed_id); - unsigned char response[1 + sizeof(uint16_t)]; - read_from_socket_fail_on_error(socket_id, 1 + sizeof(uint16_t), response, NULL, - "RTI failed to read MSG_TYPE_UDP_PORT message from federate %d.", fed_id); - if (response[0] != MSG_TYPE_UDP_PORT) { - lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " - "Rejecting federate.", - fed_id, response[0]); - send_reject(socket_id, UNEXPECTED_MESSAGE); - return 0; - } else { - federate_info_t* fed; - if (hot_swap_in_progress) { - fed = hot_swap_federate; - } else { - fed = GET_FED_INFO(fed_id); - } - if (rti_remote->clock_sync_global_status >= clock_sync_init) { - // If no initial clock sync, no need perform initial clock sync. 
- uint16_t federate_UDP_port_number = extract_uint16(&(response[1])); + size_t connections_info_body_size = ((sizeof(uint16_t) + sizeof(int64_t)) * fed->enclave.num_upstream) + + (sizeof(uint16_t) * fed->enclave.num_downstream); + unsigned char* connections_info_body = NULL; + if (connections_info_body_size > 0) { + connections_info_body = (unsigned char*)malloc(connections_info_body_size); + LF_ASSERT_NON_NULL(connections_info_body); + read_from_socket_fail_on_error( + socket_id, connections_info_body_size, connections_info_body, NULL, + "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message body from federate %d.", fed_id); + // Keep track of where we are in the buffer + size_t message_head = 0; + // First, read the info about upstream federates + for (int i = 0; i < fed->enclave.num_upstream; i++) { + fed->enclave.upstream[i] = extract_uint16(&(connections_info_body[message_head])); + message_head += sizeof(uint16_t); + fed->enclave.upstream_delay[i] = extract_int64(&(connections_info_body[message_head])); + message_head += sizeof(int64_t); + } - LF_PRINT_DEBUG("RTI got MSG_TYPE_UDP_PORT %u from federate %d.", federate_UDP_port_number, fed_id); + // Next, read the info about downstream federates + for (int i = 0; i < fed->enclave.num_downstream; i++) { + fed->enclave.downstream[i] = extract_uint16(&(connections_info_body[message_head])); + message_head += sizeof(uint16_t); + } - // A port number of UINT16_MAX means initial clock sync should not be performed. - if (federate_UDP_port_number != UINT16_MAX) { - // Perform the initialization clock synchronization with the federate. - // Send the required number of messages for clock synchronization - for (int i = 0; i < rti_remote->clock_sync_exchanges_per_interval; i++) { - // Send the RTI's current physical time T1 to the federate. - send_physical_clock(MSG_TYPE_CLOCK_SYNC_T1, fed, TCP); + free(connections_info_body); + } - // Listen for reply message, which should be T3. 
- size_t message_size = 1 + sizeof(uint16_t); - unsigned char buffer[message_size]; - read_from_socket_fail_on_error(socket_id, message_size, buffer, NULL, - "Socket to federate %d unexpectedly closed.", fed_id); - if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { - uint16_t fed_id = extract_uint16(&(buffer[1])); - LF_PRINT_DEBUG("RTI received T3 clock sync message from federate %d.", fed_id); - handle_physical_clock_sync_message(fed, TCP); + // NOTE: In this design, changes in the connections are not allowed. This means that the first + // instance to join __is__ the reference. If this policy is to be changed, then it is in + // the following lines will be updated accordingly. + if (hot_swap_in_progress || temp_fed != NULL) { + if (temp_fed == NULL) { + temp_fed = hot_swap_federate; + } + // Now, compare the previous and the new neighberhood structure + // Start with the number of upstreams and downstreams + bool reject = false; + if ((fed->enclave.num_upstream != temp_fed->enclave.num_upstream) || + (fed->enclave.num_downstream != temp_fed->enclave.num_downstream)) { + reject = true; } else { - lf_print_error("Unexpected message %u from federate %d.", buffer[0], fed_id); - send_reject(socket_id, UNEXPECTED_MESSAGE); + // Then check all upstreams and their delays + for (int i = 0; i < fed->enclave.num_upstream; i++) { + if ((fed->enclave.upstream[i] != temp_fed->enclave.upstream[i]) || + (fed->enclave.upstream_delay[i] != temp_fed->enclave.upstream_delay[i])) { + reject = true; + break; + } + } + if (!reject) { + // Finally, check all downstream federates + for (int i = 0; i < fed->enclave.num_downstream; i++) { + if (fed->enclave.downstream[i] != temp_fed->enclave.downstream[i]) { + reject = true; + break; + } + } + } + } + if (reject) { + if (temp_fed != hot_swap_federate) { + free(temp_fed); + } return 0; } } - LF_PRINT_DEBUG("RTI finished initial clock synchronization with federate %d.", fed_id); } - if (rti_remote->clock_sync_global_status >= clock_sync_on) { - // 
If no runtime clock sync, no need to set up the UDP port. - if (federate_UDP_port_number > 0) { - // Initialize the UDP_addr field of the federate struct - fed->UDP_addr.sin_family = AF_INET; - fed->UDP_addr.sin_port = htons(federate_UDP_port_number); - fed->UDP_addr.sin_addr = fed->server_ip_addr; - } + LF_PRINT_DEBUG("RTI received neighbor structure from federate %d.", fed_id); + return 1; + } + + /** + * Listen for a MSG_TYPE_UDP_PORT message, and upon receiving it, set up + * clock synchronization and perform the initial clock synchronization. + * Initial clock synchronization is performed only if the MSG_TYPE_UDP_PORT message + * payload is not UINT16_MAX. If it is also not 0, then this function sets + * up to perform runtime clock synchronization using the UDP port number + * specified in the payload to communicate with the federate's clock + * synchronization logic. + * @param socket_id The socket on which to listen. + * @param fed_id The federate ID. + * @return 1 for success, 0 for failure. + */ + static int receive_udp_message_and_set_up_clock_sync(int* socket_id, uint16_t fed_id) { + // Read the MSG_TYPE_UDP_PORT message from the federate regardless of the status of + // clock synchronization. This message will tell the RTI whether the federate + // is doing clock synchronization, and if it is, what port to use for UDP. + LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_UDP_PORT from federate %d.", fed_id); + unsigned char response[1 + sizeof(uint16_t)]; + read_from_socket_fail_on_error(socket_id, 1 + sizeof(uint16_t), response, NULL, + "RTI failed to read MSG_TYPE_UDP_PORT message from federate %d.", fed_id); + if (response[0] != MSG_TYPE_UDP_PORT) { + lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " + "Rejecting federate.", + fed_id, response[0]); + send_reject(socket_id, UNEXPECTED_MESSAGE); + return 0; } else { - // Disable clock sync after initial round. 
- fed->clock_synchronization_enabled = false; + federate_info_t* fed; + if (hot_swap_in_progress) { + fed = hot_swap_federate; + } else { + fed = GET_FED_INFO(fed_id); + } + if (rti_remote->clock_sync_global_status >= clock_sync_init) { + // If no initial clock sync, no need perform initial clock sync. + uint16_t federate_UDP_port_number = extract_uint16(&(response[1])); + + LF_PRINT_DEBUG("RTI got MSG_TYPE_UDP_PORT %u from federate %d.", federate_UDP_port_number, fed_id); + + // A port number of UINT16_MAX means initial clock sync should not be performed. + if (federate_UDP_port_number != UINT16_MAX) { + // Perform the initialization clock synchronization with the federate. + // Send the required number of messages for clock synchronization + for (int i = 0; i < rti_remote->clock_sync_exchanges_per_interval; i++) { + // Send the RTI's current physical time T1 to the federate. + send_physical_clock(MSG_TYPE_CLOCK_SYNC_T1, fed, TCP); + + // Listen for reply message, which should be T3. + size_t message_size = 1 + sizeof(uint16_t); + unsigned char buffer[message_size]; + read_from_socket_fail_on_error(socket_id, message_size, buffer, NULL, + "Socket to federate %d unexpectedly closed.", fed_id); + if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { + uint16_t fed_id = extract_uint16(&(buffer[1])); + LF_PRINT_DEBUG("RTI received T3 clock sync message from federate %d.", fed_id); + handle_physical_clock_sync_message(fed, TCP); + } else { + lf_print_error("Unexpected message %u from federate %d.", buffer[0], fed_id); + send_reject(socket_id, UNEXPECTED_MESSAGE); + return 0; + } + } + LF_PRINT_DEBUG("RTI finished initial clock synchronization with federate %d.", fed_id); + } + if (rti_remote->clock_sync_global_status >= clock_sync_on) { + // If no runtime clock sync, no need to set up the UDP port. 
+ if (federate_UDP_port_number > 0) { + // Initialize the UDP_addr field of the federate struct + fed->UDP_addr.sin_family = AF_INET; + fed->UDP_addr.sin_port = htons(federate_UDP_port_number); + fed->UDP_addr.sin_addr = fed->server_ip_addr; + } + } else { + // Disable clock sync after initial round. + fed->clock_synchronization_enabled = false; + } + } else { + // No clock synchronization at all. + LF_PRINT_DEBUG("RTI: No clock synchronization for federate %d.", fed_id); + // Clock synchronization is universally disabled via the clock-sync command-line parameter + // (-c off was passed to the RTI). + // Note that the federates are still going to send a + // MSG_TYPE_UDP_PORT message but with a payload (port) of -1. + fed->clock_synchronization_enabled = false; + } } - } else { - // No clock synchronization at all. - LF_PRINT_DEBUG("RTI: No clock synchronization for federate %d.", fed_id); - // Clock synchronization is universally disabled via the clock-sync command-line parameter - // (-c off was passed to the RTI). - // Note that the federates are still going to send a - // MSG_TYPE_UDP_PORT message but with a payload (port) of -1. - fed->clock_synchronization_enabled = false; + return 1; } - } - return 1; -} #ifdef __RTI_AUTH__ -/** - * Authenticate incoming federate by performing HMAC-based authentication. - * - * @param socket Socket for the incoming federate tryting to authenticate. - * @return True if authentication is successful and false otherwise. - */ -static bool authenticate_federate(int* socket) { - // Wait for MSG_TYPE_FED_NONCE from federate. 
- size_t fed_id_length = sizeof(uint16_t); - unsigned char buffer[1 + fed_id_length + NONCE_LENGTH]; - read_from_socket_fail_on_error(socket, 1 + fed_id_length + NONCE_LENGTH, buffer, NULL, - "Failed to read MSG_TYPE_FED_NONCE"); - if (buffer[0] != MSG_TYPE_FED_NONCE) { - lf_print_error_and_exit("Received unexpected response %u from the FED (see net_common.h).", buffer[0]); - } - unsigned int hmac_length = SHA256_HMAC_LENGTH; - size_t federation_id_length = strnlen(rti_remote->federation_id, 255); - // HMAC tag is created with MSG_TYPE, federate ID, received federate nonce. - unsigned char mac_buf[1 + fed_id_length + NONCE_LENGTH]; - mac_buf[0] = MSG_TYPE_RTI_RESPONSE; - memcpy(&mac_buf[1], &buffer[1], fed_id_length); - memcpy(&mac_buf[1 + fed_id_length], &buffer[1 + fed_id_length], NONCE_LENGTH); - unsigned char hmac_tag[hmac_length]; - unsigned char* ret = HMAC(EVP_sha256(), rti_remote->federation_id, federation_id_length, mac_buf, - 1 + fed_id_length + NONCE_LENGTH, hmac_tag, &hmac_length); - if (ret == NULL) { - lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_RTI_RESPONSE."); - } - // Make buffer for message type, RTI's nonce, and HMAC tag. 
- unsigned char sender[1 + NONCE_LENGTH + hmac_length]; - sender[0] = MSG_TYPE_RTI_RESPONSE; - unsigned char rti_nonce[NONCE_LENGTH]; - RAND_bytes(rti_nonce, NONCE_LENGTH); - memcpy(&sender[1], rti_nonce, NONCE_LENGTH); - memcpy(&sender[1 + NONCE_LENGTH], hmac_tag, hmac_length); - if (write_to_socket(*socket, 1 + NONCE_LENGTH + hmac_length, sender)) { - lf_print_error("Failed to send nonce to federate."); - } - - // Wait for MSG_TYPE_FED_RESPONSE - unsigned char received[1 + hmac_length]; - read_from_socket_fail_on_error(socket, 1 + hmac_length, received, NULL, "Failed to read federate response."); - if (received[0] != MSG_TYPE_FED_RESPONSE) { - lf_print_error_and_exit("Received unexpected response %u from the federate (see net_common.h).", received[0]); - return false; - } - // HMAC tag is created with MSG_TYPE_FED_RESPONSE and RTI's nonce. - unsigned char mac_buf2[1 + NONCE_LENGTH]; - mac_buf2[0] = MSG_TYPE_FED_RESPONSE; - memcpy(&mac_buf2[1], rti_nonce, NONCE_LENGTH); - unsigned char rti_tag[hmac_length]; - ret = HMAC(EVP_sha256(), rti_remote->federation_id, federation_id_length, mac_buf2, 1 + NONCE_LENGTH, rti_tag, - &hmac_length); - if (ret == NULL) { - lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_FED_RESPONSE."); - } - // Compare received tag and created tag. - if (memcmp(&received[1], rti_tag, hmac_length) != 0) { - // Federation IDs do not match. Send back a HMAC_DOES_NOT_MATCH message. - lf_print_warning("HMAC authentication failed. Rejecting the federate."); - send_reject(socket, HMAC_DOES_NOT_MATCH); - return false; - } else { - LF_PRINT_LOG("Federate's HMAC verified."); - return true; - } -} -#endif + /** + * Authenticate incoming federate by performing HMAC-based authentication. + * + * @param socket Socket for the incoming federate tryting to authenticate. + * @return True if authentication is successful and false otherwise. + */ + static bool authenticate_federate(int* socket) { + // Wait for MSG_TYPE_FED_NONCE from federate. 
+ size_t fed_id_length = sizeof(uint16_t); + unsigned char buffer[1 + fed_id_length + NONCE_LENGTH]; + read_from_socket_fail_on_error(socket, 1 + fed_id_length + NONCE_LENGTH, buffer, NULL, + "Failed to read MSG_TYPE_FED_NONCE"); + if (buffer[0] != MSG_TYPE_FED_NONCE) { + lf_print_error_and_exit("Received unexpected response %u from the FED (see net_common.h).", buffer[0]); + } + unsigned int hmac_length = SHA256_HMAC_LENGTH; + size_t federation_id_length = strnlen(rti_remote->federation_id, 255); + // HMAC tag is created with MSG_TYPE, federate ID, received federate nonce. + unsigned char mac_buf[1 + fed_id_length + NONCE_LENGTH]; + mac_buf[0] = MSG_TYPE_RTI_RESPONSE; + memcpy(&mac_buf[1], &buffer[1], fed_id_length); + memcpy(&mac_buf[1 + fed_id_length], &buffer[1 + fed_id_length], NONCE_LENGTH); + unsigned char hmac_tag[hmac_length]; + unsigned char* ret = HMAC(EVP_sha256(), rti_remote->federation_id, federation_id_length, mac_buf, + 1 + fed_id_length + NONCE_LENGTH, hmac_tag, &hmac_length); + if (ret == NULL) { + lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_RTI_RESPONSE."); + } + // Make buffer for message type, RTI's nonce, and HMAC tag. + unsigned char sender[1 + NONCE_LENGTH + hmac_length]; + sender[0] = MSG_TYPE_RTI_RESPONSE; + unsigned char rti_nonce[NONCE_LENGTH]; + RAND_bytes(rti_nonce, NONCE_LENGTH); + memcpy(&sender[1], rti_nonce, NONCE_LENGTH); + memcpy(&sender[1 + NONCE_LENGTH], hmac_tag, hmac_length); + if (write_to_socket(*socket, 1 + NONCE_LENGTH + hmac_length, sender)) { + lf_print_error("Failed to send nonce to federate."); + } -// FIXME: The socket descriptor here (parameter) is not used. Should be removed? -void lf_connect_to_persistent_federates(int socket_descriptor) { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes - rti_remote.number_of_transient_federates; i++) { - // Wait for an incoming connection request. 
- struct sockaddr client_fd; - uint32_t client_length = sizeof(client_fd); - // The following blocks until a federate connects. - int socket_id = -1; - while (1) { - socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); - if (socket_id >= 0) { - // Got a socket - break; - } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { - lf_print_error_system_failure("RTI failed to accept the socket."); + // Wait for MSG_TYPE_FED_RESPONSE + unsigned char received[1 + hmac_length]; + read_from_socket_fail_on_error(socket, 1 + hmac_length, received, NULL, "Failed to read federate response."); + if (received[0] != MSG_TYPE_FED_RESPONSE) { + lf_print_error_and_exit("Received unexpected response %u from the federate (see net_common.h).", received[0]); + return false; + } + // HMAC tag is created with MSG_TYPE_FED_RESPONSE and RTI's nonce. + unsigned char mac_buf2[1 + NONCE_LENGTH]; + mac_buf2[0] = MSG_TYPE_FED_RESPONSE; + memcpy(&mac_buf2[1], rti_nonce, NONCE_LENGTH); + unsigned char rti_tag[hmac_length]; + ret = HMAC(EVP_sha256(), rti_remote->federation_id, federation_id_length, mac_buf2, 1 + NONCE_LENGTH, rti_tag, + &hmac_length); + if (ret == NULL) { + lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_FED_RESPONSE."); + } + // Compare received tag and created tag. + if (memcmp(&received[1], rti_tag, hmac_length) != 0) { + // Federation IDs do not match. Send back a HMAC_DOES_NOT_MATCH message. + lf_print_warning("HMAC authentication failed. Rejecting the federate."); + send_reject(socket, HMAC_DOES_NOT_MATCH); + return false; } else { - // Try again - lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); - continue; + LF_PRINT_LOG("Federate's HMAC verified."); + return true; } } +#endif + + // FIXME: The socket descriptor here (parameter) is not used. Should be removed? 
+ void lf_connect_to_persistent_federates(int socket_descriptor) { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes - rti_remote.number_of_transient_federates; i++) { + // Wait for an incoming connection request. + struct sockaddr client_fd; + uint32_t client_length = sizeof(client_fd); + // The following blocks until a federate connects. + int socket_id = -1; + while (1) { + socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); + if (socket_id >= 0) { + // Got a socket + break; + } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { + lf_print_error_system_failure("RTI failed to accept the socket."); + } else { + // Try again + lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); + continue; + } + } // Wait for the first message from the federate when RTI -a option is on. #ifdef __RTI_AUTH__ - if (rti_remote->authentication_enabled) { - if (!authenticate_federate(&socket_id)) { - lf_print_warning("RTI failed to authenticate the incoming federate."); - // Close the socket. - shutdown(socket_id, SHUT_RDWR); - close(socket_id); - socket_id = -1; - // Ignore the federate that failed authentication. - i--; - continue; - } - } + if (rti_remote->authentication_enabled) { + if (!authenticate_federate(&socket_id)) { + lf_print_warning("RTI failed to authenticate the incoming federate."); + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); + socket_id = -1; + // Ignore the federate that failed authentication. + i--; + continue; + } + } #endif - // The first message from the federate should contain its ID and the federation ID. - int32_t fed_id = receive_and_check_fed_id_message(&socket_id); - if (fed_id >= 0 && socket_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && - receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { - - // Create a thread to communicate with the federate. 
- // This has to be done after clock synchronization is finished - // or that thread may end up attempting to handle incoming clock - // synchronization messages. - federate_info_t* fed = GET_FED_INFO(fed_id); - lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); - - // If the federate is transient, then do not count it. - if (fed->is_transient) { - rti_remote->number_of_connected_transient_federates++; - assert(rti_remote->number_of_connected_transient_federates <= rti_remote->number_of_transient_federates); - i--; - lf_print("RTI: Transient federate %d joined.", fed->enclave.id); + // The first message from the federate should contain its ID and the federation ID. + int32_t fed_id = receive_and_check_fed_id_message(&socket_id); + if (fed_id >= 0 && socket_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && + receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { + + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. + federate_info_t* fed = GET_FED_INFO(fed_id); + lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + + // If the federate is transient, then do not count it. + if (fed->is_transient) { + rti_remote->number_of_connected_transient_federates++; + assert(rti_remote->number_of_connected_transient_federates <= rti_remote->number_of_transient_federates); + i--; + lf_print("RTI: Transient federate %d joined.", fed->enclave.id); + } + } else { + // Received message was rejected. Try again. + i--; + } } - } else { - // Received message was rejected. Try again. - i--; - } - } - // All federates have connected. 
- LF_PRINT_DEBUG("All persistent federates have connected to RTI."); - - if (rti_remote->clock_sync_global_status >= clock_sync_on) { - // Create the thread that performs periodic PTP clock synchronization sessions - // over the UDP channel, but only if the UDP channel is open and at least one - // federate is performing runtime clock synchronization. - bool clock_sync_enabled = false; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed_info = GET_FED_INFO(i); - if (fed_info->clock_synchronization_enabled) { - clock_sync_enabled = true; - break; + // All federates have connected. + LF_PRINT_DEBUG("All persistent federates have connected to RTI."); + + if (rti_remote->clock_sync_global_status >= clock_sync_on) { + // Create the thread that performs periodic PTP clock synchronization sessions + // over the UDP channel, but only if the UDP channel is open and at least one + // federate is performing runtime clock synchronization. + bool clock_sync_enabled = false; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed_info = GET_FED_INFO(i); + if (fed_info->clock_synchronization_enabled) { + clock_sync_enabled = true; + break; + } + } + if (rti_remote->final_port_UDP != UINT16_MAX && clock_sync_enabled) { + lf_thread_create(&rti_remote->clock_thread, clock_synchronization_thread, NULL); + } } } - if (rti_remote->final_port_UDP != UINT16_MAX && clock_sync_enabled) { - lf_thread_create(&rti_remote->clock_thread, clock_synchronization_thread, NULL); - } - } -} - -/** - * @brief A request for immediate stop to the federate - * - * @param fed: the deferate to stop - */ -void send_stop(federate_info_t* fed) { - // Reply with a stop granted to all federates - unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; - outgoing_buffer[0] = MSG_TYPE_STOP; - lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); - if (rti_remote->base.tracing_enabled) { - 
tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP, fed->enclave.id, NULL); - } - write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, - "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); + /** + * @brief A request for immediate stop to the federate + * + * @param fed: the deferate to stop + */ + void send_stop(federate_info_t * fed) { + // Reply with a stop granted to all federates + unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; + outgoing_buffer[0] = MSG_TYPE_STOP; + lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); - LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); -} - -/** - * @brief A request for immediate stop to the federate - * - * @param fed: the deferate to stop - */ -void send_stop(federate_info_t* fed) { - // Reply with a stop granted to all federates - unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; - outgoing_buffer[0] = MSG_TYPE_STOP; - lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP, fed->enclave.id, NULL); + } + write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, + "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP, fed->enclave.id, NULL); - } - write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, - "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); + LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); + } - LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); -} + /** + * @brief A request for immediate stop to the federate + * + * @param fed: the deferate to stop + */ + void send_stop(federate_info_t * fed) { + // Reply 
with a stop granted to all federates + unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; + outgoing_buffer[0] = MSG_TYPE_STOP; + lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); -void* lf_connect_to_transient_federates_thread(void* nothing) { - // This loop will continue to accept connections of transient federates, as - // soon as there is room, or enable hot swap - - while (!rti_remote->all_persistent_federates_exited) { - // Continue waiting for an incoming connection requests from transients - // to join, or for hot swap. - // Wait for an incoming connection request. - struct sockaddr client_fd; - uint32_t client_length = sizeof(client_fd); - // The following blocks until a federate connects. - int socket_id = -1; - while (1) { - if (!rti_remote->all_persistent_federates_exited) { - return NULL; - } - socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); - if (socket_id >= 0) { - // Got a socket - break; - } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { - lf_print_error_system_failure("RTI failed to accept the socket."); - } else { - // Try again - lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); - continue; + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP, fed->enclave.id, NULL); } + write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, + "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); + + LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); } + void* lf_connect_to_transient_federates_thread(void* nothing) { + // This loop will continue to accept connections of transient federates, as + // soon as there is room, or enable hot swap + + while (!rti_remote->all_persistent_federates_exited) { + // Continue waiting for an incoming connection requests from transients + // to join, or for hot swap. 
+ // Wait for an incoming connection request. + struct sockaddr client_fd; + uint32_t client_length = sizeof(client_fd); + // The following blocks until a federate connects. + int socket_id = -1; + while (1) { + if (!rti_remote->all_persistent_federates_exited) { + return NULL; + } + socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); + if (socket_id >= 0) { + // Got a socket + break; + } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { + lf_print_error_system_failure("RTI failed to accept the socket."); + } else { + // Try again + lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); + continue; + } + } + // Wait for the first message from the federate when RTI -a option is on. #ifdef __RTI_AUTH__ - if (rti_remote->authentication_enabled) { - if (!authenticate_federate(&socket_id)) { - lf_print_warning("RTI failed to authenticate the incoming federate."); - // Close the socket. - shutdown(socket_id, SHUT_RDWR); - close(socket_id); - socket_id = -1; - // Ignore the federate that failed authentication. - i--; - continue; - } - } + if (rti_remote->authentication_enabled) { + if (!authenticate_federate(&socket_id)) { + lf_print_warning("RTI failed to authenticate the incoming federate."); + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); + socket_id = -1; + // Ignore the federate that failed authentication. + i--; + continue; + } + } #endif - // The first message from the federate should contain its ID and the federation ID. - // The function also detects if a hot swap request is initiated. 
- int32_t fed_id = receive_and_check_fed_id_message(&socket_id, (struct sockaddr_in*)&client_fd); - - if (fed_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && - receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { - LF_MUTEX_LOCK(&rti_mutex); - if (hot_swap_in_progress) { - lf_print("RTI: Hot swap confirmed for federate %d.", fed_id); - - // Then send STOP - federate_info_t* fed_old = GET_FED_INFO(fed_id); - hot_swap_federate->enclave.completed = fed_old->enclave.completed; - - LF_PRINT_LOG("RTI: Send MSG_TYPE_STOP to old federate %d.", fed_id); - send_stop(fed_old); - LF_MUTEX_UNLOCK(&rti_mutex); - - // Wait for the old federate to send MSG_TYPE_RESIGN - LF_PRINT_LOG("RTI: Waiting for old federate %d to send resign.", fed_id); - // FIXME: Should this have a timeout? - while (!hot_swap_old_resigned) - ; - - // The latest LTC is the tag at which the old federate resigned. This is useful - // for computing the effective_start_time of the new joining federate. - hot_swap_federate->enclave.completed = fed_old->enclave.completed; - - // Create a thread to communicate with the federate. - // This has to be done after clock synchronization is finished - // or that thread may end up attempting to handle incoming clock - // synchronization messages. - lf_thread_create(&(hot_swap_federate->thread_id), federate_info_thread_TCP, hot_swap_federate); - - // Redirect the federate in rti_remote - rti_remote->base.scheduling_nodes[fed_id] = (scheduling_node_t*)hot_swap_federate; - - // Free the old federate memory and reset the Hot wap indicators - // FIXME: Is this enough to free the memory allocated to the federate? - free(fed_old); - lf_mutex_lock(&rti_mutex); - hot_swap_in_progress = false; - lf_mutex_unlock(&rti_mutex); - - lf_print("RTI: Hot swap succeeded for federate %d.", fed_id); - } else { - lf_mutex_unlock(&rti_mutex); - - // Create a thread to communicate with the federate. 
- // This has to be done after clock synchronization is finished - // or that thread may end up attempting to handle incoming clock - // synchronization messages. - federate_info_t* fed = GET_FED_INFO(fed_id); - lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); - lf_print("RTI: Transient federate %d joined.", fed_id); - } - rti_remote->number_of_connected_transient_federates++; - } else { - // If a hot swap was initialed, but the connection information or/and clock - // synchronization fail, then reset hot_swap_in_profress, and free the memory - // allocated for hot_swap_federate - if (hot_swap_in_progress) { - lf_print("RTI: Hot swap canceled for federate %d.", fed_id); - lf_mutex_lock(&rti_mutex); - hot_swap_in_progress = false; - lf_mutex_unlock(&rti_mutex); - - // FIXME: Is this enough to free the memory of a federate_info_t data structure? - free(hot_swap_federate); + // The first message from the federate should contain its ID and the federation ID. + // The function also detects if a hot swap request is initiated. + int32_t fed_id = receive_and_check_fed_id_message(&socket_id, (struct sockaddr_in*)&client_fd); + + if (fed_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && + receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { + LF_MUTEX_LOCK(&rti_mutex); + if (hot_swap_in_progress) { + lf_print("RTI: Hot swap confirmed for federate %d.", fed_id); + + // Then send STOP + federate_info_t* fed_old = GET_FED_INFO(fed_id); + hot_swap_federate->enclave.completed = fed_old->enclave.completed; + + LF_PRINT_LOG("RTI: Send MSG_TYPE_STOP to old federate %d.", fed_id); + send_stop(fed_old); + LF_MUTEX_UNLOCK(&rti_mutex); + + // Wait for the old federate to send MSG_TYPE_RESIGN + LF_PRINT_LOG("RTI: Waiting for old federate %d to send resign.", fed_id); + // FIXME: Should this have a timeout? + while (!hot_swap_old_resigned) + ; + + // The latest LTC is the tag at which the old federate resigned. 
This is useful + // for computing the effective_start_time of the new joining federate. + hot_swap_federate->enclave.completed = fed_old->enclave.completed; + + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. + lf_thread_create(&(hot_swap_federate->thread_id), federate_info_thread_TCP, hot_swap_federate); + + // Redirect the federate in rti_remote + rti_remote->base.scheduling_nodes[fed_id] = (scheduling_node_t*)hot_swap_federate; + + // Free the old federate memory and reset the Hot wap indicators + // FIXME: Is this enough to free the memory allocated to the federate? + free(fed_old); + lf_mutex_lock(&rti_mutex); + hot_swap_in_progress = false; + lf_mutex_unlock(&rti_mutex); + + lf_print("RTI: Hot swap succeeded for federate %d.", fed_id); + } else { + lf_mutex_unlock(&rti_mutex); + + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. + federate_info_t* fed = GET_FED_INFO(fed_id); + lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + lf_print("RTI: Transient federate %d joined.", fed_id); + } + rti_remote->number_of_connected_transient_federates++; + } else { + // If a hot swap was initialed, but the connection information or/and clock + // synchronization fail, then reset hot_swap_in_profress, and free the memory + // allocated for hot_swap_federate + if (hot_swap_in_progress) { + lf_print("RTI: Hot swap canceled for federate %d.", fed_id); + lf_mutex_lock(&rti_mutex); + hot_swap_in_progress = false; + lf_mutex_unlock(&rti_mutex); + + // FIXME: Is this enough to free the memory of a federate_info_t data structure? 
+ free(hot_swap_federate); + } + } } } - } -} -void* respond_to_erroneous_connections(void* nothing) { - initialize_lf_thread_id(); - while (true) { - // Wait for an incoming connection request. - // The following will block until either a federate attempts to connect - // or close(rti->socket_descriptor_TCP) is called. - int socket_id = accept_socket(rti_remote->socket_descriptor_TCP, -1); - if (socket_id < 0) { - return NULL; - } - if (rti_remote->all_federates_exited) { + void* respond_to_erroneous_connections(void* nothing) { + initialize_lf_thread_id(); + while (true) { + // Wait for an incoming connection request. + // The following will block until either a federate attempts to connect + // or close(rti->socket_descriptor_TCP) is called. + int socket_id = accept_socket(rti_remote->socket_descriptor_TCP, -1); + if (socket_id < 0) { + return NULL; + } + if (rti_remote->all_federates_exited) { + return NULL; + } + + lf_print_error("RTI received an unexpected connection request. Federation is running."); + unsigned char response[2]; + response[0] = MSG_TYPE_REJECT; + response[1] = FEDERATION_ID_DOES_NOT_MATCH; + // Ignore errors on this response. + if (write_to_socket(socket_id, 2, response)) { + lf_print_warning("RTI failed to write FEDERATION_ID_DOES_NOT_MATCH to erroneous incoming connection."); + } + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); + } return NULL; } - lf_print_error("RTI received an unexpected connection request. Federation is running."); - unsigned char response[2]; - response[0] = MSG_TYPE_REJECT; - response[1] = FEDERATION_ID_DOES_NOT_MATCH; - // Ignore errors on this response. - if (write_to_socket(socket_id, 2, response)) { - lf_print_warning("RTI failed to write FEDERATION_ID_DOES_NOT_MATCH to erroneous incoming connection."); + void initialize_federate(federate_info_t * fed, uint16_t id) { + initialize_scheduling_node(&(fed->enclave), id); + fed->requested_stop = false; + fed->socket = -1; // No socket. 
+ fed->clock_synchronization_enabled = true; + fed->in_transit_message_tags = pqueue_tag_init(10); + strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); + fed->server_ip_addr.s_addr = 0; + fed->server_port = -1; + fed->is_transient = true; + fed->effective_start_tag = NEVER_TAG; + fed->pending_grant = NEVER_TAG; + fed->pending_provisional_grant = NEVER_TAG; } - // Close the socket. - shutdown(socket_id, SHUT_RDWR); - close(socket_id); - } - return NULL; -} - -void initialize_federate(federate_info_t* fed, uint16_t id) { - initialize_scheduling_node(&(fed->enclave), id); - fed->requested_stop = false; - fed->socket = -1; // No socket. - fed->clock_synchronization_enabled = true; - fed->in_transit_message_tags = pqueue_tag_init(10); - strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); - fed->server_ip_addr.s_addr = 0; - fed->server_port = -1; - fed->is_transient = true; - fed->effective_start_tag = NEVER_TAG; - fed->pending_grant = NEVER_TAG; - fed->pending_provisional_grant = NEVER_TAG; -} -void reset_transient_federate(federate_info_t* fed) { - fed->enclave.next_event = NEVER_TAG; - fed->enclave.state = NOT_CONNECTED; - // Reset of the federate-related attributes - fed->socket = -1; // No socket. - fed->clock_synchronization_enabled = true; - fed->in_transit_message_tags = pqueue_tag_init(10); - strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); - fed->server_ip_addr.s_addr = 0; - fed->server_port = -1; - fed->requested_stop = false; - fed->is_transient = true; - fed->effective_start_tag = NEVER_TAG; - fed->pending_grant = NEVER_TAG; - fed->pending_provisional_grant = NEVER_TAG; - // FIXME: There is room though to check if the interface has changed??? Do we allow this? 
-} - -int32_t start_rti_server(uint16_t port) { - _lf_initialize_clock(); - // Create the TCP socket server - if (create_server(port, &rti_remote->socket_descriptor_TCP, &rti_remote->final_port_TCP, TCP, true)) { - lf_print_error_system_failure("RTI failed to create TCP server: %s.", strerror(errno)); - }; - lf_print("RTI: Listening for federates."); - // Create the UDP socket server - // Try to get the rti_remote->final_port_TCP + 1 port - if (rti_remote->clock_sync_global_status >= clock_sync_on) { - if (create_server(rti_remote->final_port_TCP + 1, &rti_remote->socket_descriptor_UDP, &rti_remote->final_port_UDP, - UDP, true)) { - lf_print_error_system_failure("RTI failed to create UDP server: %s.", strerror(errno)); + void reset_transient_federate(federate_info_t * fed) { + fed->enclave.next_event = NEVER_TAG; + fed->enclave.state = NOT_CONNECTED; + // Reset of the federate-related attributes + fed->socket = -1; // No socket. + fed->clock_synchronization_enabled = true; + fed->in_transit_message_tags = pqueue_tag_init(10); + strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); + fed->server_ip_addr.s_addr = 0; + fed->server_port = -1; + fed->requested_stop = false; + fed->is_transient = true; + fed->effective_start_tag = NEVER_TAG; + fed->pending_grant = NEVER_TAG; + fed->pending_provisional_grant = NEVER_TAG; + // FIXME: There is room though to check if the interface has changed??? Do we allow this? } - } - return rti_remote->socket_descriptor_TCP; -} -/** - * Iterate over the federates and sets 'has_upstream_transient_federates'. - * Once done, check that no transient federate has an upstream transient federate, - * and compute the number of persistent federates that do have upstream transients, - * which is the maximun number of delayed grants that can be pending at the same time. - * This is useful for initialyzing the queue of delayed grants. 
- - * @return -1, if there is more than one level of transiency, else, the number of - * persistents that have an upstream transient - */ -static int set_has_upstream_transient_federates_parameter_and_check() { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - for (int j = 0; j < fed->enclave.num_upstream; j++) { - federate_info_t* upstream_fed = GET_FED_INFO(fed->enclave.upstream[j]); - if (upstream_fed->is_transient) { - fed->has_upstream_transient_federates = true; - break; + int32_t start_rti_server(uint16_t port) { + _lf_initialize_clock(); + // Create the TCP socket server + if (create_server(port, &rti_remote->socket_descriptor_TCP, &rti_remote->final_port_TCP, TCP, true)) { + lf_print_error_system_failure("RTI failed to create TCP server: %s.", strerror(errno)); + }; + lf_print("RTI: Listening for federates."); + // Create the UDP socket server + // Try to get the rti_remote->final_port_TCP + 1 port + if (rti_remote->clock_sync_global_status >= clock_sync_on) { + if (create_server(rti_remote->final_port_TCP + 1, &rti_remote->socket_descriptor_UDP, + &rti_remote->final_port_UDP, UDP, true)) { + lf_print_error_system_failure("RTI failed to create UDP server: %s.", strerror(errno)); + } } + return rti_remote->socket_descriptor_TCP; } - } - // Now check that no transient has an upstream transient - // FIXME: Do we really need this? Or should it be the job of the validator? - int max_number_of_delayed_grants = 0; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (fed->is_transient && fed->has_upstream_transient_federates) { - return -1; - } - if (!fed->is_transient && fed->has_upstream_transient_federates) { - max_number_of_delayed_grants++; - } - } + /** + * Iterate over the federates and sets 'has_upstream_transient_federates'. 
+ * Once done, check that no transient federate has an upstream transient federate, + * and compute the number of persistent federates that do have upstream transients, + * which is the maximun number of delayed grants that can be pending at the same time. + * This is useful for initialyzing the queue of delayed grants. + + * @return -1, if there is more than one level of transiency, else, the number of + * persistents that have an upstream transient + */ + static int set_has_upstream_transient_federates_parameter_and_check() { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + for (int j = 0; j < fed->enclave.num_upstream; j++) { + federate_info_t* upstream_fed = GET_FED_INFO(fed->enclave.upstream[j]); + if (upstream_fed->is_transient) { + fed->has_upstream_transient_federates = true; + break; + } + } + } - return max_number_of_delayed_grants; -} + // Now check that no transient has an upstream transient + // FIXME: Do we really need this? Or should it be the job of the validator? + int max_number_of_delayed_grants = 0; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->is_transient && fed->has_upstream_transient_federates) { + return -1; + } + if (!fed->is_transient && fed->has_upstream_transient_federates) { + max_number_of_delayed_grants++; + } + } -void wait_for_federates(int socket_descriptor) { - // Wait for connections from persistent federates and create a thread for each. - lf_connect_to_persistent_federates(socket_descriptor); - // Wait for connections from persistent federates and create a thread for each. 
- lf_connect_to_persistent_federates(socket_descriptor); - - // Set has_upstream_transient_federates parameter in all federates and check - // that there is no more than one level of transiency - if (rti_remote->number_of_transient_federates > 0) { - int max_number_of_pending_grants = set_has_upstream_transient_federates_parameter_and_check(); - if (max_number_of_pending_grants == -1) { - lf_print_error_and_exit("RTI: Transient federates cannot have transient upstreams!"); + return max_number_of_delayed_grants; } - rti_remote->delayed_grants = pqueue_delayed_grants_init(max_number_of_pending_grants); - } - // All persistent federates have connected. - lf_print("RTI: All expected persistent federates have connected. Starting execution."); - if (rti_remote->number_of_transient_federates > 0) { - lf_print("RTI: Transient Federates can join and leave the federation at anytime."); - } + void wait_for_federates(int socket_descriptor) { + // Wait for connections from persistent federates and create a thread for each. + lf_connect_to_persistent_federates(socket_descriptor); + // Wait for connections from persistent federates and create a thread for each. + lf_connect_to_persistent_federates(socket_descriptor); + + // Set has_upstream_transient_federates parameter in all federates and check + // that there is no more than one level of transiency + if (rti_remote->number_of_transient_federates > 0) { + int max_number_of_pending_grants = set_has_upstream_transient_federates_parameter_and_check(); + if (max_number_of_pending_grants == -1) { + lf_print_error_and_exit("RTI: Transient federates cannot have transient upstreams!"); + } + rti_remote->delayed_grants = pqueue_delayed_grants_init(max_number_of_pending_grants); + } - // The socket server will only continue to accept connections from transient - // federates. - // In case some other federation's federates are trying to join the wrong - // federation, need to respond. Start a separate thread to do that. 
- lf_thread_t responder_thread; - lf_thread_t transient_thread; - lf_thread_t delayed_grants_thread; - - // If the federation does not include transient federates, then respond to - // erronous connections. Otherwise, continue to accept transients joining and - // respond to duplicate joing requests. - if (rti_remote->number_of_transient_federates == 0) { - lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); - } else if (rti_remote->number_of_transient_federates > 0) { - lf_thread_create(&transient_thread, lf_connect_to_transient_federates_thread, NULL); - lf_thread_create(&delayed_grants_thread, lf_delayed_grants_thread, NULL); - } + // All persistent federates have connected. + lf_print("RTI: All expected persistent federates have connected. Starting execution."); + if (rti_remote->number_of_transient_federates > 0) { + lf_print("RTI: Transient Federates can join and leave the federation at anytime."); + } - // Wait for persistent federate threads to exit. - void* thread_exit_status; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (!fed->is_transient) { - lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); - lf_thread_join(fed->thread_id, &thread_exit_status); - pqueue_tag_free(fed->in_transit_message_tags); - lf_print("RTI: Persistent federate %d thread exited.", fed->enclave.id); - } - } + // The socket server will only continue to accept connections from transient + // federates. + // In case some other federation's federates are trying to join the wrong + // federation, need to respond. Start a separate thread to do that. + lf_thread_t responder_thread; + lf_thread_t transient_thread; + lf_thread_t delayed_grants_thread; + + // If the federation does not include transient federates, then respond to + // erronous connections. Otherwise, continue to accept transients joining and + // respond to duplicate joing requests. 
+ if (rti_remote->number_of_transient_federates == 0) { + lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); + } else if (rti_remote->number_of_transient_federates > 0) { + lf_thread_create(&transient_thread, lf_connect_to_transient_federates_thread, NULL); + lf_thread_create(&delayed_grants_thread, lf_delayed_grants_thread, NULL); + } - rti_remote->all_persistent_federates_exited = true; - rti_remote->phase = shutdown_phase; - lf_print("RTI: All persistent threads exited."); + // Wait for persistent federate threads to exit. + void* thread_exit_status; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (!fed->is_transient) { + lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); + lf_thread_join(fed->thread_id, &thread_exit_status); + pqueue_tag_free(fed->in_transit_message_tags); + lf_print("RTI: Persistent federate %d thread exited.", fed->enclave.id); + } + } - // Wait for transient federate threads to exit, if any. - if (rti_remote->number_of_transient_federates > 0) { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (fed->is_transient) { - lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); - lf_thread_join(fed->thread_id, &thread_exit_status); - pqueue_tag_free(fed->in_transit_message_tags); - lf_print("RTI: Transient federate %d thread exited.", fed->enclave.id); + rti_remote->all_persistent_federates_exited = true; + rti_remote->phase = shutdown_phase; + lf_print("RTI: All persistent threads exited."); + + // Wait for transient federate threads to exit, if any. 
+ if (rti_remote->number_of_transient_federates > 0) { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->is_transient) { + lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); + lf_thread_join(fed->thread_id, &thread_exit_status); + pqueue_tag_free(fed->in_transit_message_tags); + lf_print("RTI: Transient federate %d thread exited.", fed->enclave.id); + } + } } - } - } - rti_remote->all_federates_exited = true; + rti_remote->all_federates_exited = true; - // Shutdown and close the socket that is listening for incoming connections - // so that the accept() call in respond_to_erroneous_connections returns. - // That thread should then check rti->all_federates_exited and it should exit. - if (shutdown(socket_descriptor, SHUT_RDWR)) { - LF_PRINT_LOG("On shut down TCP socket, received reply: %s", strerror(errno)); - } - // NOTE: In all common TCP/IP stacks, there is a time period, - // typically between 30 and 120 seconds, called the TIME_WAIT period, - // before the port is released after this close. This is because - // the OS is preventing another program from accidentally receiving - // duplicated packets intended for this program. - close(socket_descriptor); - - if (rti_remote->socket_descriptor_UDP > 0) { - if (shutdown(rti_remote->socket_descriptor_UDP, SHUT_RDWR)) { - LF_PRINT_LOG("On shut down UDP socket, received reply: %s", strerror(errno)); + // Shutdown and close the socket that is listening for incoming connections + // so that the accept() call in respond_to_erroneous_connections returns. + // That thread should then check rti->all_federates_exited and it should exit. 
+ if (shutdown(socket_descriptor, SHUT_RDWR)) { + LF_PRINT_LOG("On shut down TCP socket, received reply: %s", strerror(errno)); + } + // NOTE: In all common TCP/IP stacks, there is a time period, + // typically between 30 and 120 seconds, called the TIME_WAIT period, + // before the port is released after this close. This is because + // the OS is preventing another program from accidentally receiving + // duplicated packets intended for this program. + close(socket_descriptor); + + if (rti_remote->socket_descriptor_UDP > 0) { + if (shutdown(rti_remote->socket_descriptor_UDP, SHUT_RDWR)) { + LF_PRINT_LOG("On shut down UDP socket, received reply: %s", strerror(errno)); + } + close(rti_remote->socket_descriptor_UDP); + } } - close(rti_remote->socket_descriptor_UDP); - } -} -void initialize_RTI(rti_remote_t* rti) { - rti_remote = rti; - - // Initialize thread synchronization primitives - LF_MUTEX_INIT(&rti_mutex); - LF_COND_INIT(&received_start_times, &rti_mutex); - LF_COND_INIT(&sent_start_time, &rti_mutex); - LF_COND_INIT(&updated_delayed_grants, &rti_mutex); - - initialize_rti_common(&rti_remote->base); - rti_remote->base.mutex = &rti_mutex; - - // federation_rti related initializations - rti_remote->max_start_time = 0LL; - rti_remote->num_feds_proposed_start = 0; - rti_remote->all_federates_exited = false; - rti_remote->federation_id = "Unidentified Federation"; - rti_remote->user_specified_port = 0; - rti_remote->final_port_TCP = 0; - rti_remote->socket_descriptor_TCP = -1; - rti_remote->final_port_UDP = UINT16_MAX; - rti_remote->socket_descriptor_UDP = -1; - rti_remote->clock_sync_global_status = clock_sync_init; - rti_remote->clock_sync_period_ns = MSEC(10); - rti_remote->clock_sync_exchanges_per_interval = 10; - rti_remote->authentication_enabled = false; - rti_remote->base.tracing_enabled = false; - rti_remote->stop_in_progress = false; - rti_remote->number_of_transient_federates = 0; - rti_remote->phase = startup_phase; -} + void initialize_RTI(rti_remote_t 
* rti) { + rti_remote = rti; + + // Initialize thread synchronization primitives + LF_MUTEX_INIT(&rti_mutex); + LF_COND_INIT(&received_start_times, &rti_mutex); + LF_COND_INIT(&sent_start_time, &rti_mutex); + LF_COND_INIT(&updated_delayed_grants, &rti_mutex); + + initialize_rti_common(&rti_remote->base); + rti_remote->base.mutex = &rti_mutex; + + // federation_rti related initializations + rti_remote->max_start_time = 0LL; + rti_remote->num_feds_proposed_start = 0; + rti_remote->all_federates_exited = false; + rti_remote->federation_id = "Unidentified Federation"; + rti_remote->user_specified_port = 0; + rti_remote->final_port_TCP = 0; + rti_remote->socket_descriptor_TCP = -1; + rti_remote->final_port_UDP = UINT16_MAX; + rti_remote->socket_descriptor_UDP = -1; + rti_remote->clock_sync_global_status = clock_sync_init; + rti_remote->clock_sync_period_ns = MSEC(10); + rti_remote->clock_sync_exchanges_per_interval = 10; + rti_remote->authentication_enabled = false; + rti_remote->base.tracing_enabled = false; + rti_remote->stop_in_progress = false; + rti_remote->number_of_transient_federates = 0; + rti_remote->phase = startup_phase; + } -void free_scheduling_nodes(scheduling_node_t** scheduling_nodes, uint16_t number_of_scheduling_nodes) { - for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) { - // FIXME: Gives error freeing memory not allocated!!!! - scheduling_node_t* node = scheduling_nodes[i]; - if (node->upstream != NULL) - free(node->upstream); - if (node->downstream != NULL) - free(node->downstream); - } - free(scheduling_nodes); -} + void free_scheduling_nodes(scheduling_node_t * *scheduling_nodes, uint16_t number_of_scheduling_nodes) { + for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) { + // FIXME: Gives error freeing memory not allocated!!!! 
+ scheduling_node_t* node = scheduling_nodes[i]; + if (node->upstream != NULL) + free(node->upstream); + if (node->downstream != NULL) + free(node->downstream); + } + free(scheduling_nodes); + } #endif // STANDALONE_RTI From 7a9c8c166b6e85eb1ee2d771925b7dc5e1ee625e Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 14 Mar 2024 23:01:15 +0100 Subject: [PATCH 090/148] Add comments and more formatting --- core/federated/RTI/main.c | 5 +---- core/federated/federate.c | 24 +----------------------- 2 files changed, 2 insertions(+), 27 deletions(-) diff --git a/core/federated/RTI/main.c b/core/federated/RTI/main.c index 005b784ae..889bedcce 100644 --- a/core/federated/RTI/main.c +++ b/core/federated/RTI/main.c @@ -339,10 +339,7 @@ int main(int argc, const char* argv[]) { } lf_print("Starting RTI for a total of %d federates, with %d being transient, in federation ID %s", - rti.base.number_of_scheduling_nodes, rti.number_of_transient_federates, - - rti.federation_id); - + rti.base.number_of_scheduling_nodes, rti.number_of_transient_federates, rti.federation_id); assert(rti.base.number_of_scheduling_nodes < UINT16_MAX); // Allocate memory for the federates diff --git a/core/federated/federate.c b/core/federated/federate.c index 3435a5d6d..aced99ca1 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1371,29 +1371,7 @@ static void handle_stop_granted_message() { } /** - * @brief Stop the execution of a federate. - * Every enclave within the federate will stop at one microstep later than its - * current tag. Unlike lf_request_stop(), this process does not require any - * involvement from the RTI, nor does it necessitate any consensus. - * - * This function is particularly useful for testing transient federates. 
- */ -void lf_stop() { - environment_t* env; - int num_env = _lf_get_environments(&env); - - for (int i = 0; i < num_env; i++) { - tag_t new_stop_tag; - new_stop_tag.time = env[i].current_tag.time; - new_stop_tag.microstep = env[i].current_tag.microstep + 1; - _lf_set_stop_tag(&env[i], new_stop_tag); - } - - LF_PRINT_LOG("Federate is stopping."); -} - -/** - * Handle a MSG_TYPE_STOP message from the RTI. + * @brief Handle a MSG_TYPE_STOP message from the RTI. * * This function simply calls lf_stop(). */ From c51bbe07b51e60820a3523c8d3d0ee7242bc7c47 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 14 Mar 2024 23:02:49 +0100 Subject: [PATCH 091/148] Fix bugs due to rebase + Tracepoint lf_stop() --- core/federated/RTI/rti_remote.c | 3740 +++++++++++++++--------------- core/threaded/reactor_threaded.c | 4 +- 2 files changed, 1844 insertions(+), 1900 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index fbd79e0ca..ea26c18f6 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -327,2215 +327,2161 @@ void* pending_grant_thread(void* federate) { lf_mutex_unlock(&rti_mutex); } -/** - * Notify a tag advance grant (TAG) message to the specified federate after - * the physical time reaches the tag. A thread is created to this end. - * - * If a provisionl tag advance grant is pending, cancel it. If there is another - * pending tag advance grant, do not proceed with the thread creation. - * - * @param fed The federate. - * @param tag The tag to grant. 
- */ -void notify_tag_advance_grant_delayed(scheduling_node_t* e, tag_t tag) { - federate_info_t* fed = GET_FED_INFO(e->id); - static void notify_tag_advance_grant_delayed(scheduling_node_t * e, tag_t tag) { - federate_info_t* fed = (federate_info_t*)GET_FED_INFO(e->id); - - // Check wether there is already a pending grant - // And check the pending provisional grant as well - lf_mutex_lock(&rti_mutex); - if (lf_tag_compare(fed->pending_grant, NEVER_TAG) == 0) { - // If a tag is issued, then stop any possible provisional tag grant - fed->pending_grant = tag; - fed->pending_provisional_grant = NEVER_TAG; - lf_thread_create(&(fed->pending_grant_thread_id), pending_grant_thread, fed); - } else { - // If there is already a pending tag grant, then let it be sent first - // FIXME: Is this correct? - } - lf_mutex_unlock(&rti_mutex); +void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { + if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || + lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { + return; + } + // Need to make sure that the destination federate's thread has already + // sent the starting MSG_TYPE_TIMESTAMP message. + while (e->state == PENDING) { + // Need to wait here. + lf_cond_wait(&sent_start_time); } - void notify_tag_advance_grant(scheduling_node_t * e, tag_t tag) { - if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || - lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { - return; - } - // Need to make sure that the destination federate's thread has already - // sent the starting MSG_TYPE_TIMESTAMP message. - while (e->state == PENDING) { - // Need to wait here. - lf_cond_wait(&sent_start_time); - } - - // Check if sending the tag advance grant needs to be delayed or not. - // Delay is needed when a federate has at least one absent upstream transient. + // Check if sending the tag advance grant needs to be delayed or not. 
+ // Delay is needed when a federate has at least one absent upstream transient. - // Check if sending the tag advance grant needs to be delayed or not - // Delay is needed when a federate has, at least one, absent upstream transient - federate_info_t* fed = GET_FED_INFO(e->id); - if (!fed->has_upstream_transient_federates) { - notify_tag_advance_grant_immediate(e, tag); + // Check if sending the tag advance grant needs to be delayed or not + // Delay is needed when a federate has, at least one, absent upstream transient + federate_info_t* fed = GET_FED_INFO(e->id); + if (!fed->has_upstream_transient_federates) { + notify_tag_advance_grant_immediate(e, tag); + } else { + if (get_num_absent_upstream_transients(fed) > 0) { + notify_tag_advance_grant_delayed(fed, tag); } else { - if (get_num_absent_upstream_transients(fed) > 0) { - notify_tag_advance_grant_delayed(fed, tag); - } else { - notify_tag_advance_grant_immediate(e, tag); - } + notify_tag_advance_grant_immediate(e, tag); } } +} - /** - * Notify a provisional tag advance grant (PTAG) message to the specified federate - * immediately. - * - * This function will keep a record of this TAG in the enclave's last_provisionally_granted - * field. - * - * @param e The scheduling node. - * @param tag The tag to grant. - */ - void notify_provisional_tag_advance_grant_immediate(scheduling_node_t * e, tag_t tag) { - size_t message_length = 1 + sizeof(int64_t) + sizeof(uint32_t); - unsigned char buffer[message_length]; - buffer[0] = MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT; - encode_int64(tag.time, &(buffer[1])); - encode_int32((int32_t)tag.microstep, &(buffer[1 + sizeof(int64_t)])); - - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_PTAG, e->id, &tag); - } - // This function is called in notify_advance_grant_if_safe(), which is a long - // function. During this call, the socket might close, causing the following write_to_socket - // to fail. 
Consider a failure here a soft failure and update the federate's status. - if (write_to_socket(((federate_info_t*)e)->socket, message_length, buffer)) { - lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); - e->state = NOT_CONNECTED; - } else { - e->last_provisionally_granted = tag; - LF_PRINT_LOG("RTI sent to federate %d the Provisional Tag Advance Grant (PTAG) " PRINTF_TAG ".", e->id, - tag.time - start_time, tag.microstep); - - // Send PTAG to all upstream federates, if they have not had - // a later or equal PTAG or TAG sent previously and if their transitive - // NET is greater than or equal to the tag. - // This is needed to stimulate absent messages from upstream and break deadlocks. - // The scenario this deals with is illustrated in `test/C/src/federated/FeedbackDelay2.lf` - // and `test/C/src/federated/FeedbackDelay4.lf`. - // Note that this is transitive. - // NOTE: This is not needed for enclaves because zero-delay loops are prohibited. - // It's only needed for federates, which is why this is implemented here. - for (int j = 0; j < e->num_upstream; j++) { - scheduling_node_t* upstream = rti_remote->base.scheduling_nodes[e->upstream[j]]; - - // Ignore this federate if it has resigned. - if (upstream->state == NOT_CONNECTED) - continue; - - tag_t earliest = earliest_future_incoming_message_tag(upstream); - tag_t strict_earliest = eimt_strict(upstream); // Non-ZDC version. +/** + * Notify a provisional tag advance grant (PTAG) message to the specified federate + * immediately. + * + * This function will keep a record of this TAG in the enclave's last_provisionally_granted + * field. + * + * @param e The scheduling node. + * @param tag The tag to grant. 
+ */ +void notify_provisional_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) { + size_t message_length = 1 + sizeof(int64_t) + sizeof(uint32_t); + unsigned char buffer[message_length]; + buffer[0] = MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT; + encode_int64(tag.time, &(buffer[1])); + encode_int32((int32_t)tag.microstep, &(buffer[1 + sizeof(int64_t)])); - // If these tags are equal, then a TAG or PTAG should have already been granted, - // in which case, another will not be sent. But it may not have been already granted. - if (lf_tag_compare(earliest, tag) > 0) { - notify_tag_advance_grant(upstream, tag); - } else if (lf_tag_compare(earliest, tag) == 0 && lf_tag_compare(strict_earliest, tag) > 0) { - notify_provisional_tag_advance_grant(upstream, tag); - } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_PTAG, e->id, &tag); + } + // This function is called in notify_advance_grant_if_safe(), which is a long + // function. During this call, the socket might close, causing the following write_to_socket + // to fail. Consider a failure here a soft failure and update the federate's status. + if (write_to_socket(((federate_info_t*)e)->socket, message_length, buffer)) { + lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); + e->state = NOT_CONNECTED; + } else { + e->last_provisionally_granted = tag; + LF_PRINT_LOG("RTI sent to federate %d the Provisional Tag Advance Grant (PTAG) " PRINTF_TAG ".", e->id, + tag.time - start_time, tag.microstep); + + // Send PTAG to all upstream federates, if they have not had + // a later or equal PTAG or TAG sent previously and if their transitive + // NET is greater than or equal to the tag. + // This is needed to stimulate absent messages from upstream and break deadlocks. + // The scenario this deals with is illustrated in `test/C/src/federated/FeedbackDelay2.lf` + // and `test/C/src/federated/FeedbackDelay4.lf`. + // Note that this is transitive. 
+ // NOTE: This is not needed for enclaves because zero-delay loops are prohibited. + // It's only needed for federates, which is why this is implemented here. + for (int j = 0; j < e->num_upstream; j++) { + scheduling_node_t* upstream = rti_remote->base.scheduling_nodes[e->upstream[j]]; + + // Ignore this federate if it has resigned. + if (upstream->state == NOT_CONNECTED) + continue; + + tag_t earliest = earliest_future_incoming_message_tag(upstream); + tag_t strict_earliest = eimt_strict(upstream); // Non-ZDC version. + + // If these tags are equal, then a TAG or PTAG should have already been granted, + // in which case, another will not be sent. But it may not have been already granted. + if (lf_tag_compare(earliest, tag) > 0) { + notify_tag_advance_grant(upstream, tag); + } else if (lf_tag_compare(earliest, tag) == 0 && lf_tag_compare(strict_earliest, tag) > 0) { + notify_provisional_tag_advance_grant(upstream, tag); } } } +} - /** - * Notify a provisional tag advance grant (PTAG) message to the specified federate - * after the physical time reaches the tag. A thread is created to this end. - * - * If a tag advance grant or a provisional one is pending, then do not proceed - * with the thread creation. - * - * @param e The scheduling node. - * @param tag The provisional tag to grant. 
- */ - static void notify_provisional_tag_advance_grant_delayed(scheduling_node_t * e, tag_t tag) { - federate_info_t* fed = (federate_info_t*)e; - - // Proceed with the delayed provisional tag grant notification only if - // there is no pending grant and no provisional pending grant - lf_mutex_lock(&rti_mutex); - if ((lf_tag_compare(fed->pending_grant, NEVER_TAG) == 0) && - (lf_tag_compare(fed->pending_provisional_grant, NEVER_TAG) >= 0)) { - fed->pending_provisional_grant = tag; - lf_thread_create(&(fed->pending_provisional_grant_thread_id), pending_provisional_grant_thread, fed); - } - lf_mutex_unlock(&rti_mutex); +void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { + if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || + lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { + return; + } + // Need to make sure that the destination federate's thread has already + // sent the starting MSG_TYPE_TIMESTAMP message. + while (e->state == PENDING) { + // Need to wait here. + lf_cond_wait(&sent_start_time); } - void notify_provisional_tag_advance_grant(scheduling_node_t * e, tag_t tag) { - if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || - lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { - return; - } - // Need to make sure that the destination federate's thread has already - // sent the starting MSG_TYPE_TIMESTAMP message. - while (e->state == PENDING) { - // Need to wait here. 
- lf_cond_wait(&sent_start_time); + // Check if sending the tag advance grant needs to be delayed or not + // Delay is needed when a federate has, at least one, absent upstream transient + federate_info_t* fed = GET_FED_INFO(e->id); + if (!fed->has_upstream_transient_federates) { + notify_provisional_tag_advance_grant_immediate(e, tag); + } else { + if (get_num_absent_upstream_transients(fed) > 0) { + notify_provisional_tag_advance_grant_delayed(fed, tag); + } else { + notify_provisional_tag_advance_grant_immediate(e, tag); } - - // Check if sending the tag advance grant needs to be delayed or not - // Delay is needed when a federate has, at least one, absent upstream transient federate_info_t* fed = GET_FED_INFO(e->id); if (!fed->has_upstream_transient_federates) { notify_provisional_tag_advance_grant_immediate(e, tag); } else { if (get_num_absent_upstream_transients(fed) > 0) { - notify_provisional_tag_advance_grant_delayed(fed, tag); + notify_grant_delayed(fed, tag, true); } else { notify_provisional_tag_advance_grant_immediate(e, tag); } - federate_info_t* fed = GET_FED_INFO(e->id); - if (!fed->has_upstream_transient_federates) { - notify_provisional_tag_advance_grant_immediate(e, tag); - } else { - if (get_num_absent_upstream_transients(fed) > 0) { - notify_grant_delayed(fed, tag, true); - } else { - notify_provisional_tag_advance_grant_immediate(e, tag); - } - } } + } - void update_federate_next_event_tag_locked(uint16_t federate_id, tag_t next_event_tag) { - federate_info_t* fed = GET_FED_INFO(federate_id); - tag_t min_in_transit_tag = pqueue_tag_peek_tag(fed->in_transit_message_tags); - if (lf_tag_compare(min_in_transit_tag, next_event_tag) < 0) { - next_event_tag = min_in_transit_tag; - } - update_scheduling_node_next_event_tag_locked(&(fed->enclave), next_event_tag); + void update_federate_next_event_tag_locked(uint16_t federate_id, tag_t next_event_tag) { + federate_info_t* fed = GET_FED_INFO(federate_id); + tag_t min_in_transit_tag = 
pqueue_tag_peek_tag(fed->in_transit_message_tags); + if (lf_tag_compare(min_in_transit_tag, next_event_tag) < 0) { + next_event_tag = min_in_transit_tag; } + update_scheduling_node_next_event_tag_locked(&(fed->enclave), next_event_tag); + } - void handle_port_absent_message(federate_info_t * sending_federate, unsigned char* buffer) { - size_t message_size = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int64_t) + sizeof(uint32_t); - - read_from_socket_fail_on_error(&sending_federate->socket, message_size, &(buffer[1]), NULL, - " RTI failed to read port absent message from federate %u.", - sending_federate->enclave.id); - - uint16_t reactor_port_id = extract_uint16(&(buffer[1])); - uint16_t federate_id = extract_uint16(&(buffer[1 + sizeof(uint16_t)])); - tag_t tag = extract_tag(&(buffer[1 + 2 * sizeof(uint16_t)])); - - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_PORT_ABS, sending_federate->enclave.id, &tag); - } - - // Need to acquire the mutex lock to ensure that the thread handling - // messages coming from the socket connected to the destination does not - // issue a TAG before this message has been forwarded. - LF_MUTEX_LOCK(&rti_mutex); + void handle_port_absent_message(federate_info_t * sending_federate, unsigned char* buffer) { + size_t message_size = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int64_t) + sizeof(uint32_t); - // If the destination federate is no longer connected, issue a warning - // and return. - federate_info_t* fed = GET_FED_INFO(federate_id); - if (fed->enclave.state == NOT_CONNECTED) { - LF_MUTEX_UNLOCK(&rti_mutex); - lf_print_warning("RTI: Destination federate %d is no longer connected. 
Dropping message.", federate_id); - LF_PRINT_LOG("Fed status: next_event " PRINTF_TAG ", " - "completed " PRINTF_TAG ", " - "last_granted " PRINTF_TAG ", " - "last_provisionally_granted " PRINTF_TAG ".", - fed->enclave.next_event.time - start_time, fed->enclave.next_event.microstep, - fed->enclave.completed.time - start_time, fed->enclave.completed.microstep, - fed->enclave.last_granted.time - start_time, fed->enclave.last_granted.microstep, - fed->enclave.last_provisionally_granted.time - start_time, - fed->enclave.last_provisionally_granted.microstep); - return; - } + read_from_socket_fail_on_error(&sending_federate->socket, message_size, &(buffer[1]), NULL, + " RTI failed to read port absent message from federate %u.", + sending_federate->enclave.id); - LF_PRINT_LOG("RTI forwarding port absent message for port %u to federate %u.", reactor_port_id, federate_id); + uint16_t reactor_port_id = extract_uint16(&(buffer[1])); + uint16_t federate_id = extract_uint16(&(buffer[1 + sizeof(uint16_t)])); + tag_t tag = extract_tag(&(buffer[1 + 2 * sizeof(uint16_t)])); - // Need to make sure that the destination federate's thread has already - // sent the starting MSG_TYPE_TIMESTAMP message. - while (fed->enclave.state == PENDING) { - // Need to wait here. - lf_cond_wait(&sent_start_time); - } - - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_PORT_ABS, federate_id, &tag); - } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_PORT_ABS, sending_federate->enclave.id, &tag); + } - // Forward the message. - write_to_socket_fail_on_error(&fed->socket, message_size + 1, buffer, &rti_mutex, - "RTI failed to forward message to federate %d.", federate_id); + // Need to acquire the mutex lock to ensure that the thread handling + // messages coming from the socket connected to the destination does not + // issue a TAG before this message has been forwarded. 
+ LF_MUTEX_LOCK(&rti_mutex); + // If the destination federate is no longer connected, issue a warning + // and return. + federate_info_t* fed = GET_FED_INFO(federate_id); + if (fed->enclave.state == NOT_CONNECTED) { LF_MUTEX_UNLOCK(&rti_mutex); + lf_print_warning("RTI: Destination federate %d is no longer connected. Dropping message.", federate_id); + LF_PRINT_LOG("Fed status: next_event " PRINTF_TAG ", " + "completed " PRINTF_TAG ", " + "last_granted " PRINTF_TAG ", " + "last_provisionally_granted " PRINTF_TAG ".", + fed->enclave.next_event.time - start_time, fed->enclave.next_event.microstep, + fed->enclave.completed.time - start_time, fed->enclave.completed.microstep, + fed->enclave.last_granted.time - start_time, fed->enclave.last_granted.microstep, + fed->enclave.last_provisionally_granted.time - start_time, + fed->enclave.last_provisionally_granted.microstep); + return; } - void handle_timed_message(federate_info_t * sending_federate, unsigned char* buffer) { - size_t header_size = - 1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint32_t) + sizeof(int64_t) + sizeof(uint32_t); - // Read the header, minus the first byte which has already been read. - read_from_socket_fail_on_error(&sending_federate->socket, header_size - 1, &(buffer[1]), NULL, - "RTI failed to read the timed message header from remote federate."); - // Extract the header information. of the sender - uint16_t reactor_port_id; - uint16_t federate_id; - size_t length; - tag_t intended_tag; - // Extract information from the header. 
- extract_timed_header(&(buffer[1]), &reactor_port_id, &federate_id, &length, &intended_tag); - - size_t total_bytes_to_read = length + header_size; - size_t bytes_to_read = length; - - if (FED_COM_BUFFER_SIZE < header_size + 1) { - lf_print_error_and_exit("Buffer size (%d) is not large enough to " - "read the header plus one byte.", - FED_COM_BUFFER_SIZE); - } + LF_PRINT_LOG("RTI forwarding port absent message for port %u to federate %u.", reactor_port_id, federate_id); - // Cut up the payload in chunks. - if (bytes_to_read > FED_COM_BUFFER_SIZE - header_size) { - bytes_to_read = FED_COM_BUFFER_SIZE - header_size; - } - - LF_PRINT_LOG("RTI received message from federate %d for federate %u port %u with intended tag " PRINTF_TAG - ". Forwarding.", - sending_federate->enclave.id, federate_id, reactor_port_id, intended_tag.time - lf_time_start(), - intended_tag.microstep); + // Need to make sure that the destination federate's thread has already + // sent the starting MSG_TYPE_TIMESTAMP message. + while (fed->enclave.state == PENDING) { + // Need to wait here. + lf_cond_wait(&sent_start_time); + } - read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, &(buffer[header_size]), NULL, - "RTI failed to read timed message from federate %d.", federate_id); - size_t bytes_read = bytes_to_read + header_size; - // Following only works for string messages. - // LF_PRINT_DEBUG("Message received by RTI: %s.", buffer + header_size); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_PORT_ABS, federate_id, &tag); + } - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_TAGGED_MSG, sending_federate->enclave.id, &intended_tag); - } + // Forward the message. 
+ write_to_socket_fail_on_error(&fed->socket, message_size + 1, buffer, &rti_mutex, + "RTI failed to forward message to federate %d.", federate_id); - // Need to acquire the mutex lock to ensure that the thread handling - // messages coming from the socket connected to the destination does not - // issue a TAG before this message has been forwarded. - LF_MUTEX_LOCK(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); + } - // If the destination federate is no longer connected, issue a warning, - // remove the message from the socket and return. - federate_info_t* fed = GET_FED_INFO(federate_id); - if (fed->enclave.state == NOT_CONNECTED) { - lf_print_warning("RTI: Destination federate %d is no longer connected. Dropping message.", federate_id); - LF_PRINT_LOG("Fed status: next_event " PRINTF_TAG ", " - "completed " PRINTF_TAG ", " - "last_granted " PRINTF_TAG ", " - "last_provisionally_granted " PRINTF_TAG ".", - fed->enclave.next_event.time - start_time, fed->enclave.next_event.microstep, - fed->enclave.completed.time - start_time, fed->enclave.completed.microstep, - fed->enclave.last_granted.time - start_time, fed->enclave.last_granted.microstep, - fed->enclave.last_provisionally_granted.time - start_time, - fed->enclave.last_provisionally_granted.microstep); - // If the message was larger than the buffer, we must empty out the remainder also. 
- size_t total_bytes_read = bytes_read; - while (total_bytes_read < total_bytes_to_read) { - bytes_to_read = total_bytes_to_read - total_bytes_read; - if (bytes_to_read > FED_COM_BUFFER_SIZE) { - bytes_to_read = FED_COM_BUFFER_SIZE; - } - read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, - "RTI failed to clear message chunks."); - total_bytes_read += bytes_to_read; - } - LF_MUTEX_UNLOCK(&rti_mutex); - return; - } else { - if (lf_tag_compare(intended_tag, fed->effective_start_tag) < 0) { - // Do not forward the message if the federate is connected, but its - // start_time is not reached yet - lf_mutex_unlock(&rti_mutex); - return; - } - } + void handle_timed_message(federate_info_t * sending_federate, unsigned char* buffer) { + size_t header_size = + 1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint32_t) + sizeof(int64_t) + sizeof(uint32_t); + // Read the header, minus the first byte which has already been read. + read_from_socket_fail_on_error(&sending_federate->socket, header_size - 1, &(buffer[1]), NULL, + "RTI failed to read the timed message header from remote federate."); + // Extract the header information. of the sender + uint16_t reactor_port_id; + uint16_t federate_id; + size_t length; + tag_t intended_tag; + // Extract information from the header. + extract_timed_header(&(buffer[1]), &reactor_port_id, &federate_id, &length, &intended_tag); + + size_t total_bytes_to_read = length + header_size; + size_t bytes_to_read = length; + + if (FED_COM_BUFFER_SIZE < header_size + 1) { + lf_print_error_and_exit("Buffer size (%d) is not large enough to " + "read the header plus one byte.", + FED_COM_BUFFER_SIZE); + } - LF_PRINT_DEBUG("RTI forwarding message to port %d of federate %hu of length %zu.", reactor_port_id, federate_id, - length); + // Cut up the payload in chunks. 
+ if (bytes_to_read > FED_COM_BUFFER_SIZE - header_size) { + bytes_to_read = FED_COM_BUFFER_SIZE - header_size; + } - // Need to make sure that the destination federate's thread has already - // sent the starting MSG_TYPE_TIMESTAMP message. - while (fed->enclave.state == PENDING) { - // Need to wait here. - lf_cond_wait(&sent_start_time); - } + LF_PRINT_LOG("RTI received message from federate %d for federate %u port %u with intended tag " PRINTF_TAG + ". Forwarding.", + sending_federate->enclave.id, federate_id, reactor_port_id, intended_tag.time - lf_time_start(), + intended_tag.microstep); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_TAGGED_MSG, federate_id, &intended_tag); - } + read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, &(buffer[header_size]), NULL, + "RTI failed to read timed message from federate %d.", federate_id); + size_t bytes_read = bytes_to_read + header_size; + // Following only works for string messages. + // LF_PRINT_DEBUG("Message received by RTI: %s.", buffer + header_size); - write_to_socket_fail_on_error(&fed->socket, bytes_read, buffer, &rti_mutex, - "RTI failed to forward message to federate %d.", federate_id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_TAGGED_MSG, sending_federate->enclave.id, &intended_tag); + } - // The message length may be longer than the buffer, - // in which case we have to handle it in chunks. + // Need to acquire the mutex lock to ensure that the thread handling + // messages coming from the socket connected to the destination does not + // issue a TAG before this message has been forwarded. + LF_MUTEX_LOCK(&rti_mutex); + + // If the destination federate is no longer connected, issue a warning, + // remove the message from the socket and return. + federate_info_t* fed = GET_FED_INFO(federate_id); + if (fed->enclave.state == NOT_CONNECTED) { + lf_print_warning("RTI: Destination federate %d is no longer connected. 
Dropping message.", federate_id); + LF_PRINT_LOG("Fed status: next_event " PRINTF_TAG ", " + "completed " PRINTF_TAG ", " + "last_granted " PRINTF_TAG ", " + "last_provisionally_granted " PRINTF_TAG ".", + fed->enclave.next_event.time - start_time, fed->enclave.next_event.microstep, + fed->enclave.completed.time - start_time, fed->enclave.completed.microstep, + fed->enclave.last_granted.time - start_time, fed->enclave.last_granted.microstep, + fed->enclave.last_provisionally_granted.time - start_time, + fed->enclave.last_provisionally_granted.microstep); + // If the message was larger than the buffer, we must empty out the remainder also. size_t total_bytes_read = bytes_read; while (total_bytes_read < total_bytes_to_read) { - LF_PRINT_DEBUG("Forwarding message in chunks."); bytes_to_read = total_bytes_to_read - total_bytes_read; if (bytes_to_read > FED_COM_BUFFER_SIZE) { bytes_to_read = FED_COM_BUFFER_SIZE; } read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, - "RTI failed to read message chunks."); + "RTI failed to clear message chunks."); total_bytes_read += bytes_to_read; - - // FIXME: a mutex needs to be held for this so that other threads - // do not write to destination_socket and cause interleaving. However, - // holding the rti_mutex might be very expensive. Instead, each outgoing - // socket should probably have its own mutex. - write_to_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, &rti_mutex, - "RTI failed to send message chunks."); } - - // Record this in-transit message in federate's in-transit message queue. - if (lf_tag_compare(fed->enclave.completed, intended_tag) < 0) { - // Add a record of this message to the list of in-transit messages to this federate. 
- pqueue_tag_insert_if_no_match(fed->in_transit_message_tags, intended_tag); - LF_PRINT_DEBUG("RTI: Adding a message with tag " PRINTF_TAG - " to the list of in-transit messages for federate %d.", - intended_tag.time - lf_time_start(), intended_tag.microstep, federate_id); - } else { - lf_print_error("RTI: Federate %d has already completed tag " PRINTF_TAG - ", but there is an in-transit message with tag " PRINTF_TAG " from federate %hu. " - "This is going to cause an STP violation under centralized coordination.", - federate_id, fed->enclave.completed.time - lf_time_start(), fed->enclave.completed.microstep, - intended_tag.time - lf_time_start(), intended_tag.microstep, sending_federate->enclave.id); - // FIXME: Drop the federate? + LF_MUTEX_UNLOCK(&rti_mutex); + return; + } else { + if (lf_tag_compare(intended_tag, fed->effective_start_tag) < 0) { + // Do not forward the message if the federate is connected, but its + // start_time is not reached yet + lf_mutex_unlock(&rti_mutex); + return; } + } - // If the message tag is less than the most recently received NET from the federate, - // then update the federate's next event tag to match the message tag. - if (lf_tag_compare(intended_tag, fed->enclave.next_event) < 0) { - update_federate_next_event_tag_locked(federate_id, intended_tag); - } + LF_PRINT_DEBUG("RTI forwarding message to port %d of federate %hu of length %zu.", reactor_port_id, federate_id, + length); - LF_MUTEX_UNLOCK(&rti_mutex); + // Need to make sure that the destination federate's thread has already + // sent the starting MSG_TYPE_TIMESTAMP message. + while (fed->enclave.state == PENDING) { + // Need to wait here. 
+ lf_cond_wait(&sent_start_time); } - void handle_latest_tag_confirmed(federate_info_t * fed) { - unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; - read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, - "RTI failed to read the content of the logical tag complete from federate %d.", - fed->enclave.id); - tag_t completed = extract_tag(buffer); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_LTC, fed->enclave.id, &completed); - } - _logical_tag_complete(&(fed->enclave), completed); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_TAGGED_MSG, federate_id, &intended_tag); + } - // FIXME: Should this function be in the enclave version? - LF_MUTEX_LOCK(&rti_mutex); - // See if we can remove any of the recorded in-transit messages for this. - pqueue_tag_remove_up_to(fed->in_transit_message_tags, completed); - LF_MUTEX_UNLOCK(&rti_mutex); + write_to_socket_fail_on_error(&fed->socket, bytes_read, buffer, &rti_mutex, + "RTI failed to forward message to federate %d.", federate_id); + + // The message length may be longer than the buffer, + // in which case we have to handle it in chunks. + size_t total_bytes_read = bytes_read; + while (total_bytes_read < total_bytes_to_read) { + LF_PRINT_DEBUG("Forwarding message in chunks."); + bytes_to_read = total_bytes_to_read - total_bytes_read; + if (bytes_to_read > FED_COM_BUFFER_SIZE) { + bytes_to_read = FED_COM_BUFFER_SIZE; + } + read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, + "RTI failed to read message chunks."); + total_bytes_read += bytes_to_read; + + // FIXME: a mutex needs to be held for this so that other threads + // do not write to destination_socket and cause interleaving. However, + // holding the rti_mutex might be very expensive. Instead, each outgoing + // socket should probably have its own mutex. 
+ write_to_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, &rti_mutex, + "RTI failed to send message chunks."); } - void handle_next_event_tag(federate_info_t * fed) { - unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; - read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, - "RTI failed to read the content of the next event tag from federate %d.", - fed->enclave.id); + // Record this in-transit message in federate's in-transit message queue. + if (lf_tag_compare(fed->enclave.completed, intended_tag) < 0) { + // Add a record of this message to the list of in-transit messages to this federate. + pqueue_tag_insert_if_no_match(fed->in_transit_message_tags, intended_tag); + LF_PRINT_DEBUG("RTI: Adding a message with tag " PRINTF_TAG + " to the list of in-transit messages for federate %d.", + intended_tag.time - lf_time_start(), intended_tag.microstep, federate_id); + } else { + lf_print_error("RTI: Federate %d has already completed tag " PRINTF_TAG + ", but there is an in-transit message with tag " PRINTF_TAG " from federate %hu. " + "This is going to cause an STP violation under centralized coordination.", + federate_id, fed->enclave.completed.time - lf_time_start(), fed->enclave.completed.microstep, + intended_tag.time - lf_time_start(), intended_tag.microstep, sending_federate->enclave.id); + // FIXME: Drop the federate? + } - // Acquire a mutex lock to ensure that this state does not change while a - // message is in transport or being used to determine a TAG. - LF_MUTEX_LOCK(&rti_mutex); // FIXME: Instead of using a mutex, it might be more efficient to use a - // select() mechanism to read and process federates' buffers in an orderly fashion. + // If the message tag is less than the most recently received NET from the federate, + // then update the federate's next event tag to match the message tag. 
+ if (lf_tag_compare(intended_tag, fed->enclave.next_event) < 0) { + update_federate_next_event_tag_locked(federate_id, intended_tag); + } - tag_t intended_tag = extract_tag(buffer); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_NET, fed->enclave.id, &intended_tag); - } - LF_PRINT_LOG("RTI received from federate %d the Next Event Tag (NET) " PRINTF_TAG, fed->enclave.id, - intended_tag.time - start_time, intended_tag.microstep); - update_federate_next_event_tag_locked(fed->enclave.id, intended_tag); - LF_MUTEX_UNLOCK(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); + } + + void handle_latest_tag_confirmed(federate_info_t * fed) { + unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; + read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, + "RTI failed to read the content of the logical tag complete from federate %d.", + fed->enclave.id); + tag_t completed = extract_tag(buffer); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_LTC, fed->enclave.id, &completed); } + _logical_tag_complete(&(fed->enclave), completed); - /////////////////// STOP functions //////////////////// - - /** - * Boolean used to prevent the RTI from sending the - * MSG_TYPE_STOP_GRANTED message multiple times. - */ - bool stop_granted_already_sent_to_federates = false; - - /** - * Once the RTI has seen proposed tags from all connected federates, - * it will broadcast a MSG_TYPE_STOP_GRANTED carrying the _RTI.max_stop_tag. - * This function also checks the most recently received NET from - * each federate and resets that be no greater than the _RTI.max_stop_tag. - * - * This function assumes the caller holds the rti_mutex lock. - */ - static void broadcast_stop_time_to_federates_locked() { - if (stop_granted_already_sent_to_federates == true) { - return; - } - stop_granted_already_sent_to_federates = true; + // FIXME: Should this function be in the enclave version? 
+ LF_MUTEX_LOCK(&rti_mutex); + // See if we can remove any of the recorded in-transit messages for this. + pqueue_tag_remove_up_to(fed->in_transit_message_tags, completed); + LF_MUTEX_UNLOCK(&rti_mutex); + } - // Reply with a stop granted to all federates - unsigned char outgoing_buffer[MSG_TYPE_STOP_GRANTED_LENGTH]; - ENCODE_STOP_GRANTED(outgoing_buffer, rti_remote->base.max_stop_tag.time, rti_remote->base.max_stop_tag.microstep); + void handle_next_event_tag(federate_info_t * fed) { + unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; + read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, + "RTI failed to read the content of the next event tag from federate %d.", + fed->enclave.id); - // Iterate over federates and send each the message. - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (fed->enclave.state == NOT_CONNECTED) { - continue; - } - if (lf_tag_compare(fed->enclave.next_event, rti_remote->base.max_stop_tag) >= 0) { - // Need the next_event to be no greater than the stop tag. - fed->enclave.next_event = rti_remote->base.max_stop_tag; - } - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_STOP_GRN, fed->enclave.id, &rti_remote->base.max_stop_tag); - } - write_to_socket_fail_on_error(&fed->socket, MSG_TYPE_STOP_GRANTED_LENGTH, outgoing_buffer, &rti_mutex, - "RTI failed to send MSG_TYPE_STOP_GRANTED message to federate %d.", - fed->enclave.id); - } + // Acquire a mutex lock to ensure that this state does not change while a + // message is in transport or being used to determine a TAG. + LF_MUTEX_LOCK(&rti_mutex); // FIXME: Instead of using a mutex, it might be more efficient to use a + // select() mechanism to read and process federates' buffers in an orderly fashion. 
- LF_PRINT_LOG("RTI sent to federates MSG_TYPE_STOP_GRANTED with tag " PRINTF_TAG, - rti_remote->base.max_stop_tag.time - start_time, rti_remote->base.max_stop_tag.microstep); - } - - /** - * Mark a federate requesting stop. If the number of federates handling stop reaches - * the number of persistent federates, broadcast MSG_TYPE_STOP_GRANTED to every federate. - * This function assumes the _RTI.mutex is already locked. - * @param fed The federate that has requested a stop. - * @return 1 if stop time has been sent to all federates and 0 otherwise. - */ - static int mark_federate_requesting_stop(federate_info_t * fed) { - if (!fed->requested_stop) { - // Increment the number of federates handling stop only if it is persistent - if (!fed->is_transient) - rti_remote->base.num_scheduling_nodes_handling_stop++; - fed->requested_stop = true; - } - if (rti_remote->base.num_scheduling_nodes_handling_stop == - (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { - // We now have information about the stop time of all - // federates. - broadcast_stop_time_to_federates_locked(); - return 1; - } - return 0; + tag_t intended_tag = extract_tag(buffer); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_NET, fed->enclave.id, &intended_tag); } + LF_PRINT_LOG("RTI received from federate %d the Next Event Tag (NET) " PRINTF_TAG, fed->enclave.id, + intended_tag.time - start_time, intended_tag.microstep); + update_federate_next_event_tag_locked(fed->enclave.id, intended_tag); + LF_MUTEX_UNLOCK(&rti_mutex); + } - /** - * Thread to time out if federates do not reply to stop request. - */ - static void* wait_for_stop_request_reply(void* args) { - initialize_lf_thread_id(); - // Divide the time into small chunks and check periodically. 
- interval_t chunk = MAX_TIME_FOR_REPLY_TO_STOP_REQUEST / 30; - int count = 0; - while (count++ < 30) { - if (stop_granted_already_sent_to_federates) - return NULL; - lf_sleep(chunk); - } - // If we reach here, then error out. - lf_print_error_and_exit("Received only %d stop request replies within timeout " PRINTF_TIME "ns. RTI is exiting.", - rti_remote->base.num_scheduling_nodes_handling_stop, MAX_TIME_FOR_REPLY_TO_STOP_REQUEST); - return NULL; - } + /////////////////// STOP functions //////////////////// - void handle_stop_request_message(federate_info_t * fed) { - LF_PRINT_DEBUG("RTI handling stop_request from federate %d.", fed->enclave.id); + /** + * Boolean used to prevent the RTI from sending the + * MSG_TYPE_STOP_GRANTED message multiple times. + */ + bool stop_granted_already_sent_to_federates = false; - size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_LENGTH - 1; - unsigned char buffer[bytes_to_read]; - read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, NULL, - "RTI failed to read the MSG_TYPE_STOP_REQUEST payload from federate %d.", - fed->enclave.id); + /** + * Once the RTI has seen proposed tags from all connected federates, + * it will broadcast a MSG_TYPE_STOP_GRANTED carrying the _RTI.max_stop_tag. + * This function also checks the most recently received NET from + * each federate and resets that be no greater than the _RTI.max_stop_tag. + * + * This function assumes the caller holds the rti_mutex lock. 
+ */ + static void broadcast_stop_time_to_federates_locked() { + if (stop_granted_already_sent_to_federates == true) { + return; + } + stop_granted_already_sent_to_federates = true; - // Extract the proposed stop tag for the federate - tag_t proposed_stop_tag = extract_tag(buffer); + // Reply with a stop granted to all federates + unsigned char outgoing_buffer[MSG_TYPE_STOP_GRANTED_LENGTH]; + ENCODE_STOP_GRANTED(outgoing_buffer, rti_remote->base.max_stop_tag.time, rti_remote->base.max_stop_tag.microstep); + // Iterate over federates and send each the message. + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->enclave.state == NOT_CONNECTED) { + continue; + } + if (lf_tag_compare(fed->enclave.next_event, rti_remote->base.max_stop_tag) >= 0) { + // Need the next_event to be no greater than the stop tag. + fed->enclave.next_event = rti_remote->base.max_stop_tag; + } if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_STOP_REQ, fed->enclave.id, &proposed_stop_tag); + tracepoint_rti_to_federate(send_STOP_GRN, fed->enclave.id, &rti_remote->base.max_stop_tag); } + write_to_socket_fail_on_error(&fed->socket, MSG_TYPE_STOP_GRANTED_LENGTH, outgoing_buffer, &rti_mutex, + "RTI failed to send MSG_TYPE_STOP_GRANTED message to federate %d.", + fed->enclave.id); + } - LF_PRINT_LOG("RTI received from federate %d a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", - fed->enclave.id, proposed_stop_tag.time - start_time, proposed_stop_tag.microstep); + LF_PRINT_LOG("RTI sent to federates MSG_TYPE_STOP_GRANTED with tag " PRINTF_TAG, + rti_remote->base.max_stop_tag.time - start_time, rti_remote->base.max_stop_tag.microstep); + } - // Acquire a mutex lock to ensure that this state does change while a - // message is in transport or being used to determine a TAG. - LF_MUTEX_LOCK(&rti_mutex); + /** + * Mark a federate requesting stop. 
If the number of federates handling stop reaches + * the number of persistent federates, broadcast MSG_TYPE_STOP_GRANTED to every federate. + * This function assumes the _RTI.mutex is already locked. + * @param fed The federate that has requested a stop. + * @return 1 if stop time has been sent to all federates and 0 otherwise. + */ + static int mark_federate_requesting_stop(federate_info_t * fed) { + if (!fed->requested_stop) { + // Increment the number of federates handling stop only if it is persistent + if (!fed->is_transient) + rti_remote->base.num_scheduling_nodes_handling_stop++; + fed->requested_stop = true; + } + if (rti_remote->base.num_scheduling_nodes_handling_stop == + (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { + // We now have information about the stop time of all + // federates. + broadcast_stop_time_to_federates_locked(); + return 1; + } + return 0; + } - // Check whether we have already received a stop_tag - // from this federate - if (fed->requested_stop) { - // If stop request messages have already been broadcast, treat this as if it were a reply. - if (rti_remote->stop_in_progress) { - mark_federate_requesting_stop(fed); - } - LF_MUTEX_UNLOCK(&rti_mutex); - return; - } + /** + * Thread to time out if federates do not reply to stop request. + */ + static void* wait_for_stop_request_reply(void* args) { + initialize_lf_thread_id(); + // Divide the time into small chunks and check periodically. + interval_t chunk = MAX_TIME_FOR_REPLY_TO_STOP_REQUEST / 30; + int count = 0; + while (count++ < 30) { + if (stop_granted_already_sent_to_federates) + return NULL; + lf_sleep(chunk); + } + // If we reach here, then error out. + lf_print_error_and_exit("Received only %d stop request replies within timeout " PRINTF_TIME "ns. 
RTI is exiting.", + rti_remote->base.num_scheduling_nodes_handling_stop, MAX_TIME_FOR_REPLY_TO_STOP_REQUEST); + return NULL; + } - // Update the maximum stop tag received from federates - if (lf_tag_compare(proposed_stop_tag, rti_remote->base.max_stop_tag) > 0) { - rti_remote->base.max_stop_tag = proposed_stop_tag; - } + void handle_stop_request_message(federate_info_t * fed) { + LF_PRINT_DEBUG("RTI handling stop_request from federate %d.", fed->enclave.id); - // If all federates have replied, send stop request granted. - if (mark_federate_requesting_stop(fed)) { - // Have send stop request granted to all federates. Nothing more to do. - LF_MUTEX_UNLOCK(&rti_mutex); - return; - } + size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_LENGTH - 1; + unsigned char buffer[bytes_to_read]; + read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, NULL, + "RTI failed to read the MSG_TYPE_STOP_REQUEST payload from federate %d.", + fed->enclave.id); + + // Extract the proposed stop tag for the federate + tag_t proposed_stop_tag = extract_tag(buffer); + + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_STOP_REQ, fed->enclave.id, &proposed_stop_tag); + } - // Forward the stop request to all other federates that have not - // also issued a stop request. - unsigned char stop_request_buffer[MSG_TYPE_STOP_REQUEST_LENGTH]; - ENCODE_STOP_REQUEST(stop_request_buffer, rti_remote->base.max_stop_tag.time, - rti_remote->base.max_stop_tag.microstep); + LF_PRINT_LOG("RTI received from federate %d a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", + fed->enclave.id, proposed_stop_tag.time - start_time, proposed_stop_tag.microstep); - // Iterate over federates and send each the MSG_TYPE_STOP_REQUEST message - // if we do not have a stop_time already for them. Do not do this more than once. + // Acquire a mutex lock to ensure that this state does change while a + // message is in transport or being used to determine a TAG. 
+ LF_MUTEX_LOCK(&rti_mutex); + + // Check whether we have already received a stop_tag + // from this federate + if (fed->requested_stop) { + // If stop request messages have already been broadcast, treat this as if it were a reply. if (rti_remote->stop_in_progress) { - LF_MUTEX_UNLOCK(&rti_mutex); - return; + mark_federate_requesting_stop(fed); } - rti_remote->stop_in_progress = true; - // Need a timeout here in case a federate never replies. - lf_thread_t timeout_thread; - lf_thread_create(&timeout_thread, wait_for_stop_request_reply, NULL); + LF_MUTEX_UNLOCK(&rti_mutex); + return; + } - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* f = GET_FED_INFO(i); - if (f->enclave.id != fed->enclave.id && f->requested_stop == false) { - if (f->enclave.state == NOT_CONNECTED) { - mark_federate_requesting_stop(f); - continue; - } - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_STOP_REQ, f->enclave.id, &rti_remote->base.max_stop_tag); - } - write_to_socket_fail_on_error(&f->socket, MSG_TYPE_STOP_REQUEST_LENGTH, stop_request_buffer, &rti_mutex, - "RTI failed to forward MSG_TYPE_STOP_REQUEST message to federate %d.", - f->enclave.id); + // Update the maximum stop tag received from federates + if (lf_tag_compare(proposed_stop_tag, rti_remote->base.max_stop_tag) > 0) { + rti_remote->base.max_stop_tag = proposed_stop_tag; + } + + // If all federates have replied, send stop request granted. + if (mark_federate_requesting_stop(fed)) { + // Have send stop request granted to all federates. Nothing more to do. + LF_MUTEX_UNLOCK(&rti_mutex); + return; + } + + // Forward the stop request to all other federates that have not + // also issued a stop request. 
+ unsigned char stop_request_buffer[MSG_TYPE_STOP_REQUEST_LENGTH]; + ENCODE_STOP_REQUEST(stop_request_buffer, rti_remote->base.max_stop_tag.time, + rti_remote->base.max_stop_tag.microstep); + + // Iterate over federates and send each the MSG_TYPE_STOP_REQUEST message + // if we do not have a stop_time already for them. Do not do this more than once. + if (rti_remote->stop_in_progress) { + LF_MUTEX_UNLOCK(&rti_mutex); + return; + } + rti_remote->stop_in_progress = true; + // Need a timeout here in case a federate never replies. + lf_thread_t timeout_thread; + lf_thread_create(&timeout_thread, wait_for_stop_request_reply, NULL); + + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* f = GET_FED_INFO(i); + if (f->enclave.id != fed->enclave.id && f->requested_stop == false) { + if (f->enclave.state == NOT_CONNECTED) { + mark_federate_requesting_stop(f); + continue; } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_STOP_REQ, f->enclave.id, &rti_remote->base.max_stop_tag); + } + write_to_socket_fail_on_error(&f->socket, MSG_TYPE_STOP_REQUEST_LENGTH, stop_request_buffer, &rti_mutex, + "RTI failed to forward MSG_TYPE_STOP_REQUEST message to federate %d.", + f->enclave.id); } - LF_PRINT_LOG("RTI forwarded to federates MSG_TYPE_STOP_REQUEST with tag (" PRINTF_TIME ", %u).", - rti_remote->base.max_stop_tag.time - start_time, rti_remote->base.max_stop_tag.microstep); - LF_MUTEX_UNLOCK(&rti_mutex); } + LF_PRINT_LOG("RTI forwarded to federates MSG_TYPE_STOP_REQUEST with tag (" PRINTF_TIME ", %u).", + rti_remote->base.max_stop_tag.time - start_time, rti_remote->base.max_stop_tag.microstep); + LF_MUTEX_UNLOCK(&rti_mutex); + } - void handle_stop_request_reply(federate_info_t * fed) { - size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_REPLY_LENGTH - 1; - unsigned char buffer_stop_time[bytes_to_read]; - read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer_stop_time, NULL, - "RTI failed to read the reply to 
MSG_TYPE_STOP_REQUEST message from federate %d.", - fed->enclave.id); + void handle_stop_request_reply(federate_info_t * fed) { + size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_REPLY_LENGTH - 1; + unsigned char buffer_stop_time[bytes_to_read]; + read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer_stop_time, NULL, + "RTI failed to read the reply to MSG_TYPE_STOP_REQUEST message from federate %d.", + fed->enclave.id); - tag_t federate_stop_tag = extract_tag(buffer_stop_time); + tag_t federate_stop_tag = extract_tag(buffer_stop_time); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_STOP_REQ_REP, fed->enclave.id, &federate_stop_tag); - } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_STOP_REQ_REP, fed->enclave.id, &federate_stop_tag); + } - LF_PRINT_LOG("RTI received from federate %d STOP reply tag " PRINTF_TAG ".", fed->enclave.id, - federate_stop_tag.time - start_time, federate_stop_tag.microstep); + LF_PRINT_LOG("RTI received from federate %d STOP reply tag " PRINTF_TAG ".", fed->enclave.id, + federate_stop_tag.time - start_time, federate_stop_tag.microstep); - // Acquire the mutex lock so that we can change the state of the RTI - LF_MUTEX_LOCK(&rti_mutex); - // If the federate has not requested stop before, count the reply - if (lf_tag_compare(federate_stop_tag, rti_remote->base.max_stop_tag) > 0) { - rti_remote->base.max_stop_tag = federate_stop_tag; - } - mark_federate_requesting_stop(fed); - LF_MUTEX_UNLOCK(&rti_mutex); + // Acquire the mutex lock so that we can change the state of the RTI + LF_MUTEX_LOCK(&rti_mutex); + // If the federate has not requested stop before, count the reply + if (lf_tag_compare(federate_stop_tag, rti_remote->base.max_stop_tag) > 0) { + rti_remote->base.max_stop_tag = federate_stop_tag; } + mark_federate_requesting_stop(fed); + LF_MUTEX_UNLOCK(&rti_mutex); + } - ////////////////////////////////////////////////// + 
////////////////////////////////////////////////// - void handle_address_query(uint16_t fed_id) { - federate_info_t* fed = GET_FED_INFO(fed_id); - // Use buffer both for reading and constructing the reply. - // The length is what is needed for the reply. - unsigned char buffer[1 + sizeof(int32_t)]; - read_from_socket_fail_on_error(&fed->socket, sizeof(uint16_t), (unsigned char*)buffer, NULL, - "Failed to read address query."); - uint16_t remote_fed_id = extract_uint16(buffer); + void handle_address_query(uint16_t fed_id) { + federate_info_t* fed = GET_FED_INFO(fed_id); + // Use buffer both for reading and constructing the reply. + // The length is what is needed for the reply. + unsigned char buffer[1 + sizeof(int32_t)]; + read_from_socket_fail_on_error(&fed->socket, sizeof(uint16_t), (unsigned char*)buffer, NULL, + "Failed to read address query."); + uint16_t remote_fed_id = extract_uint16(buffer); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_ADR_QR, fed_id, NULL); - } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_ADR_QR, fed_id, NULL); + } - LF_PRINT_DEBUG("RTI received address query from %d for %d.", fed_id, remote_fed_id); + LF_PRINT_DEBUG("RTI received address query from %d for %d.", fed_id, remote_fed_id); - // NOTE: server_port initializes to -1, which means the RTI does not know - // the port number because it has not yet received an MSG_TYPE_ADDRESS_ADVERTISEMENT message - // from this federate. In that case, it will respond by sending -1. + // NOTE: server_port initializes to -1, which means the RTI does not know + // the port number because it has not yet received an MSG_TYPE_ADDRESS_ADVERTISEMENT message + // from this federate. In that case, it will respond by sending -1. - // Response message is MSG_TYPE_ADDRESS_QUERY_REPLY. - buffer[0] = MSG_TYPE_ADDRESS_QUERY_REPLY; + // Response message is MSG_TYPE_ADDRESS_QUERY_REPLY. 
+ buffer[0] = MSG_TYPE_ADDRESS_QUERY_REPLY; - // Encode the port number. - federate_info_t* remote_fed = GET_FED_INFO(remote_fed_id); + // Encode the port number. + federate_info_t* remote_fed = GET_FED_INFO(remote_fed_id); - // Send the port number (which could be -1). - LF_MUTEX_LOCK(&rti_mutex); - encode_int32(remote_fed->server_port, (unsigned char*)&buffer[1]); - write_to_socket_fail_on_error(&fed->socket, sizeof(int32_t) + 1, (unsigned char*)buffer, &rti_mutex, - "Failed to write port number to socket of federate %d.", fed_id); - - // Send the server IP address to federate. - write_to_socket_fail_on_error(&fed->socket, sizeof(remote_fed->server_ip_addr), - (unsigned char*)&remote_fed->server_ip_addr, &rti_mutex, - "Failed to write ip address to socket of federate %d.", fed_id); - LF_MUTEX_UNLOCK(&rti_mutex); + // Send the port number (which could be -1). + LF_MUTEX_LOCK(&rti_mutex); + encode_int32(remote_fed->server_port, (unsigned char*)&buffer[1]); + write_to_socket_fail_on_error(&fed->socket, sizeof(int32_t) + 1, (unsigned char*)buffer, &rti_mutex, + "Failed to write port number to socket of federate %d.", fed_id); + + // Send the server IP address to federate. 
+ write_to_socket_fail_on_error(&fed->socket, sizeof(remote_fed->server_ip_addr), + (unsigned char*)&remote_fed->server_ip_addr, &rti_mutex, + "Failed to write ip address to socket of federate %d.", fed_id); + LF_MUTEX_UNLOCK(&rti_mutex); - LF_PRINT_DEBUG("Replied to address query from federate %d with address %s:%d.", fed_id, - remote_fed->server_hostname, remote_fed->server_port); + LF_PRINT_DEBUG("Replied to address query from federate %d with address %s:%d.", fed_id, remote_fed->server_hostname, + remote_fed->server_port); + } + + void handle_address_ad(uint16_t federate_id) { + federate_info_t* fed = GET_FED_INFO(federate_id); + // Read the port number of the federate that can be used for physical + // connections to other federates + int32_t server_port = -1; + unsigned char buffer[sizeof(int32_t)]; + read_from_socket_fail_on_error(&fed->socket, sizeof(int32_t), (unsigned char*)buffer, NULL, + "Error reading port data from federate %d.", federate_id); + + server_port = extract_int32(buffer); + + assert(server_port < 65536); + + LF_MUTEX_LOCK(&rti_mutex); + fed->server_port = server_port; + LF_MUTEX_UNLOCK(&rti_mutex); + + LF_PRINT_LOG("Received address advertisement with port %d from federate %d.", server_port, federate_id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_ADR_AD, federate_id, NULL); } + } - void handle_address_ad(uint16_t federate_id) { - federate_info_t* fed = GET_FED_INFO(federate_id); - // Read the port number of the federate that can be used for physical - // connections to other federates - int32_t server_port = -1; - unsigned char buffer[sizeof(int32_t)]; - read_from_socket_fail_on_error(&fed->socket, sizeof(int32_t), (unsigned char*)buffer, NULL, - "Error reading port data from federate %d.", federate_id); + /** + * Send the start time to the federate my_fed. + * This function assumes the caller does not hold the mutex. 
+ * + * If it is the startup phase, the start_time will be the maximum received timestamps + * plus an offset. The federate will then receive identical federation_start_time + * and federate_start_tag.time (the federate_start_tag.microstep will be 0). + * If, however, the startup phase is passed, the federate will receive different + * values than sateted above. + * + * @param my_fed the federate to send the start time to. + * @param federation_start_time the federation start_time + * @param federate_start_tag the federate effective start tag + */ + void send_start_tag(federate_info_t * my_fed, instant_t federation_start_time, tag_t federate_start_tag) { + // Send back to the federate the maximum time plus an offset on a TIMESTAMP_START + // message. + // In the startup phase, federates will receive identical start_time and + // effective_start_tag + unsigned char start_time_buffer[MSG_TYPE_TIMESTAMP_START_LENGTH]; + start_time_buffer[0] = MSG_TYPE_TIMESTAMP_START; + encode_int64(swap_bytes_if_big_endian_int64(start_time), &start_time_buffer[1]); + encode_tag(&(start_time_buffer[1 + sizeof(instant_t)]), federate_start_tag); - server_port = extract_int32(buffer); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_TIMESTAMP, my_fed->enclave.id, &federate_start_tag); + } + if (write_to_socket(my_fed->socket, MSG_TYPE_TIMESTAMP_START_LENGTH, start_time_buffer)) { + lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); + } - assert(server_port < 65536); + LF_MUTEX_LOCK(&rti_mutex); + // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP + // message has been sent. That MSG_TYPE_TIMESTAMP message grants time advance to + // the federate to the start time. 
+ my_fed->enclave.state = GRANTED; + lf_cond_broadcast(&sent_start_time); + LF_PRINT_LOG("RTI sent start time " PRINTF_TIME " to federate %d.", start_time, my_fed->enclave.id); + LF_MUTEX_UNLOCK(&rti_mutex); + } - LF_MUTEX_LOCK(&rti_mutex); - fed->server_port = server_port; - LF_MUTEX_UNLOCK(&rti_mutex); + void handle_timestamp(federate_info_t * my_fed) { + unsigned char buffer[sizeof(int64_t)]; + // Read bytes from the socket. We need 8 bytes. + read_from_socket_fail_on_error(&my_fed->socket, sizeof(int64_t), (unsigned char*)&buffer, NULL, + "ERROR reading timestamp from federate %d.\n", my_fed->enclave.id); - LF_PRINT_LOG("Received address advertisement with port %d from federate %d.", server_port, federate_id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_ADR_AD, federate_id, NULL); - } + int64_t timestamp = swap_bytes_if_big_endian_int64(*((int64_t*)(&buffer))); + if (rti_remote->base.tracing_enabled) { + tag_t tag = {.time = timestamp, .microstep = 0}; + tracepoint_rti_from_federate(receive_TIMESTAMP, my_fed->enclave.id, &tag); } + LF_PRINT_DEBUG("RTI received timestamp message with time: " PRINTF_TIME ".", timestamp); - /** - * Send to the start time to the federate my_fed. - * This function assumes the caller does not hold the mutex. - * - * If it is the startup phase, the start_time will be the maximum received timestamps - * plus an offset. The federate will then receive identical federation_start_time - * and federate_start_tag.time (the federate_start_tag.microstep will be 0). - * If, however, the startup phase is passed, the federate will receive different - * values than sateted above. - * - * @param my_fed the federate to send the start time to. 
- * @param federation_start_time the federation start_time - * @param federate_start_tag the federate effective start tag - */ - void send_start_tag(federate_info_t * my_fed, instant_t federation_start_time, tag_t federate_start_tag) { - // Send back to the federate the maximum time plus an offset on a TIMESTAMP_START - // message. - // In the startup phase, federates will receive identical start_time and - // effective_start_tag - unsigned char start_time_buffer[MSG_TYPE_TIMESTAMP_START_LENGTH]; - start_time_buffer[0] = MSG_TYPE_TIMESTAMP_START; - encode_int64(swap_bytes_if_big_endian_int64(start_time), &start_time_buffer[1]); - encode_tag(&(start_time_buffer[1 + sizeof(instant_t)]), federate_start_tag); + LF_MUTEX_LOCK(&rti_mutex); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_TIMESTAMP, my_fed->enclave.id, &federate_start_tag); + // Processing the TIMESTAMP depends on whether it is the startup phase (all + // persistent federates joined) or not. + if (rti_remote->phase == + startup_phase) { // This is equivalent to: rti_remote->num_feds_proposed_start < + // (rti_remote->number_of_enclaves - rti_remote->number_of_transient_federates) + if (timestamp > rti_remote->max_start_time) { + rti_remote->max_start_time = timestamp; } - if (write_to_socket(my_fed->socket, MSG_TYPE_TIMESTAMP_START_LENGTH, start_time_buffer)) { - lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); + // Check that persistent federates did propose a start_time + if (!my_fed->is_transient) { + rti_remote->num_feds_proposed_start++; + } + if (rti_remote->num_feds_proposed_start == + (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { + // All federates have proposed a start time. + lf_cond_broadcast(&received_start_times); + rti_remote->phase = execution_phase; + } else { + // Some federates have not yet proposed a start time. + // wait for a notification. 
+ while (rti_remote->num_feds_proposed_start < + (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { + // FIXME: Should have a timeout here? + lf_cond_wait(&received_start_times); + } } - LF_MUTEX_LOCK(&rti_mutex); - // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP - // message has been sent. That MSG_TYPE_TIMESTAMP message grants time advance to - // the federate to the start time. - my_fed->enclave.state = GRANTED; - lf_cond_broadcast(&sent_start_time); - LF_PRINT_LOG("RTI sent start time " PRINTF_TIME " to federate %d.", start_time, my_fed->enclave.id); LF_MUTEX_UNLOCK(&rti_mutex); - } - void handle_timestamp(federate_info_t * my_fed) { - unsigned char buffer[sizeof(int64_t)]; - // Read bytes from the socket. We need 8 bytes. - read_from_socket_fail_on_error(&my_fed->socket, sizeof(int64_t), (unsigned char*)&buffer, NULL, - "ERROR reading timestamp from federate %d.\n", my_fed->enclave.id); + // Send back to the federate the maximum time plus an offset on a TIMESTAMP + // message. + // Add an offset to this start time to get everyone starting together. + start_time = rti_remote->max_start_time + DELAY_START; + my_fed->effective_start_tag = (tag_t){.time = start_time, .microstep = 0u}; + send_start_tag(my_fed, start_time, my_fed->effective_start_tag); + } else if (rti_remote->phase == shutdown_phase) { + // Do not answer the federate if the federation is in hsutdown phase + // Or maybe send and error message? 
+ LF_MUTEX_UNLOCK(&rti_mutex); + return; + } else { // The federation is the execution phase + // A transient has joined after the startup phase + // At this point, we already hold the mutex - int64_t timestamp = swap_bytes_if_big_endian_int64(*((int64_t*)(&buffer))); - if (rti_remote->base.tracing_enabled) { - tag_t tag = {.time = timestamp, .microstep = 0}; - tracepoint_rti_from_federate(receive_TIMESTAMP, my_fed->enclave.id, &tag); + // This is rather a possible extreme corner case, where a transient sends its timestamp, and only + // enters the if section after all persistents have joined. + if (timestamp < start_time) { + timestamp = start_time; } - LF_PRINT_DEBUG("RTI received timestamp message with time: " PRINTF_TIME ".", timestamp); - LF_MUTEX_LOCK(&rti_mutex); + //// Algorithm for computing the effective_start_time of a joining transient + // The effective_start_time will be the max among all the following tags: + // 1. At tag: (joining time, 0 microstep) + // 2. The latest completed logical tag + 1 microstep + // 3. The latest granted (P)TAG + 1 microstep, of every downstream federate + // 4. The maximun tag of messages from the upstream federates + 1 microstep - // Processing the TIMESTAMP depends on whether it is the startup phase (all - // persistent federates joined) or not. - if (rti_remote->phase == - startup_phase) { // This is equivalent to: rti_remote->num_feds_proposed_start < - // (rti_remote->number_of_enclaves - rti_remote->number_of_transient_federates) - if (timestamp > rti_remote->max_start_time) { - rti_remote->max_start_time = timestamp; - } - // Check that persistent federates did propose a start_time - if (!my_fed->is_transient) { - rti_remote->num_feds_proposed_start++; - } - if (rti_remote->num_feds_proposed_start == - (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { - // All federates have proposed a start time. 
- lf_cond_broadcast(&received_start_times); - rti_remote->phase = execution_phase; - } else { - // Some federates have not yet proposed a start time. - // wait for a notification. - while (rti_remote->num_feds_proposed_start < - (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { - // FIXME: Should have a timeout here? - lf_cond_wait(&received_start_times); - } - } + // Condition 1. + my_fed->effective_start_tag = (tag_t){.time = timestamp, .microstep = 0u}; - LF_MUTEX_UNLOCK(&rti_mutex); - - // Send back to the federate the maximum time plus an offset on a TIMESTAMP - // message. - // Add an offset to this start time to get everyone starting together. - start_time = rti_remote->max_start_time + DELAY_START; - my_fed->effective_start_tag = (tag_t){.time = start_time, .microstep = 0u}; - send_start_tag(my_fed, start_time, my_fed->effective_start_tag); - } else if (rti_remote->phase == shutdown_phase) { - // Do not answer the federate if the federation is in hsutdown phase - // Or maybe send and error message? - LF_MUTEX_UNLOCK(&rti_mutex); - return; - } else { // The federation is the execution phase - // A transient has joined after the startup phase - // At this point, we already hold the mutex - - // This is rather a possible extreme corner case, where a transient sends its timestamp, and only - // enters the if section after all persistents have joined. - if (timestamp < start_time) { - timestamp = start_time; - } + // Condition 2. + // FIXME: Not sure if this corner case can happen, but better to be on the safe side. + if (lf_tag_compare(my_fed->enclave.completed, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = my_fed->enclave.completed; + my_fed->effective_start_tag.microstep++; + } - //// Algorithm for computing the effective_start_time of a joining transient - // The effective_start_time will be the max among all the following tags: - // 1. At tag: (joining time, 0 microstep) - // 2. 
The latest completed logical tag + 1 microstep - // 3. The latest granted (P)TAG + 1 microstep, of every downstream federate - // 4. The maximun tag of messages from the upstream federates + 1 microstep + // Condition 3. Iterate over the downstream federates + for (int j = 0; j < my_fed->enclave.num_downstream; j++) { + federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); - // Condition 1. - my_fed->effective_start_tag = (tag_t){.time = timestamp, .microstep = 0u}; + // Get the max over the TAG of the downstreams + if (lf_tag_compare(downstream->enclave.last_granted, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = downstream->enclave.last_granted; + my_fed->effective_start_tag.microstep++; + } - // Condition 2. - // FIXME: Not sure if this corner case can happen, but better to be on the safe side. - if (lf_tag_compare(my_fed->enclave.completed, my_fed->effective_start_tag) >= 0) { - my_fed->effective_start_tag = my_fed->enclave.completed; + // Get the max over the PTAG of the downstreams + if (lf_tag_compare(downstream->enclave.last_provisionally_granted, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = downstream->enclave.last_provisionally_granted; my_fed->effective_start_tag.microstep++; } + } - // Condition 3. Iterate over the downstream federates - for (int j = 0; j < my_fed->enclave.num_downstream; j++) { - federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); + // Condition 4. 
Iterate over the messages from the upstream federates + for (int j = 0; j < my_fed->enclave.num_upstream; j++) { + federate_info_t* upstream = GET_FED_INFO(my_fed->enclave.upstream[j]); - // Get the max over the TAG of the downstreams - if (lf_tag_compare(downstream->enclave.last_granted, my_fed->effective_start_tag) >= 0) { - my_fed->effective_start_tag = downstream->enclave.last_granted; - my_fed->effective_start_tag.microstep++; - } + // Get the max over the TAG of the upstreams + size_t queue_size = pqueue_tag_size(upstream->in_transit_message_tags); + if (queue_size != 0) { + pqueue_t* pq = (pqueue_t*)(upstream->in_transit_message_tags); + pqueue_tag_element_t* message_with_max_tag = (pqueue_tag_element_t*)(pq->d[queue_size]); + tag_t max_tag = message_with_max_tag->tag; - // Get the max over the PTAG of the downstreams - if (lf_tag_compare(downstream->enclave.last_provisionally_granted, my_fed->effective_start_tag) >= 0) { - my_fed->effective_start_tag = downstream->enclave.last_provisionally_granted; + if (lf_tag_compare(max_tag, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = max_tag; my_fed->effective_start_tag.microstep++; } } + } - // Condition 4. 
Iterate over the messages from the upstream federates - for (int j = 0; j < my_fed->enclave.num_upstream; j++) { - federate_info_t* upstream = GET_FED_INFO(my_fed->enclave.upstream[j]); - - // Get the max over the TAG of the upstreams - size_t queue_size = pqueue_tag_size(upstream->in_transit_message_tags); - if (queue_size != 0) { - pqueue_t* pq = (pqueue_t*)(upstream->in_transit_message_tags); - pqueue_tag_element_t* message_with_max_tag = (pqueue_tag_element_t*)(pq->d[queue_size]); - tag_t max_tag = message_with_max_tag->tag; + // For every downstream that has a pending grant that is higher then the + // effective_start_time of the federate, cancel it + for (int j = 0; j < my_fed->enclave.num_downstream; j++) { + federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); - if (lf_tag_compare(max_tag, my_fed->effective_start_tag) >= 0) { - my_fed->effective_start_tag = max_tag; - my_fed->effective_start_tag.microstep++; - } - } + // Ignore this federate if it has resigned. + if (downstream->enclave.state == NOT_CONNECTED) { + continue; } - // For every downstream that has a pending grant that is higher then the - // effective_start_time of the federate, cancel it - for (int j = 0; j < my_fed->enclave.num_downstream; j++) { - federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); - - // Ignore this federate if it has resigned. 
- if (downstream->enclave.state == NOT_CONNECTED) { - continue; - } - - // Check the pending grants, if any, and keep it only if it is - // sonner than the effective start tag - pqueue_delayed_grant_element_t* dge = - pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, downstream->enclave.id); - if (dge != NULL && lf_tag_compare(dge->base.tag, my_fed->effective_start_tag) > 0) { - pqueue_delayed_grants_remove(rti_remote->delayed_grants, dge); - } + // Check the pending grants, if any, and keep it only if it is + // sonner than the effective start tag + pqueue_delayed_grant_element_t* dge = + pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, downstream->enclave.id); + if (dge != NULL && lf_tag_compare(dge->base.tag, my_fed->effective_start_tag) > 0) { + pqueue_delayed_grants_remove(rti_remote->delayed_grants, dge); } + } - LF_MUTEX_UNLOCK(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); - // Once the effective start time set, sent it to the joining transient, - // together with the start time of the federation. + // Once the effective start time set, sent it to the joining transient, + // together with the start time of the federation. - // Send the start time - send_start_tag(my_fed, start_time, my_fed->effective_start_tag); - } + // Send the start time + send_start_tag(my_fed, start_time, my_fed->effective_start_tag); } + } - void send_physical_clock(unsigned char message_type, federate_info_t* fed, socket_type_t socket_type) { - if (fed->enclave.state == NOT_CONNECTED) { - lf_print_warning("Clock sync: RTI failed to send physical time to federate %d. Socket not connected.\n", - fed->enclave.id); + void send_physical_clock(unsigned char message_type, federate_info_t* fed, socket_type_t socket_type) { + if (fed->enclave.state == NOT_CONNECTED) { + lf_print_warning("Clock sync: RTI failed to send physical time to federate %d. 
Socket not connected.\n", + fed->enclave.id); + return; + } + unsigned char buffer[sizeof(int64_t) + 1]; + buffer[0] = message_type; + int64_t current_physical_time = lf_time_physical(); + encode_int64(current_physical_time, &(buffer[1])); + + // Send the message + if (socket_type == UDP) { + // FIXME: UDP_addr is never initialized. + LF_PRINT_DEBUG("Clock sync: RTI sending UDP message type %u.", buffer[0]); + ssize_t bytes_written = sendto(rti_remote->socket_descriptor_UDP, buffer, 1 + sizeof(int64_t), 0, + (struct sockaddr*)&fed->UDP_addr, sizeof(fed->UDP_addr)); + if (bytes_written < (ssize_t)sizeof(int64_t) + 1) { + lf_print_warning("Clock sync: RTI failed to send physical time to federate %d: %s\n", fed->enclave.id, + strerror(errno)); return; } - unsigned char buffer[sizeof(int64_t) + 1]; - buffer[0] = message_type; - int64_t current_physical_time = lf_time_physical(); - encode_int64(current_physical_time, &(buffer[1])); - - // Send the message - if (socket_type == UDP) { - // FIXME: UDP_addr is never initialized. 
- LF_PRINT_DEBUG("Clock sync: RTI sending UDP message type %u.", buffer[0]); - ssize_t bytes_written = sendto(rti_remote->socket_descriptor_UDP, buffer, 1 + sizeof(int64_t), 0, - (struct sockaddr*)&fed->UDP_addr, sizeof(fed->UDP_addr)); - if (bytes_written < (ssize_t)sizeof(int64_t) + 1) { - lf_print_warning("Clock sync: RTI failed to send physical time to federate %d: %s\n", fed->enclave.id, - strerror(errno)); - return; - } - } else if (socket_type == TCP) { - LF_PRINT_DEBUG("Clock sync: RTI sending TCP message type %u.", buffer[0]); - LF_MUTEX_LOCK(&rti_mutex); - write_to_socket_fail_on_error(&fed->socket, 1 + sizeof(int64_t), buffer, &rti_mutex, - "Clock sync: RTI failed to send physical time to federate %d.", fed->enclave.id); - LF_MUTEX_UNLOCK(&rti_mutex); - } - LF_PRINT_DEBUG("Clock sync: RTI sent PHYSICAL_TIME_SYNC_MESSAGE with timestamp " PRINTF_TIME " to federate %d.", - current_physical_time, fed->enclave.id); - } - - void handle_physical_clock_sync_message(federate_info_t * my_fed, socket_type_t socket_type) { - // Lock the mutex to prevent interference between sending the two - // coded probe messages. + } else if (socket_type == TCP) { + LF_PRINT_DEBUG("Clock sync: RTI sending TCP message type %u.", buffer[0]); LF_MUTEX_LOCK(&rti_mutex); - // Reply with a T4 type message - send_physical_clock(MSG_TYPE_CLOCK_SYNC_T4, my_fed, socket_type); - // Send the corresponding coded probe immediately after, - // but only if this is a UDP channel. 
- if (socket_type == UDP) { - send_physical_clock(MSG_TYPE_CLOCK_SYNC_CODED_PROBE, my_fed, socket_type); - } + write_to_socket_fail_on_error(&fed->socket, 1 + sizeof(int64_t), buffer, &rti_mutex, + "Clock sync: RTI failed to send physical time to federate %d.", fed->enclave.id); LF_MUTEX_UNLOCK(&rti_mutex); } + LF_PRINT_DEBUG("Clock sync: RTI sent PHYSICAL_TIME_SYNC_MESSAGE with timestamp " PRINTF_TIME " to federate %d.", + current_physical_time, fed->enclave.id); + } - void* clock_synchronization_thread(void* noargs) { - initialize_lf_thread_id(); - // Wait until all federates have been notified of the start time. - // FIXME: Use lf_ version of this when merged with master. - LF_MUTEX_LOCK(&rti_mutex); - while (rti_remote->num_feds_proposed_start < rti_remote->base.number_of_scheduling_nodes) { - lf_cond_wait(&received_start_times); - } - LF_MUTEX_UNLOCK(&rti_mutex); + void handle_physical_clock_sync_message(federate_info_t * my_fed, socket_type_t socket_type) { + // Lock the mutex to prevent interference between sending the two + // coded probe messages. + LF_MUTEX_LOCK(&rti_mutex); + // Reply with a T4 type message + send_physical_clock(MSG_TYPE_CLOCK_SYNC_T4, my_fed, socket_type); + // Send the corresponding coded probe immediately after, + // but only if this is a UDP channel. + if (socket_type == UDP) { + send_physical_clock(MSG_TYPE_CLOCK_SYNC_CODED_PROBE, my_fed, socket_type); + } + LF_MUTEX_UNLOCK(&rti_mutex); + } - // Wait until the start time before starting clock synchronization. - // The above wait ensures that start_time has been set. - interval_t ns_to_wait = start_time - lf_time_physical(); + void* clock_synchronization_thread(void* noargs) { + initialize_lf_thread_id(); + // Wait until all federates have been notified of the start time. + // FIXME: Use lf_ version of this when merged with master. 
+ LF_MUTEX_LOCK(&rti_mutex); + while (rti_remote->num_feds_proposed_start < rti_remote->base.number_of_scheduling_nodes) { + lf_cond_wait(&received_start_times); + } + LF_MUTEX_UNLOCK(&rti_mutex); - if (ns_to_wait > 0LL) { - lf_sleep(ns_to_wait); - } + // Wait until the start time before starting clock synchronization. + // The above wait ensures that start_time has been set. + interval_t ns_to_wait = start_time - lf_time_physical(); - // Initiate a clock synchronization every rti->clock_sync_period_ns - bool any_federates_connected = true; - while (any_federates_connected) { - // Sleep - lf_sleep(rti_remote->clock_sync_period_ns); // Can be interrupted - any_federates_connected = false; - for (int fed_id = 0; fed_id < rti_remote->base.number_of_scheduling_nodes; fed_id++) { - federate_info_t* fed = GET_FED_INFO(fed_id); - if (fed->enclave.state == NOT_CONNECTED) { - // FIXME: We need better error handling here, but clock sync failure - // should not stop execution. - lf_print_error("Clock sync failed with federate %d. Not connected.", fed_id); - continue; - } else if (!fed->clock_synchronization_enabled) { - continue; - } - // Send the RTI's current physical time to the federate - // Send on UDP. - LF_PRINT_DEBUG("RTI sending T1 message to initiate clock sync round."); - send_physical_clock(MSG_TYPE_CLOCK_SYNC_T1, fed, UDP); - - // Listen for reply message, which should be T3. - size_t message_size = 1 + sizeof(uint16_t); - unsigned char buffer[message_size]; - // Maximum number of messages that we discard before giving up on this cycle. - // If the T3 message from this federate does not arrive and we keep receiving - // other messages, then give up on this federate and move to the next federate. - int remaining_attempts = 5; - while (remaining_attempts > 0) { - remaining_attempts--; - int read_failed = read_from_socket(rti_remote->socket_descriptor_UDP, message_size, buffer); - // If any errors occur, either discard the message or the clock sync round. 
- if (!read_failed) { - if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { - uint16_t fed_id_2 = extract_uint16(&(buffer[1])); - // Check that this message came from the correct federate. - if (fed_id_2 != fed->enclave.id) { - // Message is from the wrong federate. Discard the message. - lf_print_warning("Clock sync: Received T3 message from federate %d, " - "but expected one from %d. Discarding message.", - fed_id_2, fed->enclave.id); - continue; - } - LF_PRINT_DEBUG("Clock sync: RTI received T3 message from federate %d.", fed_id_2); - handle_physical_clock_sync_message(GET_FED_INFO(fed_id_2), UDP); - break; - } else { - // The message is not a T3 message. Discard the message and - // continue waiting for the T3 message. This is possibly a message - // from a previous cycle that was discarded. - lf_print_warning("Clock sync: Unexpected UDP message %u. Expected %u from federate %d. " - "Discarding message.", - buffer[0], MSG_TYPE_CLOCK_SYNC_T3, fed->enclave.id); + if (ns_to_wait > 0LL) { + lf_sleep(ns_to_wait); + } + + // Initiate a clock synchronization every rti->clock_sync_period_ns + bool any_federates_connected = true; + while (any_federates_connected) { + // Sleep + lf_sleep(rti_remote->clock_sync_period_ns); // Can be interrupted + any_federates_connected = false; + for (int fed_id = 0; fed_id < rti_remote->base.number_of_scheduling_nodes; fed_id++) { + federate_info_t* fed = GET_FED_INFO(fed_id); + if (fed->enclave.state == NOT_CONNECTED) { + // FIXME: We need better error handling here, but clock sync failure + // should not stop execution. + lf_print_error("Clock sync failed with federate %d. Not connected.", fed_id); + continue; + } else if (!fed->clock_synchronization_enabled) { + continue; + } + // Send the RTI's current physical time to the federate + // Send on UDP. + LF_PRINT_DEBUG("RTI sending T1 message to initiate clock sync round."); + send_physical_clock(MSG_TYPE_CLOCK_SYNC_T1, fed, UDP); + + // Listen for reply message, which should be T3. 
+ size_t message_size = 1 + sizeof(uint16_t); + unsigned char buffer[message_size]; + // Maximum number of messages that we discard before giving up on this cycle. + // If the T3 message from this federate does not arrive and we keep receiving + // other messages, then give up on this federate and move to the next federate. + int remaining_attempts = 5; + while (remaining_attempts > 0) { + remaining_attempts--; + int read_failed = read_from_socket(rti_remote->socket_descriptor_UDP, message_size, buffer); + // If any errors occur, either discard the message or the clock sync round. + if (!read_failed) { + if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { + uint16_t fed_id_2 = extract_uint16(&(buffer[1])); + // Check that this message came from the correct federate. + if (fed_id_2 != fed->enclave.id) { + // Message is from the wrong federate. Discard the message. + lf_print_warning("Clock sync: Received T3 message from federate %d, " + "but expected one from %d. Discarding message.", + fed_id_2, fed->enclave.id); continue; } + LF_PRINT_DEBUG("Clock sync: RTI received T3 message from federate %d.", fed_id_2); + handle_physical_clock_sync_message(GET_FED_INFO(fed_id_2), UDP); + break; } else { - lf_print_warning("Clock sync: Read from UDP socket failed: %s. " - "Skipping clock sync round for federate %d.", - strerror(errno), fed->enclave.id); - remaining_attempts = -1; + // The message is not a T3 message. Discard the message and + // continue waiting for the T3 message. This is possibly a message + // from a previous cycle that was discarded. + lf_print_warning("Clock sync: Unexpected UDP message %u. Expected %u from federate %d. " + "Discarding message.", + buffer[0], MSG_TYPE_CLOCK_SYNC_T3, fed->enclave.id); + continue; } + } else { + lf_print_warning("Clock sync: Read from UDP socket failed: %s. 
" + "Skipping clock sync round for federate %d.", + strerror(errno), fed->enclave.id); + remaining_attempts = -1; } - if (remaining_attempts > 0) { - any_federates_connected = true; - } + } + if (remaining_attempts > 0) { + any_federates_connected = true; } } - return NULL; - } - - /** - * Handle MSG_TYPE_FAILED sent by a federate. This message is sent by a federate - * that is exiting in failure. In this case, the RTI will - * also terminate abnormally, returning a non-zero exit code when it exits. - * - * This function assumes the caller does not hold the mutex. - * - * @param my_fed The federate sending a MSG_TYPE_FAILED message. - */ - static void handle_federate_failed(federate_info_t * my_fed) { - // Nothing more to do. Close the socket and exit. - LF_MUTEX_LOCK(&rti_mutex); + } + return NULL; + } - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_FAILED, my_fed->enclave.id, NULL); - } + /** + * Handle MSG_TYPE_FAILED sent by a federate. This message is sent by a federate + * that is exiting in failure. In this case, the RTI will + * also terminate abnormally, returning a non-zero exit code when it exits. + * + * This function assumes the caller does not hold the mutex. + * + * @param my_fed The federate sending a MSG_TYPE_FAILED message. + */ + static void handle_federate_failed(federate_info_t * my_fed) { + // Nothing more to do. Close the socket and exit. + LF_MUTEX_LOCK(&rti_mutex); - // Set the flag telling the RTI to exit with an error code when it exits. - _lf_federate_reports_error = true; - lf_print_error("RTI: Federate %d reports an error and has exited.", my_fed->enclave.id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_FAILED, my_fed->enclave.id, NULL); + } - my_fed->enclave.state = NOT_CONNECTED; + // Set the flag telling the RTI to exit with an error code when it exits. 
+ _lf_federate_reports_error = true; + lf_print_error("RTI: Federate %d reports an error and has exited.", my_fed->enclave.id); - // Indicate that there will no further events from this federate. - my_fed->enclave.next_event = FOREVER_TAG; + my_fed->enclave.state = NOT_CONNECTED; - // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, - // the close should happen when receiving a 0 length message from the other end. - // Here, we just signal the other side that no further writes to the socket are - // forthcoming, which should result in the other end getting a zero-length reception. - shutdown(my_fed->socket, SHUT_RDWR); + // Indicate that there will no further events from this federate. + my_fed->enclave.next_event = FOREVER_TAG; - // We can now safely close the socket. - close(my_fed->socket); // from unistd.h + // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, + // the close should happen when receiving a 0 length message from the other end. + // Here, we just signal the other side that no further writes to the socket are + // forthcoming, which should result in the other end getting a zero-length reception. + shutdown(my_fed->socket, SHUT_RDWR); - // Check downstream federates to see whether they should now be granted a TAG. - // To handle cycles, need to create a boolean array to keep - // track of which upstream federates have been visited. - bool* visited = (bool*)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. - notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); - free(visited); + // We can now safely close the socket. + close(my_fed->socket); // from unistd.h - LF_MUTEX_UNLOCK(&rti_mutex); - } + // Check downstream federates to see whether they should now be granted a TAG. + // To handle cycles, need to create a boolean array to keep + // track of which upstream federates have been visited. 
+ bool* visited = (bool*)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. + notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); + free(visited); - /** - * Handle MSG_TYPE_RESIGN sent by a federate. This message is sent at the time of termination - * after all shutdown events are processed on the federate. - * - * This function assumes the caller does not hold the mutex. - * - * @note At this point, the RTI might have outgoing messages to the federate. This - * function thus first performs a shutdown on the socket, which sends an EOF. It then - * waits for the remote socket to be closed before closing the socket itself. - * - * @param my_fed The federate sending a MSG_TYPE_RESIGN message. - */ - static void handle_federate_resign(federate_info_t * my_fed) { - // Nothing more to do. Close the socket and exit. - LF_MUTEX_LOCK(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); + } - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_RESIGN, my_fed->enclave.id, NULL); - } + /** + * Handle MSG_TYPE_RESIGN sent by a federate. This message is sent at the time of termination + * after all shutdown events are processed on the federate. + * + * This function assumes the caller does not hold the mutex. + * + * @note At this point, the RTI might have outgoing messages to the federate. This + * function thus first performs a shutdown on the socket, which sends an EOF. It then + * waits for the remote socket to be closed before closing the socket itself. + * + * @param my_fed The federate sending a MSG_TYPE_RESIGN message. + */ + static void handle_federate_resign(federate_info_t * my_fed) { + // Nothing more to do. Close the socket and exit. 
+ LF_MUTEX_LOCK(&rti_mutex); - lf_print("RTI: Federate %d has resigned.", my_fed->enclave.id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_RESIGN, my_fed->enclave.id, NULL); + } - my_fed->enclave.state = NOT_CONNECTED; + lf_print("RTI: Federate %d has resigned.", my_fed->enclave.id); - // Indicate that there will no further events from this federate. - my_fed->enclave.next_event = FOREVER_TAG; + my_fed->enclave.state = NOT_CONNECTED; - // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, - // the close should happen when receiving a 0 length message from the other end. - // Here, we just signal the other side that no further writes to the socket are - // forthcoming, which should result in the other end getting a zero-length reception. - shutdown(my_fed->socket, SHUT_WR); + // Indicate that there will no further events from this federate. + my_fed->enclave.next_event = FOREVER_TAG; - // Wait for the federate to send an EOF or a socket error to occur. - // Discard any incoming bytes. Normally, this read should return 0 because - // the federate is resigning and should itself invoke shutdown. - unsigned char buffer[10]; - while (read(my_fed->socket, buffer, 10) > 0) - ; + // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, + // the close should happen when receiving a 0 length message from the other end. + // Here, we just signal the other side that no further writes to the socket are + // forthcoming, which should result in the other end getting a zero-length reception. + shutdown(my_fed->socket, SHUT_WR); - // We can now safely close the socket. - close(my_fed->socket); // from unistd.h + // Wait for the federate to send an EOF or a socket error to occur. + // Discard any incoming bytes. Normally, this read should return 0 because + // the federate is resigning and should itself invoke shutdown. 
+ unsigned char buffer[10]; + while (read(my_fed->socket, buffer, 10) > 0) + ; - // Check downstream federates to see whether they should now be granted a TAG. - // To handle cycles, need to create a boolean array to keep - // track of which upstream federates have been visited. - bool* visited = (bool*)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. - notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); - free(visited); + // We can now safely close the socket. + close(my_fed->socket); // from unistd.h - LF_MUTEX_UNLOCK(&rti_mutex); - } + // Check downstream federates to see whether they should now be granted a TAG. + // To handle cycles, need to create a boolean array to keep + // track of which upstream federates have been visited. + bool* visited = (bool*)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. + notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); + free(visited); - void* federate_info_thread_TCP(void* fed) { - initialize_lf_thread_id(); - federate_info_t* my_fed = (federate_info_t*)fed; - - // Buffer for incoming messages. - // This does not constrain the message size because messages - // are forwarded piece by piece. - unsigned char buffer[FED_COM_BUFFER_SIZE]; - - // Listen for messages from the federate. - while (my_fed->enclave.state != NOT_CONNECTED) { - // Read no more than one byte to get the message type. - int read_failed = read_from_socket(my_fed->socket, 1, buffer); - if (read_failed) { - // Socket is closed - lf_print_error("RTI: Socket to federate %d is closed. Exiting the thread.", my_fed->enclave.id); - my_fed->enclave.state = NOT_CONNECTED; - my_fed->socket = -1; - // FIXME: We need better error handling here, but do not stop execution here. 
- break; - } - LF_PRINT_DEBUG("RTI: Received message type %u from federate %d.", buffer[0], my_fed->enclave.id); - switch (buffer[0]) { - case MSG_TYPE_TIMESTAMP: - handle_timestamp(my_fed); - break; - case MSG_TYPE_ADDRESS_QUERY: - handle_address_query(my_fed->enclave.id); - break; - case MSG_TYPE_ADDRESS_ADVERTISEMENT: - handle_address_ad(my_fed->enclave.id); - break; - case MSG_TYPE_TAGGED_MESSAGE: - handle_timed_message(my_fed, buffer); - break; - case MSG_TYPE_RESIGN: - handle_federate_resign(my_fed); - break; - case MSG_TYPE_NEXT_EVENT_TAG: - handle_next_event_tag(my_fed); - break; - case MSG_TYPE_LATEST_TAG_CONFIRMED: - handle_latest_tag_confirmed(my_fed); - break; - case MSG_TYPE_STOP_REQUEST: - handle_stop_request_message(my_fed); // FIXME: Reviewed until here. - // Need to also look at - // notify_advance_grant_if_safe() - // and notify_downstream_advance_grant_if_safe() - break; - case MSG_TYPE_STOP_REQUEST_REPLY: - handle_stop_request_reply(my_fed); - break; - case MSG_TYPE_PORT_ABSENT: - handle_port_absent_message(my_fed, buffer); - break; - case MSG_TYPE_FAILED: - handle_federate_failed(my_fed); - return NULL; - default: - lf_print_error("RTI received from federate %d an unrecognized TCP message type: %u.", my_fed->enclave.id, - buffer[0]); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_UNIDENTIFIED, my_fed->enclave.id, NULL); - } + LF_MUTEX_UNLOCK(&rti_mutex); + } + + void* federate_info_thread_TCP(void* fed) { + initialize_lf_thread_id(); + federate_info_t* my_fed = (federate_info_t*)fed; + + // Buffer for incoming messages. + // This does not constrain the message size because messages + // are forwarded piece by piece. + unsigned char buffer[FED_COM_BUFFER_SIZE]; + + // Listen for messages from the federate. + while (my_fed->enclave.state != NOT_CONNECTED) { + // Read no more than one byte to get the message type. 
+ int read_failed = read_from_socket(my_fed->socket, 1, buffer); + if (read_failed) { + // Socket is closed + lf_print_error("RTI: Socket to federate %d is closed. Exiting the thread.", my_fed->enclave.id); + my_fed->enclave.state = NOT_CONNECTED; + my_fed->socket = -1; + // FIXME: We need better error handling here, but do not stop execution here. + break; + } + LF_PRINT_DEBUG("RTI: Received message type %u from federate %d.", buffer[0], my_fed->enclave.id); + switch (buffer[0]) { + case MSG_TYPE_TIMESTAMP: + handle_timestamp(my_fed); + break; + case MSG_TYPE_ADDRESS_QUERY: + handle_address_query(my_fed->enclave.id); + break; + case MSG_TYPE_ADDRESS_ADVERTISEMENT: + handle_address_ad(my_fed->enclave.id); + break; + case MSG_TYPE_TAGGED_MESSAGE: + handle_timed_message(my_fed, buffer); + break; + case MSG_TYPE_RESIGN: + handle_federate_resign(my_fed); + break; + case MSG_TYPE_NEXT_EVENT_TAG: + handle_next_event_tag(my_fed); + break; + case MSG_TYPE_LATEST_TAG_CONFIRMED: + handle_latest_tag_confirmed(my_fed); + break; + case MSG_TYPE_STOP_REQUEST: + handle_stop_request_message(my_fed); // FIXME: Reviewed until here. + // Need to also look at + // notify_advance_grant_if_safe() + // and notify_downstream_advance_grant_if_safe() + break; + case MSG_TYPE_STOP_REQUEST_REPLY: + handle_stop_request_reply(my_fed); + break; + case MSG_TYPE_PORT_ABSENT: + handle_port_absent_message(my_fed, buffer); + break; + case MSG_TYPE_FAILED: + handle_federate_failed(my_fed); + return NULL; + default: + lf_print_error("RTI received from federate %d an unrecognized TCP message type: %u.", my_fed->enclave.id, + buffer[0]); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_UNIDENTIFIED, my_fed->enclave.id, NULL); } } + } - // Nothing more to do. Close the socket and exit. - // Prevent multiple threads from closing the same socket at the same time. 
- LF_MUTEX_LOCK(&rti_mutex); - close(my_fed->socket); // from unistd.h - // Manual clean, in case of a transient federate - if (my_fed->is_transient) { - // FIXME: Aren't there transit messages anymore??? - // free_in_transit_message_q(my_fed->in_transit_message_tags); - lf_print("RTI: Transient Federate %d thread exited.", my_fed->enclave.id); - - // Update the number of connected transient federates - rti_remote->number_of_connected_transient_federates--; - - // Reset the status of the leaving federate - reset_transient_federate(my_fed); - } - // Signal the hot swap mechanism, if needed - if (hot_swap_in_progress && hot_swap_federate->enclave.id == my_fed->enclave.id) { - hot_swap_old_resigned = true; - } - LF_MUTEX_UNLOCK(&rti_mutex); - return NULL; + // Nothing more to do. Close the socket and exit. + // Prevent multiple threads from closing the same socket at the same time. + LF_MUTEX_LOCK(&rti_mutex); + close(my_fed->socket); // from unistd.h + // Manual clean, in case of a transient federate + if (my_fed->is_transient) { + // FIXME: Aren't there transit messages anymore??? + // free_in_transit_message_q(my_fed->in_transit_message_tags); + lf_print("RTI: Transient Federate %d thread exited.", my_fed->enclave.id); + + // Update the number of connected transient federates + rti_remote->number_of_connected_transient_federates--; + + // Reset the status of the leaving federate + reset_transient_federate(my_fed); + } + // Signal the hot swap mechanism, if needed + if (hot_swap_in_progress && hot_swap_federate->enclave.id == my_fed->enclave.id) { + hot_swap_old_resigned = true; } + LF_MUTEX_UNLOCK(&rti_mutex); + return NULL; + } - void send_reject(int* socket_id, unsigned char error_code) { - LF_PRINT_DEBUG("RTI sending MSG_TYPE_REJECT."); - unsigned char response[2]; - response[0] = MSG_TYPE_REJECT; - response[1] = error_code; - LF_MUTEX_LOCK(&rti_mutex); - // NOTE: Ignore errors on this response. 
- if (write_to_socket(*socket_id, 2, response)) { - lf_print_warning("RTI failed to write MSG_TYPE_REJECT message on the socket."); - } - // Close the socket. - shutdown(*socket_id, SHUT_RDWR); - close(*socket_id); - *socket_id = -1; - LF_MUTEX_UNLOCK(&rti_mutex); + void send_reject(int* socket_id, unsigned char error_code) { + LF_PRINT_DEBUG("RTI sending MSG_TYPE_REJECT."); + unsigned char response[2]; + response[0] = MSG_TYPE_REJECT; + response[1] = error_code; + LF_MUTEX_LOCK(&rti_mutex); + // NOTE: Ignore errors on this response. + if (write_to_socket(*socket_id, 2, response)) { + lf_print_warning("RTI failed to write MSG_TYPE_REJECT message on the socket."); } - lf_print("handle_timestamp for transient 1157"); - - /** - * Listen for a MSG_TYPE_FED_IDS message, which includes as a payload - * a federate ID and a federation ID. If the federation ID - * matches this federation, send an MSG_TYPE_ACK and otherwise send - * a MSG_TYPE_REJECT message. - * @param socket_id Pointer to the socket on which to listen. - * @param client_fd The socket address. - * @return The federate ID for success or -1 for failure. - */ - static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_in* client_fd) { - // Buffer for message ID, federate ID, type (persistent or transient), and federation ID length. - size_t length = 1 + sizeof(uint16_t) + 1 + 1; // Message ID, federate ID, length of fedration ID, type. - unsigned char buffer[length]; - - // Read bytes from the socket. We need 4 bytes. - if (read_from_socket_close_on_error(socket_id, length, buffer)) { - lf_print_error("RTI failed to read from accepted socket."); - return -1; - } + // Close the socket. + shutdown(*socket_id, SHUT_RDWR); + close(*socket_id); + *socket_id = -1; + LF_MUTEX_UNLOCK(&rti_mutex); + } + lf_print("handle_timestamp for transient 1157"); - uint16_t fed_id = rti_remote->base.number_of_scheduling_nodes; // Initialize to an invalid value. 
- bool is_transient = false; + /** + * Listen for a MSG_TYPE_FED_IDS message, which includes as a payload + * a federate ID and a federation ID. If the federation ID + * matches this federation, send an MSG_TYPE_ACK and otherwise send + * a MSG_TYPE_REJECT message. + * @param socket_id Pointer to the socket on which to listen. + * @param client_fd The socket address. + * @return The federate ID for success or -1 for failure. + */ + static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_in* client_fd) { + // Buffer for message ID, federate ID, type (persistent or transient), and federation ID length. + size_t length = 1 + sizeof(uint16_t) + 1 + 1; // Message ID, federate ID, length of fedration ID, type. + unsigned char buffer[length]; + + // Read bytes from the socket. We need 4 bytes. + if (read_from_socket_close_on_error(socket_id, length, buffer)) { + lf_print_error("RTI failed to read from accepted socket."); + return -1; + } - // First byte received is the message type. - if (buffer[0] != MSG_TYPE_FED_IDS) { - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); - } - if (buffer[0] == MSG_TYPE_P2P_SENDING_FED_ID || buffer[0] == MSG_TYPE_P2P_TAGGED_MESSAGE) { - // The federate is trying to connect to a peer, not to the RTI. - // It has connected to the RTI instead. - // FIXME: This should not happen, but apparently has been observed. - // It should not happen because the peers get the port and IP address - // of the peer they want to connect to from the RTI. - // If the connection is a peer-to-peer connection between two - // federates, reject the connection with the WRONG_SERVER error. 
- send_reject(socket_id, WRONG_SERVER); - } else if (buffer[0] == MSG_TYPE_FED_NONCE) { - send_reject(socket_id, RTI_NOT_EXECUTED_WITH_AUTH); - lf_print_error("RTI not executed with HMAC authentication option using -a or --auth."); - } else { - send_reject(socket_id, UNEXPECTED_MESSAGE); - } - lf_print_error("RTI expected a MSG_TYPE_FED_IDS message. Got %u (see net_common.h).", buffer[0]); - return -1; - } else { - // Received federate ID. - fed_id = extract_uint16(buffer + 1); - is_transient = (buffer[sizeof(uint16_t) + 1] == 1) ? true : false; - if (is_transient) { - LF_PRINT_LOG("RTI received federate ID: %d, which is transient.", fed_id); - } else { - LF_PRINT_LOG("RTI received federate ID: %d, which is persistent.", fed_id); - } + uint16_t fed_id = rti_remote->base.number_of_scheduling_nodes; // Initialize to an invalid value. + bool is_transient = false; - // Read the federation ID. First read the length, which is one byte. - size_t federation_id_length = (size_t)buffer[sizeof(uint16_t) + 2]; - char federation_id_received[federation_id_length + 1]; // One extra for null terminator. - // Next read the actual federation ID. - if (read_from_socket_close_on_error(socket_id, federation_id_length, (unsigned char*)federation_id_received)) { - lf_print_error("RTI failed to read federation id from federate %d.", fed_id); - return -1; - } + // First byte received is the message type. + if (buffer[0] != MSG_TYPE_FED_IDS) { + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); + } + if (buffer[0] == MSG_TYPE_P2P_SENDING_FED_ID || buffer[0] == MSG_TYPE_P2P_TAGGED_MESSAGE) { + // The federate is trying to connect to a peer, not to the RTI. + // It has connected to the RTI instead. + // FIXME: This should not happen, but apparently has been observed. + // It should not happen because the peers get the port and IP address + // of the peer they want to connect to from the RTI. 
+ // If the connection is a peer-to-peer connection between two + // federates, reject the connection with the WRONG_SERVER error. + send_reject(socket_id, WRONG_SERVER); + } else if (buffer[0] == MSG_TYPE_FED_NONCE) { + send_reject(socket_id, RTI_NOT_EXECUTED_WITH_AUTH); + lf_print_error("RTI not executed with HMAC authentication option using -a or --auth."); + } else { + send_reject(socket_id, UNEXPECTED_MESSAGE); + } + lf_print_error("RTI expected a MSG_TYPE_FED_IDS message. Got %u (see net_common.h).", buffer[0]); + return -1; + } else { + // Received federate ID. + fed_id = extract_uint16(buffer + 1); + is_transient = (buffer[sizeof(uint16_t) + 1] == 1) ? true : false; + if (is_transient) { + LF_PRINT_LOG("RTI received federate ID: %d, which is transient.", fed_id); + } else { + LF_PRINT_LOG("RTI received federate ID: %d, which is persistent.", fed_id); + } - // Terminate the string with a null. - federation_id_received[federation_id_length] = 0; + // Read the federation ID. First read the length, which is one byte. + size_t federation_id_length = (size_t)buffer[sizeof(uint16_t) + 2]; + char federation_id_received[federation_id_length + 1]; // One extra for null terminator. + // Next read the actual federation ID. + if (read_from_socket_close_on_error(socket_id, federation_id_length, (unsigned char*)federation_id_received)) { + lf_print_error("RTI failed to read federation id from federate %d.", fed_id); + return -1; + } - LF_PRINT_DEBUG("RTI received federation ID: %s.", federation_id_received); + // Terminate the string with a null. + federation_id_received[federation_id_length] = 0; + LF_PRINT_DEBUG("RTI received federation ID: %s.", federation_id_received); + + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(rti_remote->base.trace, receive_FED_ID, fed_id, NULL); + } + // Compare the received federation ID to mine. 
+ if (strncmp(rti_remote->federation_id, federation_id_received, federation_id_length) != 0) { + // Federation IDs do not match. Send back a MSG_TYPE_REJECT message. + lf_print_warning("Federate from another federation %s attempted to connect to RTI in federation %s.", + federation_id_received, rti_remote->federation_id); if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(rti_remote->base.trace, receive_FED_ID, fed_id, NULL); + tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); } - // Compare the received federation ID to mine. - if (strncmp(rti_remote->federation_id, federation_id_received, federation_id_length) != 0) { - // Federation IDs do not match. Send back a MSG_TYPE_REJECT message. - lf_print_warning("Federate from another federation %s attempted to connect to RTI in federation %s.", - federation_id_received, rti_remote->federation_id); + send_reject(socket_id, FEDERATION_ID_DOES_NOT_MATCH); + return -1; + } else { + if (fed_id >= rti_remote->base.number_of_scheduling_nodes) { + // Federate ID is out of range. + lf_print_error("RTI received federate ID %d, which is out of range.", fed_id); if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); } - send_reject(socket_id, FEDERATION_ID_DOES_NOT_MATCH); + send_reject(socket_id, FEDERATE_ID_OUT_OF_RANGE); return -1; } else { - if (fed_id >= rti_remote->base.number_of_scheduling_nodes) { - // Federate ID is out of range. - lf_print_error("RTI received federate ID %d, which is out of range.", fed_id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); - } - send_reject(socket_id, FEDERATE_ID_OUT_OF_RANGE); - return -1; - } else { - // Find out if it is a new connection or a hot swap. 
- // Reject if: - // - duplicate of a connected persistent federate - // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that - // particular federate - // - or it is a hot swap, but it is not the execution phase yet - // Find out if it is a new connection or a hot swap. - // Reject if: - // - duplicate of a connected persistent federate - // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that - // particular federate - // - or it is a hot swap, but it is not the execution phase yet - if ((rti_remote->base.scheduling_nodes[fed_id])->state != NOT_CONNECTED) { - if (!is_transient) { - lf_print_error("RTI received duplicate federate ID: %d.", fed_id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); - } - send_reject(socket_id, FEDERATE_ID_IN_USE); - return -1; - } else if (hot_swap_in_progress || rti_remote->phase != execution_phase) { - lf_print_warning("RTI rejects the connection of transient federate %d, \ + // Find out if it is a new connection or a hot swap. + // Reject if: + // - duplicate of a connected persistent federate + // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that + // particular federate + // - or it is a hot swap, but it is not the execution phase yet + // Find out if it is a new connection or a hot swap. 
+ // Reject if: + // - duplicate of a connected persistent federate + // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that + // particular federate + // - or it is a hot swap, but it is not the execution phase yet + if ((rti_remote->base.scheduling_nodes[fed_id])->state != NOT_CONNECTED) { + if (!is_transient) { + lf_print_error("RTI received duplicate federate ID: %d.", fed_id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + } + send_reject(socket_id, FEDERATE_ID_IN_USE); + return -1; + } else if (hot_swap_in_progress || rti_remote->phase != execution_phase) { + lf_print_warning("RTI rejects the connection of transient federate %d, \ because a hot swap is already in progress for federate %d. \n\ Only one hot swap operation is allowed at a time.", - fed_id, hot_swap_federate->enclave.id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); - } - send_reject(socket_id, FEDERATE_ID_IN_USE); - return -1; + fed_id, hot_swap_federate->enclave.id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); } + send_reject(socket_id, FEDERATE_ID_IN_USE); + return -1; } } } } + } - federate_info_t* fed_twin = GET_FED_INFO(fed_id); - federate_info_t* fed; - // If the federate is already connected (making the request a duplicate), and that - // the federate is transient, and it is the execution phase, then mark that a hot - // swap is in progreass and initialize the hot_swap_federate. 
- // Otherwise, proceed with a normal transinet connection - if (fed_twin->enclave.state != NOT_CONNECTED && is_transient && fed_twin->is_transient && - rti_remote->phase == execution_phase && !hot_swap_in_progress) { - // Allocate memory for the new federate and initilize it - hot_swap_federate = (federate_info_t*)malloc(sizeof(federate_info_t)); - initialize_federate(hot_swap_federate, fed_id); - - // Set that hot swap is in progress - hot_swap_in_progress = true; - // free(fed); // Free the old memory to prevent memory leak - fed = hot_swap_federate; - lf_print("RTI: Hot Swap starting for federate %d.", fed_id); - } else { - fed = fed_twin; - fed->is_transient = is_transient; - } + federate_info_t* fed_twin = GET_FED_INFO(fed_id); + federate_info_t* fed; + // If the federate is already connected (making the request a duplicate), and that + // the federate is transient, and it is the execution phase, then mark that a hot + // swap is in progreass and initialize the hot_swap_federate. + // Otherwise, proceed with a normal transinet connection + if (fed_twin->enclave.state != NOT_CONNECTED && is_transient && fed_twin->is_transient && + rti_remote->phase == execution_phase && !hot_swap_in_progress) { + // Allocate memory for the new federate and initilize it + hot_swap_federate = (federate_info_t*)malloc(sizeof(federate_info_t)); + initialize_federate(hot_swap_federate, fed_id); + + // Set that hot swap is in progress + hot_swap_in_progress = true; + // free(fed); // Free the old memory to prevent memory leak + fed = hot_swap_federate; + lf_print("RTI: Hot Swap starting for federate %d.", fed_id); + } else { + fed = fed_twin; + fed->is_transient = is_transient; + } - // The MSG_TYPE_FED_IDS message has the right federation ID. + // The MSG_TYPE_FED_IDS message has the right federation ID. - // Get the peer address from the connected socket_id. Then assign it as the federate's socket server. 
- struct sockaddr_in peer_addr; - socklen_t addr_len = sizeof(peer_addr); - if (getpeername(*socket_id, (struct sockaddr*)&peer_addr, &addr_len) != 0) { - lf_print_error("RTI failed to get peer address."); - } - fed->server_ip_addr = peer_addr.sin_addr; + // Get the peer address from the connected socket_id. Then assign it as the federate's socket server. + struct sockaddr_in peer_addr; + socklen_t addr_len = sizeof(peer_addr); + if (getpeername(*socket_id, (struct sockaddr*)&peer_addr, &addr_len) != 0) { + lf_print_error("RTI failed to get peer address."); + } + fed->server_ip_addr = peer_addr.sin_addr; #if LOG_LEVEL >= LOG_LEVEL_DEBUG - // Create the human readable format and copy that into - // the .server_hostname field of the federate. - char str[INET_ADDRSTRLEN + 1]; - inet_ntop(AF_INET, &fed->server_ip_addr, str, INET_ADDRSTRLEN); - strncpy(fed->server_hostname, str, INET_ADDRSTRLEN); + // Create the human readable format and copy that into + // the .server_hostname field of the federate. + char str[INET_ADDRSTRLEN + 1]; + inet_ntop(AF_INET, &fed->server_ip_addr, str, INET_ADDRSTRLEN); + strncpy(fed->server_hostname, str, INET_ADDRSTRLEN); - LF_PRINT_DEBUG("RTI got address %s from federate %d.", fed->server_hostname, fed_id); + LF_PRINT_DEBUG("RTI got address %s from federate %d.", fed->server_hostname, fed_id); #endif - fed->socket = *socket_id; + fed->socket = *socket_id; - // Set the federate's state as pending - // because it is waiting for the start time to be - // sent by the RTI before beginning its execution. - fed->enclave.state = PENDING; + // Set the federate's state as pending + // because it is waiting for the start time to be + // sent by the RTI before beginning its execution. + fed->enclave.state = PENDING; - LF_PRINT_DEBUG("RTI responding with MSG_TYPE_ACK to federate %d.", fed_id); - // Send an MSG_TYPE_ACK message. 
- unsigned char ack_message = MSG_TYPE_ACK; - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_ACK, fed_id, NULL); - } - LF_MUTEX_LOCK(&rti_mutex); - if (write_to_socket_close_on_error(&fed->socket, 1, &ack_message)) { - LF_MUTEX_UNLOCK(&rti_mutex); - lf_print_error("RTI failed to write MSG_TYPE_ACK message to federate %d.", fed_id); - return -1; - } + LF_PRINT_DEBUG("RTI responding with MSG_TYPE_ACK to federate %d.", fed_id); + // Send an MSG_TYPE_ACK message. + unsigned char ack_message = MSG_TYPE_ACK; + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_ACK, fed_id, NULL); + } + LF_MUTEX_LOCK(&rti_mutex); + if (write_to_socket_close_on_error(&fed->socket, 1, &ack_message)) { LF_MUTEX_UNLOCK(&rti_mutex); + lf_print_error("RTI failed to write MSG_TYPE_ACK message to federate %d.", fed_id); + return -1; + } + LF_MUTEX_UNLOCK(&rti_mutex); - LF_PRINT_DEBUG("RTI sent MSG_TYPE_ACK to federate %d.", fed_id); - - return (int32_t)fed_id; - } - - /** - * Listen for a MSG_TYPE_NEIGHBOR_STRUCTURE message, and upon receiving it, fill - * out the relevant information in the federate's struct. - * - * In case of a hot swap, check that no changes were made to the connections, compared - * to the first instance that joigned. This means that the first instance to join - * __is__ the reference. - * - * @return 1 on success and 0 on failure. 
- */ - static int receive_connection_information(int* socket_id, uint16_t fed_id) { - LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_NEIGHBOR_STRUCTURE from federate %d.", fed_id); - unsigned char connection_info_header[MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE]; - read_from_socket_fail_on_error(socket_id, MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE, connection_info_header, NULL, - "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message header from federate %d.", - fed_id); - - if (connection_info_header[0] != MSG_TYPE_NEIGHBOR_STRUCTURE) { - lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " - "Rejecting federate.", - fed_id, connection_info_header[0]); - send_reject(socket_id, UNEXPECTED_MESSAGE); - return 0; - } else { - // In case of a transient federate that is joining again, or a hot swap, then - // check that the connection information did not change. - federate_info_t* fed = GET_FED_INFO(fed_id); - federate_info_t* temp_fed = NULL; - if (lf_tag_compare(fed->effective_start_tag, NEVER_TAG) != 0) { - if (hot_swap_in_progress) { - fed = hot_swap_federate; - } else { - temp_fed = (federate_info_t*)calloc(1, sizeof(federate_info_t)); - initialize_federate(temp_fed, fed_id); - fed = temp_fed; - } - } - // Read the number of upstream and downstream connections - fed->enclave.num_upstream = extract_int32(&(connection_info_header[1])); - fed->enclave.num_downstream = extract_int32(&(connection_info_header[1 + sizeof(int32_t)])); - LF_PRINT_DEBUG("RTI got %d upstreams and %d downstreams from federate %d.", fed->enclave.num_upstream, - fed->enclave.num_downstream, fed_id); - - // Allocate memory for the upstream and downstream pointers - if (fed->enclave.num_upstream > 0) { - fed->enclave.upstream = (uint16_t*)malloc(sizeof(uint16_t) * fed->enclave.num_upstream); - LF_ASSERT_NON_NULL(fed->enclave.upstream); - // Allocate memory for the upstream delay pointers - fed->enclave.upstream_delay = (interval_t*)malloc(sizeof(interval_t) * 
fed->enclave.num_upstream); - LF_ASSERT_NON_NULL(fed->enclave.upstream_delay); + LF_PRINT_DEBUG("RTI sent MSG_TYPE_ACK to federate %d.", fed_id); + + return (int32_t)fed_id; + } + + /** + * Listen for a MSG_TYPE_NEIGHBOR_STRUCTURE message, and upon receiving it, fill + * out the relevant information in the federate's struct. + * + * In case of a hot swap, check that no changes were made to the connections, compared + * to the first instance that joigned. This means that the first instance to join + * __is__ the reference. + * + * @return 1 on success and 0 on failure. + */ + static int receive_connection_information(int* socket_id, uint16_t fed_id) { + LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_NEIGHBOR_STRUCTURE from federate %d.", fed_id); + unsigned char connection_info_header[MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE]; + read_from_socket_fail_on_error(socket_id, MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE, connection_info_header, NULL, + "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message header from federate %d.", + fed_id); + + if (connection_info_header[0] != MSG_TYPE_NEIGHBOR_STRUCTURE) { + lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " + "Rejecting federate.", + fed_id, connection_info_header[0]); + send_reject(socket_id, UNEXPECTED_MESSAGE); + return 0; + } else { + // In case of a transient federate that is joining again, or a hot swap, then + // check that the connection information did not change. 
+ federate_info_t* fed = GET_FED_INFO(fed_id); + federate_info_t* temp_fed = NULL; + if (lf_tag_compare(fed->effective_start_tag, NEVER_TAG) != 0) { + if (hot_swap_in_progress) { + fed = hot_swap_federate; } else { - fed->enclave.upstream = (uint16_t*)NULL; - fed->enclave.upstream_delay = (interval_t*)NULL; + temp_fed = (federate_info_t*)calloc(1, sizeof(federate_info_t)); + initialize_federate(temp_fed, fed_id); + fed = temp_fed; } - if (fed->enclave.num_downstream > 0) { - fed->enclave.downstream = (uint16_t*)malloc(sizeof(uint16_t) * fed->enclave.num_downstream); - LF_ASSERT_NON_NULL(fed->enclave.downstream); - } else { - fed->enclave.downstream = (uint16_t*)NULL; + } + // Read the number of upstream and downstream connections + fed->enclave.num_upstream = extract_int32(&(connection_info_header[1])); + fed->enclave.num_downstream = extract_int32(&(connection_info_header[1 + sizeof(int32_t)])); + LF_PRINT_DEBUG("RTI got %d upstreams and %d downstreams from federate %d.", fed->enclave.num_upstream, + fed->enclave.num_downstream, fed_id); + + // Allocate memory for the upstream and downstream pointers + if (fed->enclave.num_upstream > 0) { + fed->enclave.upstream = (uint16_t*)malloc(sizeof(uint16_t) * fed->enclave.num_upstream); + LF_ASSERT_NON_NULL(fed->enclave.upstream); + // Allocate memory for the upstream delay pointers + fed->enclave.upstream_delay = (interval_t*)malloc(sizeof(interval_t) * fed->enclave.num_upstream); + LF_ASSERT_NON_NULL(fed->enclave.upstream_delay); + } else { + fed->enclave.upstream = (uint16_t*)NULL; + fed->enclave.upstream_delay = (interval_t*)NULL; + } + if (fed->enclave.num_downstream > 0) { + fed->enclave.downstream = (uint16_t*)malloc(sizeof(uint16_t) * fed->enclave.num_downstream); + LF_ASSERT_NON_NULL(fed->enclave.downstream); + } else { + fed->enclave.downstream = (uint16_t*)NULL; + } + + size_t connections_info_body_size = ((sizeof(uint16_t) + sizeof(int64_t)) * fed->enclave.num_upstream) + + (sizeof(uint16_t) * 
fed->enclave.num_downstream); + unsigned char* connections_info_body = NULL; + if (connections_info_body_size > 0) { + connections_info_body = (unsigned char*)malloc(connections_info_body_size); + LF_ASSERT_NON_NULL(connections_info_body); + read_from_socket_fail_on_error(socket_id, connections_info_body_size, connections_info_body, NULL, + "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message body from federate %d.", + fed_id); + // Keep track of where we are in the buffer + size_t message_head = 0; + // First, read the info about upstream federates + for (int i = 0; i < fed->enclave.num_upstream; i++) { + fed->enclave.upstream[i] = extract_uint16(&(connections_info_body[message_head])); + message_head += sizeof(uint16_t); + fed->enclave.upstream_delay[i] = extract_int64(&(connections_info_body[message_head])); + message_head += sizeof(int64_t); } - size_t connections_info_body_size = ((sizeof(uint16_t) + sizeof(int64_t)) * fed->enclave.num_upstream) + - (sizeof(uint16_t) * fed->enclave.num_downstream); - unsigned char* connections_info_body = NULL; - if (connections_info_body_size > 0) { - connections_info_body = (unsigned char*)malloc(connections_info_body_size); - LF_ASSERT_NON_NULL(connections_info_body); - read_from_socket_fail_on_error( - socket_id, connections_info_body_size, connections_info_body, NULL, - "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message body from federate %d.", fed_id); - // Keep track of where we are in the buffer - size_t message_head = 0; - // First, read the info about upstream federates - for (int i = 0; i < fed->enclave.num_upstream; i++) { - fed->enclave.upstream[i] = extract_uint16(&(connections_info_body[message_head])); - message_head += sizeof(uint16_t); - fed->enclave.upstream_delay[i] = extract_int64(&(connections_info_body[message_head])); - message_head += sizeof(int64_t); - } + // Next, read the info about downstream federates + for (int i = 0; i < fed->enclave.num_downstream; i++) { + fed->enclave.downstream[i] = 
extract_uint16(&(connections_info_body[message_head])); + message_head += sizeof(uint16_t); + } - // Next, read the info about downstream federates - for (int i = 0; i < fed->enclave.num_downstream; i++) { - fed->enclave.downstream[i] = extract_uint16(&(connections_info_body[message_head])); - message_head += sizeof(uint16_t); - } + free(connections_info_body); + } - free(connections_info_body); + // NOTE: In this design, changes in the connections are not allowed. This means that the first + // instance to join __is__ the reference. If this policy is to be changed, then it is in + // the following lines will be updated accordingly. + if (hot_swap_in_progress || temp_fed != NULL) { + if (temp_fed == NULL) { + temp_fed = hot_swap_federate; } - - // NOTE: In this design, changes in the connections are not allowed. This means that the first - // instance to join __is__ the reference. If this policy is to be changed, then it is in - // the following lines will be updated accordingly. - if (hot_swap_in_progress || temp_fed != NULL) { - if (temp_fed == NULL) { - temp_fed = hot_swap_federate; + // Now, compare the previous and the new neighberhood structure + // Start with the number of upstreams and downstreams + bool reject = false; + if ((fed->enclave.num_upstream != temp_fed->enclave.num_upstream) || + (fed->enclave.num_downstream != temp_fed->enclave.num_downstream)) { + reject = true; + } else { + // Then check all upstreams and their delays + for (int i = 0; i < fed->enclave.num_upstream; i++) { + if ((fed->enclave.upstream[i] != temp_fed->enclave.upstream[i]) || + (fed->enclave.upstream_delay[i] != temp_fed->enclave.upstream_delay[i])) { + reject = true; + break; + } } - // Now, compare the previous and the new neighberhood structure - // Start with the number of upstreams and downstreams - bool reject = false; - if ((fed->enclave.num_upstream != temp_fed->enclave.num_upstream) || - (fed->enclave.num_downstream != temp_fed->enclave.num_downstream)) { - reject = 
true; - } else { - // Then check all upstreams and their delays - for (int i = 0; i < fed->enclave.num_upstream; i++) { - if ((fed->enclave.upstream[i] != temp_fed->enclave.upstream[i]) || - (fed->enclave.upstream_delay[i] != temp_fed->enclave.upstream_delay[i])) { + if (!reject) { + // Finally, check all downstream federates + for (int i = 0; i < fed->enclave.num_downstream; i++) { + if (fed->enclave.downstream[i] != temp_fed->enclave.downstream[i]) { reject = true; break; } } - if (!reject) { - // Finally, check all downstream federates - for (int i = 0; i < fed->enclave.num_downstream; i++) { - if (fed->enclave.downstream[i] != temp_fed->enclave.downstream[i]) { - reject = true; - break; - } - } - } } - if (reject) { - if (temp_fed != hot_swap_federate) { - free(temp_fed); - } - return 0; + } + if (reject) { + if (temp_fed != hot_swap_federate) { + free(temp_fed); } + return 0; } } - LF_PRINT_DEBUG("RTI received neighbor structure from federate %d.", fed_id); - return 1; } + LF_PRINT_DEBUG("RTI received neighbor structure from federate %d.", fed_id); + return 1; + } - /** - * Listen for a MSG_TYPE_UDP_PORT message, and upon receiving it, set up - * clock synchronization and perform the initial clock synchronization. - * Initial clock synchronization is performed only if the MSG_TYPE_UDP_PORT message - * payload is not UINT16_MAX. If it is also not 0, then this function sets - * up to perform runtime clock synchronization using the UDP port number - * specified in the payload to communicate with the federate's clock - * synchronization logic. - * @param socket_id The socket on which to listen. - * @param fed_id The federate ID. - * @return 1 for success, 0 for failure. - */ - static int receive_udp_message_and_set_up_clock_sync(int* socket_id, uint16_t fed_id) { - // Read the MSG_TYPE_UDP_PORT message from the federate regardless of the status of - // clock synchronization. 
This message will tell the RTI whether the federate - // is doing clock synchronization, and if it is, what port to use for UDP. - LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_UDP_PORT from federate %d.", fed_id); - unsigned char response[1 + sizeof(uint16_t)]; - read_from_socket_fail_on_error(socket_id, 1 + sizeof(uint16_t), response, NULL, - "RTI failed to read MSG_TYPE_UDP_PORT message from federate %d.", fed_id); - if (response[0] != MSG_TYPE_UDP_PORT) { - lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " - "Rejecting federate.", - fed_id, response[0]); - send_reject(socket_id, UNEXPECTED_MESSAGE); - return 0; + /** + * Listen for a MSG_TYPE_UDP_PORT message, and upon receiving it, set up + * clock synchronization and perform the initial clock synchronization. + * Initial clock synchronization is performed only if the MSG_TYPE_UDP_PORT message + * payload is not UINT16_MAX. If it is also not 0, then this function sets + * up to perform runtime clock synchronization using the UDP port number + * specified in the payload to communicate with the federate's clock + * synchronization logic. + * @param socket_id The socket on which to listen. + * @param fed_id The federate ID. + * @return 1 for success, 0 for failure. + */ + static int receive_udp_message_and_set_up_clock_sync(int* socket_id, uint16_t fed_id) { + // Read the MSG_TYPE_UDP_PORT message from the federate regardless of the status of + // clock synchronization. This message will tell the RTI whether the federate + // is doing clock synchronization, and if it is, what port to use for UDP. 
+ LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_UDP_PORT from federate %d.", fed_id); + unsigned char response[1 + sizeof(uint16_t)]; + read_from_socket_fail_on_error(socket_id, 1 + sizeof(uint16_t), response, NULL, + "RTI failed to read MSG_TYPE_UDP_PORT message from federate %d.", fed_id); + if (response[0] != MSG_TYPE_UDP_PORT) { + lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " + "Rejecting federate.", + fed_id, response[0]); + send_reject(socket_id, UNEXPECTED_MESSAGE); + return 0; + } else { + federate_info_t* fed; + if (hot_swap_in_progress) { + fed = hot_swap_federate; } else { - federate_info_t* fed; - if (hot_swap_in_progress) { - fed = hot_swap_federate; - } else { - fed = GET_FED_INFO(fed_id); - } - if (rti_remote->clock_sync_global_status >= clock_sync_init) { - // If no initial clock sync, no need perform initial clock sync. - uint16_t federate_UDP_port_number = extract_uint16(&(response[1])); - - LF_PRINT_DEBUG("RTI got MSG_TYPE_UDP_PORT %u from federate %d.", federate_UDP_port_number, fed_id); - - // A port number of UINT16_MAX means initial clock sync should not be performed. - if (federate_UDP_port_number != UINT16_MAX) { - // Perform the initialization clock synchronization with the federate. - // Send the required number of messages for clock synchronization - for (int i = 0; i < rti_remote->clock_sync_exchanges_per_interval; i++) { - // Send the RTI's current physical time T1 to the federate. - send_physical_clock(MSG_TYPE_CLOCK_SYNC_T1, fed, TCP); - - // Listen for reply message, which should be T3. 
- size_t message_size = 1 + sizeof(uint16_t); - unsigned char buffer[message_size]; - read_from_socket_fail_on_error(socket_id, message_size, buffer, NULL, - "Socket to federate %d unexpectedly closed.", fed_id); - if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { - uint16_t fed_id = extract_uint16(&(buffer[1])); - LF_PRINT_DEBUG("RTI received T3 clock sync message from federate %d.", fed_id); - handle_physical_clock_sync_message(fed, TCP); - } else { - lf_print_error("Unexpected message %u from federate %d.", buffer[0], fed_id); - send_reject(socket_id, UNEXPECTED_MESSAGE); - return 0; - } + fed = GET_FED_INFO(fed_id); + } + if (rti_remote->clock_sync_global_status >= clock_sync_init) { + // If no initial clock sync, no need perform initial clock sync. + uint16_t federate_UDP_port_number = extract_uint16(&(response[1])); + + LF_PRINT_DEBUG("RTI got MSG_TYPE_UDP_PORT %u from federate %d.", federate_UDP_port_number, fed_id); + + // A port number of UINT16_MAX means initial clock sync should not be performed. + if (federate_UDP_port_number != UINT16_MAX) { + // Perform the initialization clock synchronization with the federate. + // Send the required number of messages for clock synchronization + for (int i = 0; i < rti_remote->clock_sync_exchanges_per_interval; i++) { + // Send the RTI's current physical time T1 to the federate. + send_physical_clock(MSG_TYPE_CLOCK_SYNC_T1, fed, TCP); + + // Listen for reply message, which should be T3. 
+ size_t message_size = 1 + sizeof(uint16_t); + unsigned char buffer[message_size]; + read_from_socket_fail_on_error(socket_id, message_size, buffer, NULL, + "Socket to federate %d unexpectedly closed.", fed_id); + if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { + uint16_t fed_id = extract_uint16(&(buffer[1])); + LF_PRINT_DEBUG("RTI received T3 clock sync message from federate %d.", fed_id); + handle_physical_clock_sync_message(fed, TCP); + } else { + lf_print_error("Unexpected message %u from federate %d.", buffer[0], fed_id); + send_reject(socket_id, UNEXPECTED_MESSAGE); + return 0; } - LF_PRINT_DEBUG("RTI finished initial clock synchronization with federate %d.", fed_id); } - if (rti_remote->clock_sync_global_status >= clock_sync_on) { - // If no runtime clock sync, no need to set up the UDP port. - if (federate_UDP_port_number > 0) { - // Initialize the UDP_addr field of the federate struct - fed->UDP_addr.sin_family = AF_INET; - fed->UDP_addr.sin_port = htons(federate_UDP_port_number); - fed->UDP_addr.sin_addr = fed->server_ip_addr; - } - } else { - // Disable clock sync after initial round. - fed->clock_synchronization_enabled = false; + LF_PRINT_DEBUG("RTI finished initial clock synchronization with federate %d.", fed_id); + } + if (rti_remote->clock_sync_global_status >= clock_sync_on) { + // If no runtime clock sync, no need to set up the UDP port. + if (federate_UDP_port_number > 0) { + // Initialize the UDP_addr field of the federate struct + fed->UDP_addr.sin_family = AF_INET; + fed->UDP_addr.sin_port = htons(federate_UDP_port_number); + fed->UDP_addr.sin_addr = fed->server_ip_addr; } } else { - // No clock synchronization at all. - LF_PRINT_DEBUG("RTI: No clock synchronization for federate %d.", fed_id); - // Clock synchronization is universally disabled via the clock-sync command-line parameter - // (-c off was passed to the RTI). - // Note that the federates are still going to send a - // MSG_TYPE_UDP_PORT message but with a payload (port) of -1. 
+ // Disable clock sync after initial round. fed->clock_synchronization_enabled = false; } + } else { + // No clock synchronization at all. + LF_PRINT_DEBUG("RTI: No clock synchronization for federate %d.", fed_id); + // Clock synchronization is universally disabled via the clock-sync command-line parameter + // (-c off was passed to the RTI). + // Note that the federates are still going to send a + // MSG_TYPE_UDP_PORT message but with a payload (port) of -1. + fed->clock_synchronization_enabled = false; } - return 1; } + return 1; + } #ifdef __RTI_AUTH__ - /** - * Authenticate incoming federate by performing HMAC-based authentication. - * - * @param socket Socket for the incoming federate tryting to authenticate. - * @return True if authentication is successful and false otherwise. - */ - static bool authenticate_federate(int* socket) { - // Wait for MSG_TYPE_FED_NONCE from federate. - size_t fed_id_length = sizeof(uint16_t); - unsigned char buffer[1 + fed_id_length + NONCE_LENGTH]; - read_from_socket_fail_on_error(socket, 1 + fed_id_length + NONCE_LENGTH, buffer, NULL, - "Failed to read MSG_TYPE_FED_NONCE"); - if (buffer[0] != MSG_TYPE_FED_NONCE) { - lf_print_error_and_exit("Received unexpected response %u from the FED (see net_common.h).", buffer[0]); - } - unsigned int hmac_length = SHA256_HMAC_LENGTH; - size_t federation_id_length = strnlen(rti_remote->federation_id, 255); - // HMAC tag is created with MSG_TYPE, federate ID, received federate nonce. 
- unsigned char mac_buf[1 + fed_id_length + NONCE_LENGTH]; - mac_buf[0] = MSG_TYPE_RTI_RESPONSE; - memcpy(&mac_buf[1], &buffer[1], fed_id_length); - memcpy(&mac_buf[1 + fed_id_length], &buffer[1 + fed_id_length], NONCE_LENGTH); - unsigned char hmac_tag[hmac_length]; - unsigned char* ret = HMAC(EVP_sha256(), rti_remote->federation_id, federation_id_length, mac_buf, - 1 + fed_id_length + NONCE_LENGTH, hmac_tag, &hmac_length); - if (ret == NULL) { - lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_RTI_RESPONSE."); - } - // Make buffer for message type, RTI's nonce, and HMAC tag. - unsigned char sender[1 + NONCE_LENGTH + hmac_length]; - sender[0] = MSG_TYPE_RTI_RESPONSE; - unsigned char rti_nonce[NONCE_LENGTH]; - RAND_bytes(rti_nonce, NONCE_LENGTH); - memcpy(&sender[1], rti_nonce, NONCE_LENGTH); - memcpy(&sender[1 + NONCE_LENGTH], hmac_tag, hmac_length); - if (write_to_socket(*socket, 1 + NONCE_LENGTH + hmac_length, sender)) { - lf_print_error("Failed to send nonce to federate."); - } + /** + * Authenticate incoming federate by performing HMAC-based authentication. + * + * @param socket Socket for the incoming federate tryting to authenticate. + * @return True if authentication is successful and false otherwise. + */ + static bool authenticate_federate(int* socket) { + // Wait for MSG_TYPE_FED_NONCE from federate. + size_t fed_id_length = sizeof(uint16_t); + unsigned char buffer[1 + fed_id_length + NONCE_LENGTH]; + read_from_socket_fail_on_error(socket, 1 + fed_id_length + NONCE_LENGTH, buffer, NULL, + "Failed to read MSG_TYPE_FED_NONCE"); + if (buffer[0] != MSG_TYPE_FED_NONCE) { + lf_print_error_and_exit("Received unexpected response %u from the FED (see net_common.h).", buffer[0]); + } + unsigned int hmac_length = SHA256_HMAC_LENGTH; + size_t federation_id_length = strnlen(rti_remote->federation_id, 255); + // HMAC tag is created with MSG_TYPE, federate ID, received federate nonce. 
+ unsigned char mac_buf[1 + fed_id_length + NONCE_LENGTH]; + mac_buf[0] = MSG_TYPE_RTI_RESPONSE; + memcpy(&mac_buf[1], &buffer[1], fed_id_length); + memcpy(&mac_buf[1 + fed_id_length], &buffer[1 + fed_id_length], NONCE_LENGTH); + unsigned char hmac_tag[hmac_length]; + unsigned char* ret = HMAC(EVP_sha256(), rti_remote->federation_id, federation_id_length, mac_buf, + 1 + fed_id_length + NONCE_LENGTH, hmac_tag, &hmac_length); + if (ret == NULL) { + lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_RTI_RESPONSE."); + } + // Make buffer for message type, RTI's nonce, and HMAC tag. + unsigned char sender[1 + NONCE_LENGTH + hmac_length]; + sender[0] = MSG_TYPE_RTI_RESPONSE; + unsigned char rti_nonce[NONCE_LENGTH]; + RAND_bytes(rti_nonce, NONCE_LENGTH); + memcpy(&sender[1], rti_nonce, NONCE_LENGTH); + memcpy(&sender[1 + NONCE_LENGTH], hmac_tag, hmac_length); + if (write_to_socket(*socket, 1 + NONCE_LENGTH + hmac_length, sender)) { + lf_print_error("Failed to send nonce to federate."); + } - // Wait for MSG_TYPE_FED_RESPONSE - unsigned char received[1 + hmac_length]; - read_from_socket_fail_on_error(socket, 1 + hmac_length, received, NULL, "Failed to read federate response."); - if (received[0] != MSG_TYPE_FED_RESPONSE) { - lf_print_error_and_exit("Received unexpected response %u from the federate (see net_common.h).", received[0]); - return false; - } - // HMAC tag is created with MSG_TYPE_FED_RESPONSE and RTI's nonce. - unsigned char mac_buf2[1 + NONCE_LENGTH]; - mac_buf2[0] = MSG_TYPE_FED_RESPONSE; - memcpy(&mac_buf2[1], rti_nonce, NONCE_LENGTH); - unsigned char rti_tag[hmac_length]; - ret = HMAC(EVP_sha256(), rti_remote->federation_id, federation_id_length, mac_buf2, 1 + NONCE_LENGTH, rti_tag, - &hmac_length); - if (ret == NULL) { - lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_FED_RESPONSE."); - } - // Compare received tag and created tag. - if (memcmp(&received[1], rti_tag, hmac_length) != 0) { - // Federation IDs do not match. 
Send back a HMAC_DOES_NOT_MATCH message. - lf_print_warning("HMAC authentication failed. Rejecting the federate."); - send_reject(socket, HMAC_DOES_NOT_MATCH); - return false; - } else { - LF_PRINT_LOG("Federate's HMAC verified."); - return true; - } + // Wait for MSG_TYPE_FED_RESPONSE + unsigned char received[1 + hmac_length]; + read_from_socket_fail_on_error(socket, 1 + hmac_length, received, NULL, "Failed to read federate response."); + if (received[0] != MSG_TYPE_FED_RESPONSE) { + lf_print_error_and_exit("Received unexpected response %u from the federate (see net_common.h).", received[0]); + return false; + } + // HMAC tag is created with MSG_TYPE_FED_RESPONSE and RTI's nonce. + unsigned char mac_buf2[1 + NONCE_LENGTH]; + mac_buf2[0] = MSG_TYPE_FED_RESPONSE; + memcpy(&mac_buf2[1], rti_nonce, NONCE_LENGTH); + unsigned char rti_tag[hmac_length]; + ret = HMAC(EVP_sha256(), rti_remote->federation_id, federation_id_length, mac_buf2, 1 + NONCE_LENGTH, rti_tag, + &hmac_length); + if (ret == NULL) { + lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_FED_RESPONSE."); } + // Compare received tag and created tag. + if (memcmp(&received[1], rti_tag, hmac_length) != 0) { + // Federation IDs do not match. Send back a HMAC_DOES_NOT_MATCH message. + lf_print_warning("HMAC authentication failed. Rejecting the federate."); + send_reject(socket, HMAC_DOES_NOT_MATCH); + return false; + } else { + LF_PRINT_LOG("Federate's HMAC verified."); + return true; + } + } #endif - // FIXME: The socket descriptor here (parameter) is not used. Should be removed? - void lf_connect_to_persistent_federates(int socket_descriptor) { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes - rti_remote.number_of_transient_federates; i++) { - // Wait for an incoming connection request. - struct sockaddr client_fd; - uint32_t client_length = sizeof(client_fd); - // The following blocks until a federate connects. 
- int socket_id = -1; - while (1) { - socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); - if (socket_id >= 0) { - // Got a socket - break; - } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { - lf_print_error_system_failure("RTI failed to accept the socket."); - } else { - // Try again - lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); - continue; - } + // FIXME: The socket descriptor here (parameter) is not used. Should be removed? + void lf_connect_to_persistent_federates(int socket_descriptor) { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes - rti_remote.number_of_transient_federates; i++) { + // Wait for an incoming connection request. + struct sockaddr client_fd; + uint32_t client_length = sizeof(client_fd); + // The following blocks until a federate connects. + int socket_id = -1; + while (1) { + socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); + if (socket_id >= 0) { + // Got a socket + break; + } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { + lf_print_error_system_failure("RTI failed to accept the socket."); + } else { + // Try again + lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); + continue; } + } // Wait for the first message from the federate when RTI -a option is on. #ifdef __RTI_AUTH__ - if (rti_remote->authentication_enabled) { - if (!authenticate_federate(&socket_id)) { - lf_print_warning("RTI failed to authenticate the incoming federate."); - // Close the socket. - shutdown(socket_id, SHUT_RDWR); - close(socket_id); - socket_id = -1; - // Ignore the federate that failed authentication. - i--; - continue; - } + if (rti_remote->authentication_enabled) { + if (!authenticate_federate(&socket_id)) { + lf_print_warning("RTI failed to authenticate the incoming federate."); + // Close the socket. 
+ shutdown(socket_id, SHUT_RDWR); + close(socket_id); + socket_id = -1; + // Ignore the federate that failed authentication. + i--; + continue; } + } #endif - // The first message from the federate should contain its ID and the federation ID. - int32_t fed_id = receive_and_check_fed_id_message(&socket_id); - if (fed_id >= 0 && socket_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && - receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { + // The first message from the federate should contain its ID and the federation ID. + int32_t fed_id = receive_and_check_fed_id_message(&socket_id); + if (fed_id >= 0 && socket_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && + receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { - // Create a thread to communicate with the federate. - // This has to be done after clock synchronization is finished - // or that thread may end up attempting to handle incoming clock - // synchronization messages. - federate_info_t* fed = GET_FED_INFO(fed_id); - lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. + federate_info_t* fed = GET_FED_INFO(fed_id); + lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); - // If the federate is transient, then do not count it. - if (fed->is_transient) { - rti_remote->number_of_connected_transient_federates++; - assert(rti_remote->number_of_connected_transient_federates <= rti_remote->number_of_transient_federates); - i--; - lf_print("RTI: Transient federate %d joined.", fed->enclave.id); - } - } else { - // Received message was rejected. Try again. + // If the federate is transient, then do not count it. 
+ if (fed->is_transient) { + rti_remote->number_of_connected_transient_federates++; + assert(rti_remote->number_of_connected_transient_federates <= rti_remote->number_of_transient_federates); i--; + lf_print("RTI: Transient federate %d joined.", fed->enclave.id); } + } else { + // Received message was rejected. Try again. + i--; } - // All federates have connected. - LF_PRINT_DEBUG("All persistent federates have connected to RTI."); - - if (rti_remote->clock_sync_global_status >= clock_sync_on) { - // Create the thread that performs periodic PTP clock synchronization sessions - // over the UDP channel, but only if the UDP channel is open and at least one - // federate is performing runtime clock synchronization. - bool clock_sync_enabled = false; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed_info = GET_FED_INFO(i); - if (fed_info->clock_synchronization_enabled) { - clock_sync_enabled = true; - break; - } - } - if (rti_remote->final_port_UDP != UINT16_MAX && clock_sync_enabled) { - lf_thread_create(&rti_remote->clock_thread, clock_synchronization_thread, NULL); + } + // All federates have connected. + LF_PRINT_DEBUG("All persistent federates have connected to RTI."); + + if (rti_remote->clock_sync_global_status >= clock_sync_on) { + // Create the thread that performs periodic PTP clock synchronization sessions + // over the UDP channel, but only if the UDP channel is open and at least one + // federate is performing runtime clock synchronization. 
+ bool clock_sync_enabled = false; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed_info = GET_FED_INFO(i); + if (fed_info->clock_synchronization_enabled) { + clock_sync_enabled = true; + break; } } + if (rti_remote->final_port_UDP != UINT16_MAX && clock_sync_enabled) { + lf_thread_create(&rti_remote->clock_thread, clock_synchronization_thread, NULL); + } } + } - /** - * @brief A request for immediate stop to the federate - * - * @param fed: the deferate to stop - */ - void send_stop(federate_info_t * fed) { - // Reply with a stop granted to all federates - unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; - outgoing_buffer[0] = MSG_TYPE_STOP; - lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); + /** + * @brief A request for immediate stop to the federate + * + * @param fed: the deferate to stop + */ + void send_stop(federate_info_t * fed) { + // Reply with a stop granted to all federates + unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; + outgoing_buffer[0] = MSG_TYPE_STOP; + lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP, fed->enclave.id, NULL); - } - write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, - "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP, fed->enclave.id, NULL); + } + write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, + "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); + + LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); + } + + /** + * @brief A request for immediate stop to the federate + * + * @param fed: the deferate to stop + */ + void send_stop(federate_info_t * fed) { + // Reply with 
a stop granted to all federates + unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; + outgoing_buffer[0] = MSG_TYPE_STOP; + lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); - LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP, fed->enclave.id, NULL); } + write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, + "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); - /** - * @brief A request for immediate stop to the federate - * - * @param fed: the deferate to stop - */ - void send_stop(federate_info_t * fed) { - // Reply with a stop granted to all federates - unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; - outgoing_buffer[0] = MSG_TYPE_STOP; - lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); + LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); + } - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP, fed->enclave.id, NULL); - } - write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, - "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); - - LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); - } - - void* lf_connect_to_transient_federates_thread(void* nothing) { - // This loop will continue to accept connections of transient federates, as - // soon as there is room, or enable hot swap - - while (!rti_remote->all_persistent_federates_exited) { - // Continue waiting for an incoming connection requests from transients - // to join, or for hot swap. - // Wait for an incoming connection request. - struct sockaddr client_fd; - uint32_t client_length = sizeof(client_fd); - // The following blocks until a federate connects. 
- int socket_id = -1; - while (1) { - if (!rti_remote->all_persistent_federates_exited) { - return NULL; - } - socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); - if (socket_id >= 0) { - // Got a socket - break; - } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { - lf_print_error_system_failure("RTI failed to accept the socket."); - } else { - // Try again - lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); - continue; - } + void* lf_connect_to_transient_federates_thread(void* nothing) { + // This loop will continue to accept connections of transient federates, as + // soon as there is room, or enable hot swap + + while (!rti_remote->all_persistent_federates_exited) { + // Continue waiting for an incoming connection requests from transients + // to join, or for hot swap. + // Wait for an incoming connection request. + struct sockaddr client_fd; + uint32_t client_length = sizeof(client_fd); + // The following blocks until a federate connects. + int socket_id = -1; + while (1) { + if (!rti_remote->all_persistent_federates_exited) { + return NULL; } + socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); + if (socket_id >= 0) { + // Got a socket + break; + } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { + lf_print_error_system_failure("RTI failed to accept the socket."); + } else { + // Try again + lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); + continue; + } + } // Wait for the first message from the federate when RTI -a option is on. #ifdef __RTI_AUTH__ - if (rti_remote->authentication_enabled) { - if (!authenticate_federate(&socket_id)) { - lf_print_warning("RTI failed to authenticate the incoming federate."); - // Close the socket. - shutdown(socket_id, SHUT_RDWR); - close(socket_id); - socket_id = -1; - // Ignore the federate that failed authentication. 
- i--; - continue; - } + if (rti_remote->authentication_enabled) { + if (!authenticate_federate(&socket_id)) { + lf_print_warning("RTI failed to authenticate the incoming federate."); + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); + socket_id = -1; + // Ignore the federate that failed authentication. + i--; + continue; } + } #endif - // The first message from the federate should contain its ID and the federation ID. - // The function also detects if a hot swap request is initiated. - int32_t fed_id = receive_and_check_fed_id_message(&socket_id, (struct sockaddr_in*)&client_fd); - - if (fed_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && - receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { - LF_MUTEX_LOCK(&rti_mutex); - if (hot_swap_in_progress) { - lf_print("RTI: Hot swap confirmed for federate %d.", fed_id); - - // Then send STOP - federate_info_t* fed_old = GET_FED_INFO(fed_id); - hot_swap_federate->enclave.completed = fed_old->enclave.completed; - - LF_PRINT_LOG("RTI: Send MSG_TYPE_STOP to old federate %d.", fed_id); - send_stop(fed_old); - LF_MUTEX_UNLOCK(&rti_mutex); - - // Wait for the old federate to send MSG_TYPE_RESIGN - LF_PRINT_LOG("RTI: Waiting for old federate %d to send resign.", fed_id); - // FIXME: Should this have a timeout? - while (!hot_swap_old_resigned) - ; - - // The latest LTC is the tag at which the old federate resigned. This is useful - // for computing the effective_start_time of the new joining federate. - hot_swap_federate->enclave.completed = fed_old->enclave.completed; - - // Create a thread to communicate with the federate. - // This has to be done after clock synchronization is finished - // or that thread may end up attempting to handle incoming clock - // synchronization messages. 
- lf_thread_create(&(hot_swap_federate->thread_id), federate_info_thread_TCP, hot_swap_federate); - - // Redirect the federate in rti_remote - rti_remote->base.scheduling_nodes[fed_id] = (scheduling_node_t*)hot_swap_federate; - - // Free the old federate memory and reset the Hot wap indicators - // FIXME: Is this enough to free the memory allocated to the federate? - free(fed_old); - lf_mutex_lock(&rti_mutex); - hot_swap_in_progress = false; - lf_mutex_unlock(&rti_mutex); - - lf_print("RTI: Hot swap succeeded for federate %d.", fed_id); - } else { - lf_mutex_unlock(&rti_mutex); - - // Create a thread to communicate with the federate. - // This has to be done after clock synchronization is finished - // or that thread may end up attempting to handle incoming clock - // synchronization messages. - federate_info_t* fed = GET_FED_INFO(fed_id); - lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); - lf_print("RTI: Transient federate %d joined.", fed_id); - } - rti_remote->number_of_connected_transient_federates++; + // The first message from the federate should contain its ID and the federation ID. + // The function also detects if a hot swap request is initiated. 
+ int32_t fed_id = receive_and_check_fed_id_message(&socket_id, (struct sockaddr_in*)&client_fd); + + if (fed_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && + receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { + LF_MUTEX_LOCK(&rti_mutex); + if (hot_swap_in_progress) { + lf_print("RTI: Hot swap confirmed for federate %d.", fed_id); + + // Then send STOP + federate_info_t* fed_old = GET_FED_INFO(fed_id); + hot_swap_federate->enclave.completed = fed_old->enclave.completed; + + LF_PRINT_LOG("RTI: Send MSG_TYPE_STOP to old federate %d.", fed_id); + send_stop(fed_old); + LF_MUTEX_UNLOCK(&rti_mutex); + + // Wait for the old federate to send MSG_TYPE_RESIGN + LF_PRINT_LOG("RTI: Waiting for old federate %d to send resign.", fed_id); + // FIXME: Should this have a timeout? + while (!hot_swap_old_resigned) + ; + + // The latest LTC is the tag at which the old federate resigned. This is useful + // for computing the effective_start_time of the new joining federate. + hot_swap_federate->enclave.completed = fed_old->enclave.completed; + + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. + lf_thread_create(&(hot_swap_federate->thread_id), federate_info_thread_TCP, hot_swap_federate); + + // Redirect the federate in rti_remote + rti_remote->base.scheduling_nodes[fed_id] = (scheduling_node_t*)hot_swap_federate; + + // Free the old federate memory and reset the Hot wap indicators + // FIXME: Is this enough to free the memory allocated to the federate? 
+ free(fed_old); + lf_mutex_lock(&rti_mutex); + hot_swap_in_progress = false; + lf_mutex_unlock(&rti_mutex); + + lf_print("RTI: Hot swap succeeded for federate %d.", fed_id); } else { - // If a hot swap was initialed, but the connection information or/and clock - // synchronization fail, then reset hot_swap_in_profress, and free the memory - // allocated for hot_swap_federate - if (hot_swap_in_progress) { - lf_print("RTI: Hot swap canceled for federate %d.", fed_id); - lf_mutex_lock(&rti_mutex); - hot_swap_in_progress = false; - lf_mutex_unlock(&rti_mutex); - - // FIXME: Is this enough to free the memory of a federate_info_t data structure? - free(hot_swap_federate); - } - } - } - } + lf_mutex_unlock(&rti_mutex); - void* respond_to_erroneous_connections(void* nothing) { - initialize_lf_thread_id(); - while (true) { - // Wait for an incoming connection request. - // The following will block until either a federate attempts to connect - // or close(rti->socket_descriptor_TCP) is called. - int socket_id = accept_socket(rti_remote->socket_descriptor_TCP, -1); - if (socket_id < 0) { - return NULL; - } - if (rti_remote->all_federates_exited) { - return NULL; + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. 
+ federate_info_t* fed = GET_FED_INFO(fed_id); + lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + lf_print("RTI: Transient federate %d joined.", fed_id); } + rti_remote->number_of_connected_transient_federates++; + } else { + // If a hot swap was initialed, but the connection information or/and clock + // synchronization fail, then reset hot_swap_in_profress, and free the memory + // allocated for hot_swap_federate + if (hot_swap_in_progress) { + lf_print("RTI: Hot swap canceled for federate %d.", fed_id); + lf_mutex_lock(&rti_mutex); + hot_swap_in_progress = false; + lf_mutex_unlock(&rti_mutex); - lf_print_error("RTI received an unexpected connection request. Federation is running."); - unsigned char response[2]; - response[0] = MSG_TYPE_REJECT; - response[1] = FEDERATION_ID_DOES_NOT_MATCH; - // Ignore errors on this response. - if (write_to_socket(socket_id, 2, response)) { - lf_print_warning("RTI failed to write FEDERATION_ID_DOES_NOT_MATCH to erroneous incoming connection."); + // FIXME: Is this enough to free the memory of a federate_info_t data structure? + free(hot_swap_federate); } - // Close the socket. - shutdown(socket_id, SHUT_RDWR); - close(socket_id); } - return NULL; - } - - void initialize_federate(federate_info_t * fed, uint16_t id) { - initialize_scheduling_node(&(fed->enclave), id); - fed->requested_stop = false; - fed->socket = -1; // No socket. - fed->clock_synchronization_enabled = true; - fed->in_transit_message_tags = pqueue_tag_init(10); - strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); - fed->server_ip_addr.s_addr = 0; - fed->server_port = -1; - fed->is_transient = true; - fed->effective_start_tag = NEVER_TAG; - fed->pending_grant = NEVER_TAG; - fed->pending_provisional_grant = NEVER_TAG; - } - - void reset_transient_federate(federate_info_t * fed) { - fed->enclave.next_event = NEVER_TAG; - fed->enclave.state = NOT_CONNECTED; - // Reset of the federate-related attributes - fed->socket = -1; // No socket. 
- fed->clock_synchronization_enabled = true; - fed->in_transit_message_tags = pqueue_tag_init(10); - strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); - fed->server_ip_addr.s_addr = 0; - fed->server_port = -1; - fed->requested_stop = false; - fed->is_transient = true; - fed->effective_start_tag = NEVER_TAG; - fed->pending_grant = NEVER_TAG; - fed->pending_provisional_grant = NEVER_TAG; - // FIXME: There is room though to check if the interface has changed??? Do we allow this? - } - - int32_t start_rti_server(uint16_t port) { - _lf_initialize_clock(); - // Create the TCP socket server - if (create_server(port, &rti_remote->socket_descriptor_TCP, &rti_remote->final_port_TCP, TCP, true)) { - lf_print_error_system_failure("RTI failed to create TCP server: %s.", strerror(errno)); - }; - lf_print("RTI: Listening for federates."); - // Create the UDP socket server - // Try to get the rti_remote->final_port_TCP + 1 port - if (rti_remote->clock_sync_global_status >= clock_sync_on) { - if (create_server(rti_remote->final_port_TCP + 1, &rti_remote->socket_descriptor_UDP, - &rti_remote->final_port_UDP, UDP, true)) { - lf_print_error_system_failure("RTI failed to create UDP server: %s.", strerror(errno)); - } + } + } + + void* respond_to_erroneous_connections(void* nothing) { + initialize_lf_thread_id(); + while (true) { + // Wait for an incoming connection request. + // The following will block until either a federate attempts to connect + // or close(rti->socket_descriptor_TCP) is called. + int socket_id = accept_socket(rti_remote->socket_descriptor_TCP, -1); + if (socket_id < 0) { + return NULL; + } + if (rti_remote->all_federates_exited) { + return NULL; + } + + lf_print_error("RTI received an unexpected connection request. Federation is running."); + unsigned char response[2]; + response[0] = MSG_TYPE_REJECT; + response[1] = FEDERATION_ID_DOES_NOT_MATCH; + // Ignore errors on this response. 
+ if (write_to_socket(socket_id, 2, response)) { + lf_print_warning("RTI failed to write FEDERATION_ID_DOES_NOT_MATCH to erroneous incoming connection."); } - return rti_remote->socket_descriptor_TCP; + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); } + return NULL; + } - /** - * Iterate over the federates and sets 'has_upstream_transient_federates'. - * Once done, check that no transient federate has an upstream transient federate, - * and compute the number of persistent federates that do have upstream transients, - * which is the maximun number of delayed grants that can be pending at the same time. - * This is useful for initialyzing the queue of delayed grants. + void initialize_federate(federate_info_t * fed, uint16_t id) { + initialize_scheduling_node(&(fed->enclave), id); + fed->requested_stop = false; + fed->socket = -1; // No socket. + fed->clock_synchronization_enabled = true; + fed->in_transit_message_tags = pqueue_tag_init(10); + strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); + fed->server_ip_addr.s_addr = 0; + fed->server_port = -1; + fed->is_transient = true; + fed->effective_start_tag = NEVER_TAG; + fed->pending_grant = NEVER_TAG; + fed->pending_provisional_grant = NEVER_TAG; + } - * @return -1, if there is more than one level of transiency, else, the number of - * persistents that have an upstream transient - */ - static int set_has_upstream_transient_federates_parameter_and_check() { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - for (int j = 0; j < fed->enclave.num_upstream; j++) { - federate_info_t* upstream_fed = GET_FED_INFO(fed->enclave.upstream[j]); - if (upstream_fed->is_transient) { - fed->has_upstream_transient_federates = true; - break; - } - } + void reset_transient_federate(federate_info_t * fed) { + fed->enclave.next_event = NEVER_TAG; + fed->enclave.state = NOT_CONNECTED; + // Reset of the federate-related attributes + 
fed->socket = -1; // No socket. + fed->clock_synchronization_enabled = true; + fed->in_transit_message_tags = pqueue_tag_init(10); + strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); + fed->server_ip_addr.s_addr = 0; + fed->server_port = -1; + fed->requested_stop = false; + fed->is_transient = true; + fed->effective_start_tag = NEVER_TAG; + fed->pending_grant = NEVER_TAG; + fed->pending_provisional_grant = NEVER_TAG; + // FIXME: There is room though to check if the interface has changed??? Do we allow this? + } + + int32_t start_rti_server(uint16_t port) { + _lf_initialize_clock(); + // Create the TCP socket server + if (create_server(port, &rti_remote->socket_descriptor_TCP, &rti_remote->final_port_TCP, TCP, true)) { + lf_print_error_system_failure("RTI failed to create TCP server: %s.", strerror(errno)); + }; + lf_print("RTI: Listening for federates."); + // Create the UDP socket server + // Try to get the rti_remote->final_port_TCP + 1 port + if (rti_remote->clock_sync_global_status >= clock_sync_on) { + if (create_server(rti_remote->final_port_TCP + 1, &rti_remote->socket_descriptor_UDP, &rti_remote->final_port_UDP, + UDP, true)) { + lf_print_error_system_failure("RTI failed to create UDP server: %s.", strerror(errno)); } + } + return rti_remote->socket_descriptor_TCP; + } - // Now check that no transient has an upstream transient - // FIXME: Do we really need this? Or should it be the job of the validator? - int max_number_of_delayed_grants = 0; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (fed->is_transient && fed->has_upstream_transient_federates) { - return -1; - } - if (!fed->is_transient && fed->has_upstream_transient_federates) { - max_number_of_delayed_grants++; + /** + * Iterate over the federates and sets 'has_upstream_transient_federates'. 
+ * Once done, check that no transient federate has an upstream transient federate, + * and compute the number of persistent federates that do have upstream transients, + * which is the maximun number of delayed grants that can be pending at the same time. + * This is useful for initialyzing the queue of delayed grants. + + * @return -1, if there is more than one level of transiency, else, the number of + * persistents that have an upstream transient + */ + static int set_has_upstream_transient_federates_parameter_and_check() { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + for (int j = 0; j < fed->enclave.num_upstream; j++) { + federate_info_t* upstream_fed = GET_FED_INFO(fed->enclave.upstream[j]); + if (upstream_fed->is_transient) { + fed->has_upstream_transient_federates = true; + break; } } + } - return max_number_of_delayed_grants; + // Now check that no transient has an upstream transient + // FIXME: Do we really need this? Or should it be the job of the validator? + int max_number_of_delayed_grants = 0; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->is_transient && fed->has_upstream_transient_federates) { + return -1; + } + if (!fed->is_transient && fed->has_upstream_transient_federates) { + max_number_of_delayed_grants++; + } } - void wait_for_federates(int socket_descriptor) { - // Wait for connections from persistent federates and create a thread for each. - lf_connect_to_persistent_federates(socket_descriptor); - // Wait for connections from persistent federates and create a thread for each. - lf_connect_to_persistent_federates(socket_descriptor); + return max_number_of_delayed_grants; + } + + void wait_for_federates(int socket_descriptor) { + // Wait for connections from persistent federates and create a thread for each. 
+ lf_connect_to_persistent_federates(socket_descriptor); + // Wait for connections from persistent federates and create a thread for each. + lf_connect_to_persistent_federates(socket_descriptor); - // Set has_upstream_transient_federates parameter in all federates and check - // that there is no more than one level of transiency - if (rti_remote->number_of_transient_federates > 0) { - int max_number_of_pending_grants = set_has_upstream_transient_federates_parameter_and_check(); - if (max_number_of_pending_grants == -1) { - lf_print_error_and_exit("RTI: Transient federates cannot have transient upstreams!"); - } - rti_remote->delayed_grants = pqueue_delayed_grants_init(max_number_of_pending_grants); + // Set has_upstream_transient_federates parameter in all federates and check + // that there is no more than one level of transiency + if (rti_remote->number_of_transient_federates > 0) { + int max_number_of_pending_grants = set_has_upstream_transient_federates_parameter_and_check(); + if (max_number_of_pending_grants == -1) { + lf_print_error_and_exit("RTI: Transient federates cannot have transient upstreams!"); } + rti_remote->delayed_grants = pqueue_delayed_grants_init(max_number_of_pending_grants); + } - // All persistent federates have connected. - lf_print("RTI: All expected persistent federates have connected. Starting execution."); - if (rti_remote->number_of_transient_federates > 0) { - lf_print("RTI: Transient Federates can join and leave the federation at anytime."); - } + // All persistent federates have connected. + lf_print("RTI: All expected persistent federates have connected. Starting execution."); + if (rti_remote->number_of_transient_federates > 0) { + lf_print("RTI: Transient Federates can join and leave the federation at anytime."); + } - // The socket server will only continue to accept connections from transient - // federates. - // In case some other federation's federates are trying to join the wrong - // federation, need to respond. 
Start a separate thread to do that. - lf_thread_t responder_thread; - lf_thread_t transient_thread; - lf_thread_t delayed_grants_thread; - - // If the federation does not include transient federates, then respond to - // erronous connections. Otherwise, continue to accept transients joining and - // respond to duplicate joing requests. - if (rti_remote->number_of_transient_federates == 0) { - lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); - } else if (rti_remote->number_of_transient_federates > 0) { - lf_thread_create(&transient_thread, lf_connect_to_transient_federates_thread, NULL); - lf_thread_create(&delayed_grants_thread, lf_delayed_grants_thread, NULL); + // The socket server will only continue to accept connections from transient + // federates. + // In case some other federation's federates are trying to join the wrong + // federation, need to respond. Start a separate thread to do that. + lf_thread_t responder_thread; + lf_thread_t transient_thread; + lf_thread_t delayed_grants_thread; + + // If the federation does not include transient federates, then respond to + // erronous connections. Otherwise, continue to accept transients joining and + // respond to duplicate joing requests. + if (rti_remote->number_of_transient_federates == 0) { + lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); + } else if (rti_remote->number_of_transient_federates > 0) { + lf_thread_create(&transient_thread, lf_connect_to_transient_federates_thread, NULL); + lf_thread_create(&delayed_grants_thread, lf_delayed_grants_thread, NULL); + } + + // Wait for persistent federate threads to exit. 
+ void* thread_exit_status; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (!fed->is_transient) { + lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); + lf_thread_join(fed->thread_id, &thread_exit_status); + pqueue_tag_free(fed->in_transit_message_tags); + lf_print("RTI: Persistent federate %d thread exited.", fed->enclave.id); } + } + + rti_remote->all_persistent_federates_exited = true; + rti_remote->phase = shutdown_phase; + lf_print("RTI: All persistent threads exited."); - // Wait for persistent federate threads to exit. - void* thread_exit_status; + // Wait for transient federate threads to exit, if any. + if (rti_remote->number_of_transient_federates > 0) { for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { federate_info_t* fed = GET_FED_INFO(i); - if (!fed->is_transient) { + if (fed->is_transient) { lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); lf_thread_join(fed->thread_id, &thread_exit_status); pqueue_tag_free(fed->in_transit_message_tags); - lf_print("RTI: Persistent federate %d thread exited.", fed->enclave.id); - } - } - - rti_remote->all_persistent_federates_exited = true; - rti_remote->phase = shutdown_phase; - lf_print("RTI: All persistent threads exited."); - - // Wait for transient federate threads to exit, if any. 
- if (rti_remote->number_of_transient_federates > 0) { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (fed->is_transient) { - lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); - lf_thread_join(fed->thread_id, &thread_exit_status); - pqueue_tag_free(fed->in_transit_message_tags); - lf_print("RTI: Transient federate %d thread exited.", fed->enclave.id); - } + lf_print("RTI: Transient federate %d thread exited.", fed->enclave.id); } } + } - rti_remote->all_federates_exited = true; + rti_remote->all_federates_exited = true; - // Shutdown and close the socket that is listening for incoming connections - // so that the accept() call in respond_to_erroneous_connections returns. - // That thread should then check rti->all_federates_exited and it should exit. - if (shutdown(socket_descriptor, SHUT_RDWR)) { - LF_PRINT_LOG("On shut down TCP socket, received reply: %s", strerror(errno)); - } - // NOTE: In all common TCP/IP stacks, there is a time period, - // typically between 30 and 120 seconds, called the TIME_WAIT period, - // before the port is released after this close. This is because - // the OS is preventing another program from accidentally receiving - // duplicated packets intended for this program. - close(socket_descriptor); - - if (rti_remote->socket_descriptor_UDP > 0) { - if (shutdown(rti_remote->socket_descriptor_UDP, SHUT_RDWR)) { - LF_PRINT_LOG("On shut down UDP socket, received reply: %s", strerror(errno)); - } - close(rti_remote->socket_descriptor_UDP); - } + // Shutdown and close the socket that is listening for incoming connections + // so that the accept() call in respond_to_erroneous_connections returns. + // That thread should then check rti->all_federates_exited and it should exit. 
+ if (shutdown(socket_descriptor, SHUT_RDWR)) { + LF_PRINT_LOG("On shut down TCP socket, received reply: %s", strerror(errno)); } + // NOTE: In all common TCP/IP stacks, there is a time period, + // typically between 30 and 120 seconds, called the TIME_WAIT period, + // before the port is released after this close. This is because + // the OS is preventing another program from accidentally receiving + // duplicated packets intended for this program. + close(socket_descriptor); + + if (rti_remote->socket_descriptor_UDP > 0) { + if (shutdown(rti_remote->socket_descriptor_UDP, SHUT_RDWR)) { + LF_PRINT_LOG("On shut down UDP socket, received reply: %s", strerror(errno)); + } + close(rti_remote->socket_descriptor_UDP); + } + } - void initialize_RTI(rti_remote_t * rti) { - rti_remote = rti; - - // Initialize thread synchronization primitives - LF_MUTEX_INIT(&rti_mutex); - LF_COND_INIT(&received_start_times, &rti_mutex); - LF_COND_INIT(&sent_start_time, &rti_mutex); - LF_COND_INIT(&updated_delayed_grants, &rti_mutex); - - initialize_rti_common(&rti_remote->base); - rti_remote->base.mutex = &rti_mutex; - - // federation_rti related initializations - rti_remote->max_start_time = 0LL; - rti_remote->num_feds_proposed_start = 0; - rti_remote->all_federates_exited = false; - rti_remote->federation_id = "Unidentified Federation"; - rti_remote->user_specified_port = 0; - rti_remote->final_port_TCP = 0; - rti_remote->socket_descriptor_TCP = -1; - rti_remote->final_port_UDP = UINT16_MAX; - rti_remote->socket_descriptor_UDP = -1; - rti_remote->clock_sync_global_status = clock_sync_init; - rti_remote->clock_sync_period_ns = MSEC(10); - rti_remote->clock_sync_exchanges_per_interval = 10; - rti_remote->authentication_enabled = false; - rti_remote->base.tracing_enabled = false; - rti_remote->stop_in_progress = false; - rti_remote->number_of_transient_federates = 0; - rti_remote->phase = startup_phase; - } - - void free_scheduling_nodes(scheduling_node_t * *scheduling_nodes, uint16_t 
number_of_scheduling_nodes) { - for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) { - // FIXME: Gives error freeing memory not allocated!!!! - scheduling_node_t* node = scheduling_nodes[i]; - if (node->upstream != NULL) - free(node->upstream); - if (node->downstream != NULL) - free(node->downstream); - } - free(scheduling_nodes); + void initialize_RTI(rti_remote_t * rti) { + rti_remote = rti; + + // Initialize thread synchronization primitives + LF_MUTEX_INIT(&rti_mutex); + LF_COND_INIT(&received_start_times, &rti_mutex); + LF_COND_INIT(&sent_start_time, &rti_mutex); + LF_COND_INIT(&updated_delayed_grants, &rti_mutex); + + initialize_rti_common(&rti_remote->base); + rti_remote->base.mutex = &rti_mutex; + + // federation_rti related initializations + rti_remote->max_start_time = 0LL; + rti_remote->num_feds_proposed_start = 0; + rti_remote->all_federates_exited = false; + rti_remote->federation_id = "Unidentified Federation"; + rti_remote->user_specified_port = 0; + rti_remote->final_port_TCP = 0; + rti_remote->socket_descriptor_TCP = -1; + rti_remote->final_port_UDP = UINT16_MAX; + rti_remote->socket_descriptor_UDP = -1; + rti_remote->clock_sync_global_status = clock_sync_init; + rti_remote->clock_sync_period_ns = MSEC(10); + rti_remote->clock_sync_exchanges_per_interval = 10; + rti_remote->authentication_enabled = false; + rti_remote->base.tracing_enabled = false; + rti_remote->stop_in_progress = false; + rti_remote->number_of_transient_federates = 0; + rti_remote->phase = startup_phase; + } + + void free_scheduling_nodes(scheduling_node_t * *scheduling_nodes, uint16_t number_of_scheduling_nodes) { + for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) { + // FIXME: Gives error freeing memory not allocated!!!! 
+ scheduling_node_t* node = scheduling_nodes[i]; + if (node->upstream != NULL) + free(node->upstream); + if (node->downstream != NULL) + free(node->downstream); } + free(scheduling_nodes); + } #endif // STANDALONE_RTI diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index c6a1beaee..ef6634b0b 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -588,9 +588,7 @@ void _lf_initialize_start_tag(environment_t* env) { // If we have a non-zero STA offset, then we need to allow messages to arrive // at the start time. To avoid spurious STP violations, we temporarily // set the current time back by the STA offset. - env->current_tag = - (tag_t){.time = effective_start_tag.time - lf_fed_STA_offset, .microstep = effective_start_tag.microstep}; - + env->current_tag.time -= lf_fed_STA_offset; LF_PRINT_LOG("Waiting for start time " PRINTF_TIME " plus STA " PRINTF_TIME ".", start_time, lf_fed_STA_offset); #else // For other than federated decentralized execution, there is no lf_fed_STA_offset variable defined. 
From 9461e8fc466fd89e26271894d24dfa10cf4c44b9 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 4 Mar 2024 00:43:47 +0100 Subject: [PATCH 092/148] Use a tag queue to manage delayed grants --- core/federated/RTI/rti_remote.c | 297 ++++++++++++++------------------ 1 file changed, 132 insertions(+), 165 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index ea26c18f6..6a53ff95d 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -30,6 +30,7 @@ #include "net_util.h" #include #include "clock.h" // For lf_clock_cond_timedwait() +#include "clock.h" // For lf_clock_cond_timedwait() // Global variables defined in tag.c: extern instant_t start_time; @@ -193,48 +194,6 @@ static void notify_grant_delayed(federate_info_t* fed, tag_t tag, bool is_provis LF_MUTEX_UNLOCK(&rti_mutex); } -/** - * @brief Cancel a delayed grant by removing it from delayed_grants queue. - * - * The removal will cause the broadcast to cause the delayed_grants_thread to - * account for the update. - * - * In case there is already a grant for that federte, keep the soonest one. - * FIXME: Is that correct? - * - * @param fed The federate. 
- */ - -void notify_grant_canceled(federate_info_t* fed) { - LF_MUTEX_LOCK(&rti_mutex); - pqueue_delayed_grant_element_t* dge = - pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, fed->enclave.id); - if (dge != NULL) { - pqueue_delayed_grants_remove(rti_remote->delayed_grants, dge); - LF_PRINT_LOG("RTI: Canceling the delayed grant of " PRINTF_TAG " for federate %d.", dge->base.tag.time - start_time, - dge->base.tag.microstep, dge->fed_id); - lf_cond_signal(&updated_delayed_grants); - } - LF_MUTEX_UNLOCK(&rti_mutex); -} - -/** - * Find the number of non connected upstream transients - * @param fed The federate - * @return the number of non connected upstream transients - */ -static int get_num_absent_upstream_transients(federate_info_t* fed) { - int num_absent_upstream_transients = 0; - for (int j = 0; j < fed->enclave.num_upstream; j++) { - federate_info_t* upstream = GET_FED_INFO(fed->enclave.upstream[j]); - // Do Ignore this enclave if it no longer connected. - if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { - num_absent_upstream_transients++; - } - } - return num_absent_upstream_transients; -} - /** * Find the number of non connected upstream transients * @param fed The federate @@ -349,7 +308,7 @@ void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { notify_tag_advance_grant_immediate(e, tag); } else { if (get_num_absent_upstream_transients(fed) > 0) { - notify_tag_advance_grant_delayed(fed, tag); + notify_grant_delayed(fed, tag, false); } else { notify_tag_advance_grant_immediate(e, tag); } @@ -2268,10 +2227,9 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); fed->server_ip_addr.s_addr = 0; fed->server_port = -1; + fed->has_upstream_transient_federates = false; fed->is_transient = true; fed->effective_start_tag = NEVER_TAG; - fed->pending_grant = NEVER_TAG; - fed->pending_provisional_grant = NEVER_TAG; } void 
reset_transient_federate(federate_info_t * fed) { @@ -2287,9 +2245,6 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { fed->requested_stop = false; fed->is_transient = true; fed->effective_start_tag = NEVER_TAG; - fed->pending_grant = NEVER_TAG; - fed->pending_provisional_grant = NEVER_TAG; - // FIXME: There is room though to check if the interface has changed??? Do we allow this? } int32_t start_rti_server(uint16_t port) { @@ -2312,7 +2267,7 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { /** * Iterate over the federates and sets 'has_upstream_transient_federates'. - * Once done, check that no transient federate has an upstream transient federate, + * Once done, check that no transient federate has an upstream transient federate. * and compute the number of persistent federates that do have upstream transients, * which is the maximun number of delayed grants that can be pending at the same time. * This is useful for initialyzing the queue of delayed grants. @@ -2344,144 +2299,156 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { max_number_of_delayed_grants++; } } + // Now check that no transient has an upstream transient + // FIXME: Do we really need this? Or should it be the job of the validator? + uint16_t max_number_of_delayed_grants = 0; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->is_transient && fed->has_upstream_transient_federates) { + return -1; + } + if (!fed->is_transient && fed->has_upstream_transient_federates) { + max_number_of_delayed_grants++; + } + } return max_number_of_delayed_grants; } + return max_number_of_delayed_grants; +} - void wait_for_federates(int socket_descriptor) { - // Wait for connections from persistent federates and create a thread for each. 
- lf_connect_to_persistent_federates(socket_descriptor); - // Wait for connections from persistent federates and create a thread for each. - lf_connect_to_persistent_federates(socket_descriptor); - - // Set has_upstream_transient_federates parameter in all federates and check - // that there is no more than one level of transiency - if (rti_remote->number_of_transient_federates > 0) { - int max_number_of_pending_grants = set_has_upstream_transient_federates_parameter_and_check(); - if (max_number_of_pending_grants == -1) { - lf_print_error_and_exit("RTI: Transient federates cannot have transient upstreams!"); - } - rti_remote->delayed_grants = pqueue_delayed_grants_init(max_number_of_pending_grants); - } +void wait_for_federates(int socket_descriptor) { + // Wait for connections from persistent federates and create a thread for each. + lf_connect_to_persistent_federates(socket_descriptor); - // All persistent federates have connected. - lf_print("RTI: All expected persistent federates have connected. Starting execution."); - if (rti_remote->number_of_transient_federates > 0) { - lf_print("RTI: Transient Federates can join and leave the federation at anytime."); + // Set has_upstream_transient_federates parameter in all federates and check + // that there is no more than one level of transiency + if (rti_remote->number_of_transient_federates > 0) { + int max_number_of_pending_grants = set_has_upstream_transient_federates_parameter_and_check(); + if (max_number_of_pending_grants == -1) { + lf_print_error_and_exit("RTI: Transient federates cannot have transient upstreams!"); } + rti_remote->delayed_grants = pqueue_delayed_grants_init(max_number_of_pending_grants); + } - // The socket server will only continue to accept connections from transient - // federates. - // In case some other federation's federates are trying to join the wrong - // federation, need to respond. Start a separate thread to do that. 
- lf_thread_t responder_thread; - lf_thread_t transient_thread; - lf_thread_t delayed_grants_thread; + // All persistent federates have connected. + lf_print("RTI: All expected persistent federates have connected. Starting execution."); + if (rti_remote->number_of_transient_federates > 0) { + lf_print("RTI: Transient Federates can join and leave the federation at anytime."); + } + + // The socket server will only continue to accept connections from transient + // federates. + // In case some other federation's federates are trying to join the wrong + // federation, need to respond. Start a separate thread to do that. + lf_thread_t responder_thread; + lf_thread_t transient_thread; + lf_thread_t delayed_grants_thread; + + // If the federation does not include transient federates, then respond to + // erronous connections. Otherwise, continue to accept transients joining and + // respond to duplicate joing requests. + if (rti_remote->number_of_transient_federates == 0) { + lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); + } else if (rti_remote->number_of_transient_federates > 0) { + lf_thread_create(&transient_thread, lf_connect_to_transient_federates_thread, NULL); + lf_thread_create(&delayed_grants_thread, lf_delayed_grants_thread, NULL); + } - // If the federation does not include transient federates, then respond to - // erronous connections. Otherwise, continue to accept transients joining and - // respond to duplicate joing requests. - if (rti_remote->number_of_transient_federates == 0) { - lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); - } else if (rti_remote->number_of_transient_federates > 0) { - lf_thread_create(&transient_thread, lf_connect_to_transient_federates_thread, NULL); - lf_thread_create(&delayed_grants_thread, lf_delayed_grants_thread, NULL); + // Wait for persistent federate threads to exit. 
+ void* thread_exit_status; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (!fed->is_transient) { + lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); + lf_thread_join(fed->thread_id, &thread_exit_status); + pqueue_tag_free(fed->in_transit_message_tags); + lf_print("RTI: Persistent federate %d thread exited.", fed->enclave.id); } + } - // Wait for persistent federate threads to exit. - void* thread_exit_status; + rti_remote->all_persistent_federates_exited = true; + rti_remote->phase = shutdown_phase; + lf_print("RTI: All persistent threads exited."); + + // Wait for transient federate threads to exit, if any. + if (rti_remote->number_of_transient_federates > 0) { for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { federate_info_t* fed = GET_FED_INFO(i); - if (!fed->is_transient) { + if (fed->is_transient) { lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); lf_thread_join(fed->thread_id, &thread_exit_status); pqueue_tag_free(fed->in_transit_message_tags); - lf_print("RTI: Persistent federate %d thread exited.", fed->enclave.id); - } - } - - rti_remote->all_persistent_federates_exited = true; - rti_remote->phase = shutdown_phase; - lf_print("RTI: All persistent threads exited."); - - // Wait for transient federate threads to exit, if any. 
- if (rti_remote->number_of_transient_federates > 0) { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (fed->is_transient) { - lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); - lf_thread_join(fed->thread_id, &thread_exit_status); - pqueue_tag_free(fed->in_transit_message_tags); - lf_print("RTI: Transient federate %d thread exited.", fed->enclave.id); - } + lf_print("RTI: Transient federate %d thread exited.", fed->enclave.id); } } + } - rti_remote->all_federates_exited = true; + rti_remote->all_federates_exited = true; - // Shutdown and close the socket that is listening for incoming connections - // so that the accept() call in respond_to_erroneous_connections returns. - // That thread should then check rti->all_federates_exited and it should exit. - if (shutdown(socket_descriptor, SHUT_RDWR)) { - LF_PRINT_LOG("On shut down TCP socket, received reply: %s", strerror(errno)); - } - // NOTE: In all common TCP/IP stacks, there is a time period, - // typically between 30 and 120 seconds, called the TIME_WAIT period, - // before the port is released after this close. This is because - // the OS is preventing another program from accidentally receiving - // duplicated packets intended for this program. - close(socket_descriptor); - - if (rti_remote->socket_descriptor_UDP > 0) { - if (shutdown(rti_remote->socket_descriptor_UDP, SHUT_RDWR)) { - LF_PRINT_LOG("On shut down UDP socket, received reply: %s", strerror(errno)); - } - close(rti_remote->socket_descriptor_UDP); - } + // Shutdown and close the socket that is listening for incoming connections + // so that the accept() call in respond_to_erroneous_connections returns. + // That thread should then check rti->all_federates_exited and it should exit. 
+ if (shutdown(socket_descriptor, SHUT_RDWR)) { + LF_PRINT_LOG("On shut down TCP socket, received reply: %s", strerror(errno)); } - - void initialize_RTI(rti_remote_t * rti) { - rti_remote = rti; - - // Initialize thread synchronization primitives - LF_MUTEX_INIT(&rti_mutex); - LF_COND_INIT(&received_start_times, &rti_mutex); - LF_COND_INIT(&sent_start_time, &rti_mutex); - LF_COND_INIT(&updated_delayed_grants, &rti_mutex); - - initialize_rti_common(&rti_remote->base); - rti_remote->base.mutex = &rti_mutex; - - // federation_rti related initializations - rti_remote->max_start_time = 0LL; - rti_remote->num_feds_proposed_start = 0; - rti_remote->all_federates_exited = false; - rti_remote->federation_id = "Unidentified Federation"; - rti_remote->user_specified_port = 0; - rti_remote->final_port_TCP = 0; - rti_remote->socket_descriptor_TCP = -1; - rti_remote->final_port_UDP = UINT16_MAX; - rti_remote->socket_descriptor_UDP = -1; - rti_remote->clock_sync_global_status = clock_sync_init; - rti_remote->clock_sync_period_ns = MSEC(10); - rti_remote->clock_sync_exchanges_per_interval = 10; - rti_remote->authentication_enabled = false; - rti_remote->base.tracing_enabled = false; - rti_remote->stop_in_progress = false; - rti_remote->number_of_transient_federates = 0; - rti_remote->phase = startup_phase; + // NOTE: In all common TCP/IP stacks, there is a time period, + // typically between 30 and 120 seconds, called the TIME_WAIT period, + // before the port is released after this close. This is because + // the OS is preventing another program from accidentally receiving + // duplicated packets intended for this program. 
+ close(socket_descriptor); + + if (rti_remote->socket_descriptor_UDP > 0) { + if (shutdown(rti_remote->socket_descriptor_UDP, SHUT_RDWR)) { + LF_PRINT_LOG("On shut down UDP socket, received reply: %s", strerror(errno)); + } + close(rti_remote->socket_descriptor_UDP); } +} - void free_scheduling_nodes(scheduling_node_t * *scheduling_nodes, uint16_t number_of_scheduling_nodes) { - for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) { - // FIXME: Gives error freeing memory not allocated!!!! - scheduling_node_t* node = scheduling_nodes[i]; - if (node->upstream != NULL) - free(node->upstream); - if (node->downstream != NULL) - free(node->downstream); - } - free(scheduling_nodes); +void initialize_RTI(rti_remote_t* rti) { + rti_remote = rti; + + // Initialize thread synchronization primitives + LF_MUTEX_INIT(&rti_mutex); + LF_COND_INIT(&received_start_times, &rti_mutex); + LF_COND_INIT(&sent_start_time, &rti_mutex); + LF_COND_INIT(&updated_delayed_grants, &rti_mutex); + + initialize_rti_common(&rti_remote->base); + rti_remote->base.mutex = &rti_mutex; + + // federation_rti related initializations + rti_remote->max_start_time = 0LL; + rti_remote->num_feds_proposed_start = 0; + rti_remote->all_federates_exited = false; + rti_remote->federation_id = "Unidentified Federation"; + rti_remote->user_specified_port = 0; + rti_remote->final_port_TCP = 0; + rti_remote->socket_descriptor_TCP = -1; + rti_remote->final_port_UDP = UINT16_MAX; + rti_remote->socket_descriptor_UDP = -1; + rti_remote->clock_sync_global_status = clock_sync_init; + rti_remote->clock_sync_period_ns = MSEC(10); + rti_remote->clock_sync_exchanges_per_interval = 10; + rti_remote->authentication_enabled = false; + rti_remote->base.tracing_enabled = false; + rti_remote->stop_in_progress = false; + rti_remote->number_of_transient_federates = 0; + rti_remote->phase = startup_phase; +} + +void free_scheduling_nodes(scheduling_node_t** scheduling_nodes, uint16_t number_of_scheduling_nodes) { + for (uint16_t i = 
0; i < number_of_scheduling_nodes; i++) { + // FIXME: Gives error freeing memory not allocated!!!! + scheduling_node_t* node = scheduling_nodes[i]; + if (node->upstream != NULL) + free(node->upstream); + if (node->downstream != NULL) + free(node->downstream); } + free(scheduling_nodes); +} #endif // STANDALONE_RTI From 04c614aab795a7df371cd3aa927d0c3011a6d589 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Fri, 15 Mar 2024 01:54:56 +0100 Subject: [PATCH 093/148] Fix missing code from the previous commit --- core/federated/RTI/rti_remote.c | 347 +++++++++++++++++++++++++++++--- core/federated/RTI/rti_remote.h | 5 + 2 files changed, 320 insertions(+), 32 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 6a53ff95d..c70dad7db 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -134,6 +134,210 @@ void pqueue_delayed_grants_remove(pqueue_delayed_grants_t* q, pqueue_delayed_gra pqueue_tag_remove((pqueue_tag_t*)q, (void*)e); } +/** + * @brief Return the first item with the specified tag or NULL if there is none. + * @param q The queue. + * @param t The tag. + * @return An entry with the specified tag or NULL if there isn't one. + */ +pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_with_tag(pqueue_delayed_grants_t* q, tag_t t) { + return (pqueue_delayed_grant_element_t*)pqueue_tag_find_with_tag((pqueue_tag_t*)q, t); +} + +// Function that does not in pqueue_tag.c +/** + * @brief Return the first item with the specified federate id or NULL if there is none. + * @param q The queue. + * @param fed_id The federate id. + * @return An entry with the specified federate if or NULL if there isn't one. 
+ */ + +pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_delayed_grants_t* q, uint16_t fed_id) { + pqueue_delayed_grant_element_t* dge; + pqueue_t* _q = (pqueue_t*)q; + if (!q || q->size == 1) + return NULL; + for (int i = 1; i <= q->size; i++) { + dge = (pqueue_delayed_grant_element_t*)q->d[i]; + if (dge->fed_id == fed_id) { + return dge; + } + } + return NULL; +} + +// Utility functions to simplify the call of pqueue_tag routines. +// These functions mainly do the casting. +// FIXME: Should we remove the queue parameter from the functions? + +/** + * @brief Creates a priority queue of delayed grants that is sorted by tags. + * + * @param nbr_delayed_grants The size. + * @return The dynamically allocated queue or NULL. + */ +pqueue_delayed_grants_t* pqueue_delayed_grants_init(uint16_t nbr_delayed_grants) { + return (pqueue_delayed_grants_t*)pqueue_tag_init((size_t)nbr_delayed_grants); +} + +/** + * @brief Return the size of the queue. + * + * @param q The queue. + * @return The size. + */ +size_t pqueue_delayed_grants_size(pqueue_delayed_grants_t* q) { return pqueue_tag_size((pqueue_tag_t*)q); } + +/** + * @brief Insert an\ delayed grant element into the queue. + * + * @param q The queue. + * @param e The delayed grant element to insert. + * @return 0 on success + */ +int pqueue_delayed_grants_insert(pqueue_delayed_grants_t* q, pqueue_delayed_grant_element_t* d) { + return pqueue_tag_insert((pqueue_tag_t*)q, (void*)d); +} + +/** + * @brief Pop the least-tag element from the queue. + * + * @param q The queue. + * @return NULL on error, otherwise the entry + */ +pqueue_delayed_grant_element_t* pqueue_delayed_grants_pop(pqueue_delayed_grants_t* q) { + return (pqueue_delayed_grant_element_t*)pqueue_tag_pop((pqueue_tag_t*)q); +} + +/** + * @brief Return highest-ranking element without removing it. + * + * @param q The queue. + * @return NULL on if the queue is empty, otherwise the delayed grant element. 
+ */ +pqueue_delayed_grant_element_t* pqueue_delayed_grants_peek(pqueue_delayed_grants_t* q) { + return (pqueue_delayed_grant_element_t*)pqueue_tag_peek((pqueue_tag_t*)q); +} + +/** + * @brief Free all memory used by the queue including elements that are marked dynamic. + * + * @param q The queue. + */ +void pqueue_delayed_grants_free(pqueue_delayed_grants_t* q) { pqueue_tag_free((pqueue_tag_t*)q); } + +/** + * @brief Remove an item from the delayed grants queue. + * + * @param q The queue. + * @param e The entry to remove. + */ +void pqueue_delayed_grants_remove(pqueue_delayed_grants_t* q, pqueue_delayed_grant_element_t* e) { + pqueue_tag_remove((pqueue_tag_t*)q, (void*)e); +} + +/** + * @brief Return the first item with the specified tag or NULL if there is none. + * @param q The queue. + * @param t The tag. + * @return An entry with the specified tag or NULL if there isn't one. + */ +pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_with_tag(pqueue_delayed_grants_t* q, tag_t t) { + return (pqueue_delayed_grant_element_t*)pqueue_tag_find_with_tag((pqueue_tag_t*)q, t); +} + +// Function that does not in pqueue_tag.c +/** + * @brief Return the first item with the specified federate id or NULL if there is none. + * @param q The queue. + * @param fed_id The federate id. + * @return An entry with the specified federate if or NULL if there isn't one. + */ + +pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_delayed_grants_t* q, uint16_t fed_id) { + pqueue_delayed_grant_element_t* dge; + pqueue_t* _q = (pqueue_t*)q; + if (!q || q->size == 1) + return NULL; + for (int i = 1; i <= q->size; i++) { + dge = (pqueue_delayed_grant_element_t*)q->d[i]; + if (dge->fed_id == fed_id) { + return dge; + } + } + return NULL; +} + +// Utility functions to simplify the call of pqueue_tag routines. +// These functions mainly do the casting. +// FIXME: Should we remove the queue parameter from the functions? 
+ +/** + * @brief Creates a priority queue of delayed grants that is sorted by tags. + * + * @param nbr_delayed_grants The size. + * @return The dynamically allocated queue or NULL. + */ +pqueue_delayed_grants_t* pqueue_delayed_grants_init(uint16_t nbr_delayed_grants) { + return (pqueue_delayed_grants_t*)pqueue_tag_init((size_t)nbr_delayed_grants); +} + +/** + * @brief Return the size of the queue. + * + * @param q The queue. + * @return The size. + */ +size_t pqueue_delayed_grants_size(pqueue_delayed_grants_t* q) { return pqueue_tag_size((pqueue_tag_t*)q); } + +/** + * @brief Insert an\ delayed grant element into the queue. + * + * @param q The queue. + * @param e The delayed grant element to insert. + * @return 0 on success + */ +int pqueue_delayed_grants_insert(pqueue_delayed_grants_t* q, pqueue_delayed_grant_element_t* d) { + return pqueue_tag_insert((pqueue_tag_t*)q, (void*)d); +} + +/** + * @brief Pop the least-tag element from the queue. + * + * @param q The queue. + * @return NULL on error, otherwise the entry + */ +pqueue_delayed_grant_element_t* pqueue_delayed_grants_pop(pqueue_delayed_grants_t* q) { + return (pqueue_delayed_grant_element_t*)pqueue_tag_pop((pqueue_tag_t*)q); +} + +/** + * @brief Return highest-ranking element without removing it. + * + * @param q The queue. + * @return NULL on if the queue is empty, otherwise the delayed grant element. + */ +pqueue_delayed_grant_element_t* pqueue_delayed_grants_peek(pqueue_delayed_grants_t* q) { + return (pqueue_delayed_grant_element_t*)pqueue_tag_peek((pqueue_tag_t*)q); +} + +/** + * @brief Free all memory used by the queue including elements that are marked dynamic. + * + * @param q The queue. + */ +void pqueue_delayed_grants_free(pqueue_delayed_grants_t* q) { pqueue_tag_free((pqueue_tag_t*)q); } + +/** + * @brief Remove an item from the delayed grants queue. + * + * @param q The queue. + * @param e The entry to remove. 
+ */ +void pqueue_delayed_grants_remove(pqueue_delayed_grants_t* q, pqueue_delayed_grant_element_t* e) { + pqueue_tag_remove((pqueue_tag_t*)q, (void*)e); +} + // Function that does not exist in pqueue_tag.c /** * @brief Return the first item with the specified federate id or NULL if there is none. @@ -194,21 +398,15 @@ static void notify_grant_delayed(federate_info_t* fed, tag_t tag, bool is_provis LF_MUTEX_UNLOCK(&rti_mutex); } -/** - * Find the number of non connected upstream transients - * @param fed The federate - * @return the number of non connected upstream transients - */ -static int get_num_absent_upstream_transients(federate_info_t* fed) { - int num_absent_upstream_transients = 0; - for (int j = 0; j < fed->enclave.num_upstream; j++) { - federate_info_t* upstream = GET_FED_INFO(fed->enclave.upstream[j]); - // Do Ignore this enclave if it no longer connected. - if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { - num_absent_upstream_transients++; - } +void notify_grant_canceled(federate_info_t* fed) { + LF_MUTEX_LOCK(&rti_mutex); + pqueue_delayed_grant_element_t* dge = + pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, fed->enclave.id); + if (dge != NULL) { + pqueue_delayed_grants_remove(rti_remote->delayed_grants, dge); + lf_cond_broadcast(&updated_delayed_grants); } - return num_absent_upstream_transients; + LF_MUTEX_UNLOCK(&rti_mutex); } /** @@ -261,27 +459,27 @@ static void notify_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) } /** - * @brief Thread that sleeps for a period of time, and then wakes up to check if - * a tag advance grant needs to be sent. That is, if the pending tag have not - * been reset to NEVER_TAG, the tag advance grant will be immediate. + * Notify a tag advance grant (TAG) message to the specified federate after + * the physical time reaches the tag. A thread is created to this end. + * + * If a provisionl tag advance grant is pending, cancel it. 
If there is another + * pending tag advance grant, do not proceed with the thread creation. * - * @param federate the fedarate whose tag advance grant needs to be delayed. + * @param fed The federate. + * @param tag The tag to grant. */ -void* pending_grant_thread(void* federate) { - federate_info_t* fed = (federate_info_t*)federate; - - interval_t sleep_interval = fed->pending_grant.time - lf_time_physical(); - if (sleep_interval > 0) { - lf_sleep(sleep_interval); - } - +static void notify_tag_advance_grant_delayed(federate_info_t* fed, tag_t tag) { + // Check wether there is already a pending grant + // And check the pending provisional grant as well lf_mutex_lock(&rti_mutex); - - // If the pending grant becomes NEVER_TAG, then this means that it should - // not be sent - if (lf_tag_compare(fed->pending_grant, NEVER_TAG) != 0) { - notify_tag_advance_grant_immediate(&(fed->enclave), fed->pending_grant); - fed->pending_grant = NEVER_TAG; + if (lf_tag_compare(fed->pending_grant, NEVER_TAG) == 0) { + // If a tag is issued, then stop any possible provisional tag grant + fed->pending_grant = tag; + fed->pending_provisional_grant = NEVER_TAG; + lf_thread_create(&(fed->pending_grant_thread_id), pending_grant_thread, fed); + } else { + // If there is already a pending tag grant, then let it be sent first + // FIXME: Is this correct? } lf_mutex_unlock(&rti_mutex); } @@ -2187,6 +2385,91 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { } } } + return NULL; + } + + void* lf_delayed_grants_thread(void* nothing) { + initialize_lf_thread_id(); + while (rti_remote->phase == execution_phase) { + if (pqueue_delayed_grants_size(rti_remote->delayed_grants) != 0) { + pqueue_delayed_grant_element_t* next; + // Do not pop, but rather read + next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); + instant_t next_time = next->base.tag.time; + // Wait for expiration, or a signal to stop or terminate. 
+ if (lf_clock_cond_timedwait(&updated_delayed_grants, next_time) == LF_TIMEOUT) { + lf_print("RTI: lf_delayed_grants_thread() is sending grant to %d at " PRINTF_TIME ".", next->fed_id, + next_time - start_time); + // Time reached to send the grant. Do it for delayed grants with + // the same tag + LF_MUTEX_LOCK(&rti_mutex); + do { + next = pqueue_delayed_grants_pop(rti_remote->delayed_grants); + federate_info_t* fed = GET_FED_INFO(next->fed_id); + if (next->is_provisional) { + notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } else { + notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } + } while ((next = pqueue_delayed_grants_find_with_tag(rti_remote->delayed_grants, next->base.tag)) != NULL); + LF_MUTEX_UNLOCK(&rti_mutex); + } else { + // Waiting was interrupted, because of an update in the queue, or + // because this thread needs to terminate + lf_print("RTI: lf_delayed_grants_thread() did not send grant to %d at " PRINTF_TIME + ", but rather terminated!", + next->fed_id, next_time - start_time); + } + } + } + // The federation is at the shutdown phase. All persistent federates exited. + // We can do a sanity check that the delayed_grants queue is empty. + // FIXME: If there are still pending grants, what does that mean? Maybe that the + // federation stopped after a request to stop (not a timeout). Therefore, we need + // cleanup, and free the memory... + // TODO: do it! + } + + void* lf_delayed_grants_thread(void* nothing) { + initialize_lf_thread_id(); + while (rti_remote->phase == execution_phase) { + if (pqueue_delayed_grants_size(rti_remote->delayed_grants) != 0) { + pqueue_delayed_grant_element_t* next; + // Do not pop, but rather read + next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); + instant_t next_time = next->base.tag.time; + // Wait for expiration, or a signal to stop or terminate. 
+ if (lf_clock_cond_timedwait(&updated_delayed_grants, next_time) == LF_TIMEOUT) { + lf_print("RTI: lf_delayed_grants_thread() is sending grant to %d at " PRINTF_TIME ".", next->fed_id, + next_time - start_time); + // Time reached to send the grant. Do it for delayed grants with + // the same tag + LF_MUTEX_LOCK(&rti_mutex); + do { + next = pqueue_delayed_grants_pop(rti_remote->delayed_grants); + federate_info_t* fed = GET_FED_INFO(next->fed_id); + if (next->is_provisional) { + notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } else { + notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } + } while ((next = pqueue_delayed_grants_find_with_tag(rti_remote->delayed_grants, next->base.tag)) != NULL); + LF_MUTEX_UNLOCK(&rti_mutex); + } else { + // Waiting was interrupted, because of an update in the queue, or + // because this thread needs to terminate + lf_print("RTI: lf_delayed_grants_thread() did not send grant to %d at " PRINTF_TIME + ", but rather terminated!", + next->fed_id, next_time - start_time); + } + } + } + // The federation is at the shutdown phase. All persistent federates exited. + // We can do a sanity check that the delayed_grants queue is empty. + // FIXME: If there are still pending grants, what does that mean? Maybe that the + // federation stopped after a request to stop (not a timeout). Therefore, we need + // cleanup, and free the memory... + // TODO: do it! } void* respond_to_erroneous_connections(void* nothing) { diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index 62e75235b..da0dc1832 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -408,6 +408,11 @@ void* lf_connect_to_persistent_transient_federates_thread(int socket_descriptor) */ void* lf_connect_to_transient_federates_thread(void* nothing); +/** + * Thread that manages the delayed grants using a priprity queue. 
+ */ +void* lf_delayed_grants_thread(void* nothing); + /** * Thread to respond to new connections, which could be federates of other * federations who are attempting to join the wrong federation. From cd6807f5e662d45200fe23d41a2603114d25747f Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 18 Mar 2024 23:52:48 +0100 Subject: [PATCH 094/148] Fix implicit declaration error by moving lf_stop(), lf_get_federates_bin_directory(), and lf_get_federaion_id() declarations to util.h --- include/core/federated/federate.h | 42 ------------------------------- include/core/utils/util.h | 8 +----- 2 files changed, 1 insertion(+), 49 deletions(-) diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index d77a6731a..26550d60c 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -530,46 +530,4 @@ void lf_synchronize_with_other_federates(); */ bool lf_update_max_level(tag_t tag, bool is_provisional); -/** - * @brief Stop the execution of a federate. - * Every enclave within the federate will stop at one microstep later than its - * current tag. Unlike lf_request_stop(), this process does not require any - * involvement from the RTI, nor does it necessitate any consensus. - * - * This function is particularly useful for testing transient federates. - */ -void lf_stop(); - -#ifdef FEDERATED_DECENTRALIZED -/** - * @brief Return the physical time that we should wait until before advancing to the specified tag. - * - * This function adds the STA offset (STP_offset parameter) to the time of the specified tag unless - * the tag is the starting tag (it is always safe to advance to the starting tag). It also avoids - * adding the STA offset if all network input ports are known at least up to one microstep earlier - * than the specified tag. - * - * This function assumes that the caller holds the environment mutex. - * @param time The specified time. 
- */ -instant_t lf_wait_until_time(tag_t tag); -#endif // FEDERATED_DECENTRALIZED - -/** - * @brief Returns the federation id. - * - * This function is useful for creating federates on runtime. - */ -char* lf_get_federation_id(); - -/** - * @brief Returns the effective start time of the federate. The start_time of persistent - * federates is equal to their effective_start_time. Transient federates, however, - * have their effective_start_time higher or equal to their start_time. - */ -instant_t lf_get_effective_start_time(); - -/** @brief Returns the start time of the federate. */ -instant_t lf_get_start_time(); - #endif // FEDERATE_H diff --git a/include/core/utils/util.h b/include/core/utils/util.h index 25c6a8d72..39f910b4d 100644 --- a/include/core/utils/util.h +++ b/include/core/utils/util.h @@ -204,17 +204,11 @@ void lf_vprint_error_and_exit(const char* format, va_list args) ATTRIBUTE_FORMAT */ void lf_stop(); -/** - * @brief Return the directory containing the executables of the individual - * federates. - */ -char* lf_get_federates_bin_directory(); - /** * @brief Returns the federation id. * * This function is useful for creating federates on runtime. */ -const char* lf_get_federation_id(); +char* lf_get_federation_id(); #endif /* UTIL_H */ From f512a97cbeaab1ef32746bb5310f14f7f98e26f8 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Fri, 12 Apr 2024 17:52:31 +0100 Subject: [PATCH 095/148] Fix merge --- util/tracing/trace_util.c | 1 + 1 file changed, 1 insertion(+) diff --git a/util/tracing/trace_util.c b/util/tracing/trace_util.c index 400c4c9ca..ed32c5baa 100644 --- a/util/tracing/trace_util.c +++ b/util/tracing/trace_util.c @@ -61,6 +61,7 @@ typedef struct open_file_t { open_file_t* next; } open_file_t; open_file_t* _open_files = NULL; + /** * Function to be invoked upon exiting. */ From f042571df42864b2a06be2431e29b11fc65438fb Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Mon, 15 Jul 2024 10:58:00 -0400 Subject: [PATCH 096/148] Support zero-delay cycles --- core/federated/RTI/rti_remote.c | 162 ++++++++++++-------- core/federated/federate.c | 113 ++++++++++---- include/core/federated/network/net_common.h | 27 +++- 3 files changed, 202 insertions(+), 100 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index c70dad7db..5265a5eef 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -368,6 +368,7 @@ pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_dela * * In case there is already a grant for that federate, keep the soonest one. * FIXME: Is that correct? + * FIXME: Why not just add it to the queue? * * @param fed The federate. * @param tag The tag to grant. @@ -394,6 +395,7 @@ static void notify_grant_delayed(federate_info_t* fed, tag_t tag, bool is_provis } else { // FIXME: Decide what to do in this case... // TODO: do it! + // FIXME: Add to the queue? } LF_MUTEX_UNLOCK(&rti_mutex); } @@ -418,7 +420,7 @@ static int get_num_absent_upstream_transients(federate_info_t* fed) { int num_absent_upstream_transients = 0; for (int j = 0; j < fed->enclave.num_upstream; j++) { federate_info_t* upstream = GET_FED_INFO(fed->enclave.upstream[j]); - // Do Ignore this enclave if it no longer connected. + // Ignore this enclave if it no longer connected. if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { num_absent_upstream_transients++; } @@ -426,6 +428,61 @@ static int get_num_absent_upstream_transients(federate_info_t* fed) { return num_absent_upstream_transients; } +/** + * @brief Send MSG_TYPE_UPSTREAM_CONNECTED to the specified federate. + * + * This function assumes that the mutex lock is already held. + * @param destination The destination federate. + * @param disconnected The connected federate. 
+ */ +static void send_upstream_connected_locked(federate_info_t* destination, federate_info_t* connected) { + if (!connected->is_transient) { + // No need to send connected message for persistent federates. + return; + } + unsigned char buffer[MSG_TYPE_UPSTREAM_CONNECTED_LENGTH]; + buffer[0] = MSG_TYPE_UPSTREAM_CONNECTED; + encode_uint16(connected->enclave.id, &buffer[1]); + if (write_to_socket_close_on_error(&destination->socket, MSG_TYPE_UPSTREAM_CONNECTED_LENGTH, buffer)) { + lf_print_warning("RTI: Failed to send upstream connected message to federate %d.", connected->enclave.id); + } +} + +/** + * @brief Send MSG_TYPE_UPSTREAM_DISCONNECTED to the specified federate. + * + * This function assumes that the mutex lock is already held. + * @param destination The destination federate. + * @param disconnected The disconnected federate. + */ +static void send_upstream_disconnected_locked(federate_info_t* destination, federate_info_t* disconnected) { + unsigned char buffer[MSG_TYPE_UPSTREAM_DISCONNECTED_LENGTH]; + buffer[0] = MSG_TYPE_UPSTREAM_DISCONNECTED; + encode_uint16(disconnected->enclave.id, &buffer[1]); + if (write_to_socket_close_on_error(&destination->socket, MSG_TYPE_UPSTREAM_DISCONNECTED_LENGTH, buffer)) { + lf_print_warning("RTI: Failed to send upstream disconnected message to federate %d.", disconnected->enclave.id); + } +} + +/** + * @brief Mark a federate as disconnected and inform downstream federates. + * @param e The enclave corresponding to the disconnected federate. + */ +static void notify_federate_disconnected(scheduling_node_t* e) { + e->state = NOT_CONNECTED; + // Notify downstream federates. Need to hold the mutex lock to do this. + LF_MUTEX_LOCK(&rti_mutex); + for (int j = 0; j < e->num_downstream; j++) { + federate_info_t* downstream = GET_FED_INFO(e->downstream[j]); + // Ignore this enclave if it no longer connected. + if (downstream->enclave.state != NOT_CONNECTED) { + // Notify the downstream enclave. 
+ send_upstream_disconnected_locked(downstream, GET_FED_INFO(e->id)); + } + } + LF_MUTEX_UNLOCK(&rti_mutex); +} + /** * Notify a tag advance grant (TAG) message to the specified federate immediately. * @@ -450,7 +507,7 @@ static void notify_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) // to fail. Consider a failure here a soft failure and update the federate's status. if (write_to_socket(((federate_info_t*)e)->socket, message_length, buffer)) { lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); - e->state = NOT_CONNECTED; + notify_federate_disconnected(e); } else { e->last_granted = tag; LF_PRINT_LOG("RTI sent to federate %d the tag advance grant (TAG) " PRINTF_TAG ".", e->id, tag.time - start_time, @@ -538,7 +595,7 @@ void notify_provisional_tag_advance_grant_immediate(scheduling_node_t* e, tag_t // to fail. Consider a failure here a soft failure and update the federate's status. if (write_to_socket(((federate_info_t*)e)->socket, message_length, buffer)) { lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); - e->state = NOT_CONNECTED; + notify_federate_disconnected(e); } else { e->last_provisionally_granted = tag; LF_PRINT_LOG("RTI sent to federate %d the Provisional Tag Advance Grant (PTAG) " PRINTF_TAG ".", e->id, @@ -1129,14 +1186,17 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { } /** - * Send the start time to the federate my_fed. + * Send to the start time to the federate my_fed. * This function assumes the caller does not hold the mutex. * * If it is the startup phase, the start_time will be the maximum received timestamps * plus an offset. The federate will then receive identical federation_start_time * and federate_start_tag.time (the federate_start_tag.microstep will be 0). * If, however, the startup phase is passed, the federate will receive different - * values than sateted above. + * values than stated above. 
+ * + * This will also notify federates downstream of my_fed that this federate is now + * connected. This is important when there are zero-delay cycles. * * @param my_fed the federate to send the start time to. * @param federation_start_time the federation start_time @@ -1166,6 +1226,12 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { my_fed->enclave.state = GRANTED; lf_cond_broadcast(&sent_start_time); LF_PRINT_LOG("RTI sent start time " PRINTF_TIME " to federate %d.", start_time, my_fed->enclave.id); + + // Notify downstream federates of this now connected transient. + for (int i = 0; i < my_fed->enclave.num_upstream; i++) { + send_upstream_connected_locked(GET_FED_INFO(my_fed->enclave.upstream[i]), my_fed); + } + LF_MUTEX_UNLOCK(&rti_mutex); } @@ -1471,6 +1537,7 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { _lf_federate_reports_error = true; lf_print_error("RTI: Federate %d reports an error and has exited.", my_fed->enclave.id); + notify_federate_disconnected(&my_fed->enclave); my_fed->enclave.state = NOT_CONNECTED; // Indicate that there will no further events from this federate. @@ -1589,8 +1656,8 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { case MSG_TYPE_NEXT_EVENT_TAG: handle_next_event_tag(my_fed); break; - case MSG_TYPE_LATEST_TAG_CONFIRMED: - handle_latest_tag_confirmed(my_fed); + case MSG_TYPE_LATEST_TAG_COMPLETE: + handle_latest_tag_complete(my_fed); break; case MSG_TYPE_STOP_REQUEST: handle_stop_request_message(my_fed); // FIXME: Reviewed until here. @@ -2388,72 +2455,40 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { return NULL; } + /** + * This thread is responsible for managing the priority queue of delayed grants to be issued. + * It waits until the current time matches the highest priority tag time in the queue. + * If reached, it notifies the grant immediately. 
If, however, the current time has not yet + * reached the highest priority tag and the queue has been updated (either by inserting or + * canceling an entry), the thread stops waiting and restarts the process again. + */ void* lf_delayed_grants_thread(void* nothing) { initialize_lf_thread_id(); - while (rti_remote->phase == execution_phase) { - if (pqueue_delayed_grants_size(rti_remote->delayed_grants) != 0) { - pqueue_delayed_grant_element_t* next; - // Do not pop, but rather read - next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); - instant_t next_time = next->base.tag.time; - // Wait for expiration, or a signal to stop or terminate. - if (lf_clock_cond_timedwait(&updated_delayed_grants, next_time) == LF_TIMEOUT) { - lf_print("RTI: lf_delayed_grants_thread() is sending grant to %d at " PRINTF_TIME ".", next->fed_id, - next_time - start_time); - // Time reached to send the grant. Do it for delayed grants with - // the same tag - LF_MUTEX_LOCK(&rti_mutex); - do { - next = pqueue_delayed_grants_pop(rti_remote->delayed_grants); - federate_info_t* fed = GET_FED_INFO(next->fed_id); - if (next->is_provisional) { - notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); - } else { - notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); - } - } while ((next = pqueue_delayed_grants_find_with_tag(rti_remote->delayed_grants, next->base.tag)) != NULL); - LF_MUTEX_UNLOCK(&rti_mutex); - } else { - // Waiting was interrupted, because of an update in the queue, or - // because this thread needs to terminate - lf_print("RTI: lf_delayed_grants_thread() did not send grant to %d at " PRINTF_TIME - ", but rather terminated!", - next->fed_id, next_time - start_time); - } - } - } - // The federation is at the shutdown phase. All persistent federates exited. - // We can do a sanity check that the delayed_grants queue is empty. - // FIXME: If there are still pending grants, what does that mean? 
Maybe that the - // federation stopped after a request to stop (not a timeout). Therefore, we need - // cleanup, and free the memory... - // TODO: do it! - } - void* lf_delayed_grants_thread(void* nothing) { - initialize_lf_thread_id(); - while (rti_remote->phase == execution_phase) { + // Wait for the first condition signal + lf_cond_wait(&updated_delayed_grants); + + while (true) { + if (rti_remote->all_federates_exited) { + break; + } if (pqueue_delayed_grants_size(rti_remote->delayed_grants) != 0) { pqueue_delayed_grant_element_t* next; + // Do not pop, but rather read next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); instant_t next_time = next->base.tag.time; // Wait for expiration, or a signal to stop or terminate. - if (lf_clock_cond_timedwait(&updated_delayed_grants, next_time) == LF_TIMEOUT) { - lf_print("RTI: lf_delayed_grants_thread() is sending grant to %d at " PRINTF_TIME ".", next->fed_id, - next_time - start_time); - // Time reached to send the grant. Do it for delayed grants with - // the same tag + if (lf_clock_cond_timedwait(&updated_delayed_grants, next_time)) { + // Time reached to send the grant. 
Do it for delayed grants with the same tag LF_MUTEX_LOCK(&rti_mutex); - do { - next = pqueue_delayed_grants_pop(rti_remote->delayed_grants); - federate_info_t* fed = GET_FED_INFO(next->fed_id); - if (next->is_provisional) { - notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); - } else { - notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); - } - } while ((next = pqueue_delayed_grants_find_with_tag(rti_remote->delayed_grants, next->base.tag)) != NULL); + next = pqueue_delayed_grants_pop(rti_remote->delayed_grants); + federate_info_t* fed = GET_FED_INFO(next->fed_id); + if (next->is_provisional) { + notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } else { + notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } LF_MUTEX_UNLOCK(&rti_mutex); } else { // Waiting was interrupted, because of an update in the queue, or @@ -2470,6 +2505,7 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { // federation stopped after a request to stop (not a timeout). Therefore, we need // cleanup, and free the memory... // TODO: do it! 
+ return NULL; } void* respond_to_erroneous_connections(void* nothing) { diff --git a/core/federated/federate.c b/core/federated/federate.c index aced99ca1..4e029fa10 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -169,6 +169,8 @@ extern interval_t _lf_action_delay_table[]; extern size_t _lf_action_table_size; extern lf_action_base_t* _lf_zero_delay_cycle_action_table[]; extern size_t _lf_zero_delay_cycle_action_table_size; +extern uint16_t _lf_zero_delay_cycle_upstream_ids[]; +extern bool _lf_zero_delay_cycle_upstream_disconnected[]; extern reaction_t* network_input_reactions[]; extern size_t num_network_input_reactions; extern reaction_t* port_absent_reaction[]; @@ -194,7 +196,7 @@ static lf_action_base_t* action_for_port(int port_id) { /** * Update the last known status tag of all network input ports - * to the value of `tag`, unless that the provided `tag` is less + * to the value of `tag`, unless the provided `tag` is less * than the last_known_status_tag of the port. This is called when * a TAG signal is received from the RTI in centralized coordination. * If any update occurs, then this broadcasts on `lf_port_status_changed`. @@ -260,7 +262,7 @@ static void update_last_known_status_on_input_ports(tag_t tag, environment_t* en * * @param env The top-level environment, whose mutex is assumed to be held. * @param tag The tag on which the latest status of the specified network input port is known. - * @param portID The port ID. + * @param port_id The port ID. */ static void update_last_known_status_on_input_port(environment_t* env, tag_t tag, int port_id) { if (lf_tag_compare(tag, env->current_tag) < 0) @@ -294,41 +296,33 @@ static void update_last_known_status_on_input_port(environment_t* env, tag_t tag } /** - * @brief Mark all the input ports connected to the given federate as known to be absent until FOREVER. + * Set the status of network port with id portID. 
* - * This does nothing if the federate is not using decentralized coordination. - * This function acquires the mutex on the top-level environment. - * @param fed_id The ID of the federate. + * @param env The top-level environment, whose mutex is assumed to be held. + * @param action The action associated with the network input port. + * @param tag The tag of the PTAG. */ -static void mark_inputs_known_absent(int fed_id) { -#ifdef FEDERATED_DECENTRALIZED - // Note that when transient federates are supported, this will need to be updated because the - // federate could rejoin. - environment_t* env; - _lf_get_environments(&env); - LF_MUTEX_LOCK(&env->mutex); - - for (size_t i = 0; i < _lf_action_table_size; i++) { - lf_action_base_t* action = _lf_action_table[i]; - if (action->source_id == fed_id) { - update_last_known_status_on_input_port(env, FOREVER_TAG, i); - } +static void update_last_known_status_on_action(environment_t* env, lf_action_base_t* action, tag_t tag) { + if (lf_tag_compare(tag, env->current_tag) < 0) + tag = env->current_tag; + trigger_t* input_port_trigger = action->trigger; + if (lf_tag_compare(tag, input_port_trigger->last_known_status_tag) > 0) { + LF_PRINT_LOG("Updating the last known status tag of port for upstream absent transient federate from " PRINTF_TAG + " to " PRINTF_TAG ".", + input_port_trigger->last_known_status_tag.time - lf_time_start(), + input_port_trigger->last_known_status_tag.microstep, tag.time - lf_time_start(), tag.microstep); + input_port_trigger->last_known_status_tag = tag; } - LF_MUTEX_UNLOCK(&env->mutex); -#else - // Do nothing, except suppress unused parameter error. - (void)fed_id; -#endif // FEDERATED_DECENTRALIZED } /** - * Set the status of network port with id portID. + * Set the status of network port with id port_id. 
* - * @param portID The network port ID + * @param port_id The network port ID * @param status The network port status (port_status_t) */ -static void set_network_port_status(int portID, port_status_t status) { - lf_action_base_t* network_input_port_action = action_for_port(portID); +static void set_network_port_status(int port_id, port_status_t status) { + lf_action_base_t* network_input_port_action = action_for_port(port_id); network_input_port_action->trigger->status = status; } @@ -720,7 +714,7 @@ static int handle_port_absent_message(int* socket, int fed_id) { tracepoint_federate_from_federate(receive_PORT_ABS, _lf_my_fed_id, fed_id, &intended_tag); } LF_PRINT_LOG("Handling port absent for tag " PRINTF_TAG " for port %hu of fed %d.", - intended_tag.time - lf_time_start(), intended_tag.microstep, port_id, fed_id); + intended_tag.time - lf_time_start(), intended_tag.microstep, port_id, _lf_my_fed_id); // Environment is always the one corresponding to the top-level scheduling enclave. environment_t* env; @@ -991,7 +985,7 @@ static instant_t get_start_time_from_rti(instant_t my_physical_time) { * a notification of this update, which may unblock whichever worker * thread is trying to advance time. * - * @note This function is very similar to handle_provisinal_tag_advance_grant() except that + * @note This function is very similar to handle_provisional_tag_advance_grant() except that * it sets last_TAG_was_provisional to false. */ static void handle_tag_advance_grant(void) { @@ -1233,7 +1227,8 @@ static void* update_ports_from_staa_offsets(void* args) { * * @note This function is similar to handle_tag_advance_grant() except that * it sets last_TAG_was_provisional to true and also it does not update the - * last known tag for input ports. + * last known tag for input ports unless there is an upstream federate that is + * disconnected. 
*/ static void handle_provisional_tag_advance_grant() { // Environment is always the one corresponding to the top-level scheduling enclave. @@ -1270,6 +1265,12 @@ static void handle_provisional_tag_advance_grant() { env->current_tag.time - start_time, env->current_tag.microstep, _fed.last_TAG.time - start_time, _fed.last_TAG.microstep); + for (int i = 0; i < _lf_zero_delay_cycle_action_table_size; i++) { + if (_lf_zero_delay_cycle_upstream_disconnected[i] == true) { + update_last_known_status_on_action(env, _lf_zero_delay_cycle_action_table[i], PTAG); + } + } + // Even if we don't modify the event queue, we need to broadcast a change // because we do not need to continue to wait for a TAG. lf_cond_broadcast(&env->event_q_changed); @@ -1506,6 +1507,44 @@ static void send_failed_signal() { */ static void handle_rti_failed_message(void) { exit(1); } +/** + * @brief Handle message from the RTI that an upstream federate has connected. + * + */ +static void handle_upstream_connected_message(void) { + size_t bytes_to_read = sizeof(uint16_t); + unsigned char buffer[bytes_to_read]; + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, + "Failed to read upstream connected message from RTI."); + uint16_t connected = extract_uint16(buffer); + lf_print("********* FIXME: Upstream %d connected *********\n", connected); + // Mark the upstream as connected. + for (int i = 0; i < _lf_zero_delay_cycle_action_table_size; i++) { + if (_lf_zero_delay_cycle_upstream_ids[i] == connected) { + _lf_zero_delay_cycle_upstream_disconnected[i] = false; + } + } +} + +/** + * @brief Handle message from the RTI that an upstream federate has disconnected. 
+ * + */ +static void handle_upstream_disconnected_message(void) { + size_t bytes_to_read = sizeof(uint16_t); + unsigned char buffer[bytes_to_read]; + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, + "Failed to read upstream disconnected message from RTI."); + uint16_t disconnected = extract_uint16(buffer); + lf_print("********* FIXME: Upstream %d disconnected *********\n", disconnected); + // Mark the upstream as disconnected. + for (int i = 0; i < _lf_zero_delay_cycle_action_table_size; i++) { + if (_lf_zero_delay_cycle_upstream_ids[i] == disconnected) { + _lf_zero_delay_cycle_upstream_disconnected[i] = true; + } + } +} + /** * Thread that listens for TCP inputs from the RTI. * When messages arrive, this calls the appropriate handler. @@ -1582,6 +1621,12 @@ static void* listen_to_rti_TCP(void* args) { case MSG_TYPE_FAILED: handle_rti_failed_message(); break; + case MSG_TYPE_UPSTREAM_CONNECTED: + handle_upstream_connected_message(); + break; + case MSG_TYPE_UPSTREAM_DISCONNECTED: + handle_upstream_disconnected_message(); + break; case MSG_TYPE_CLOCK_SYNC_T1: case MSG_TYPE_CLOCK_SYNC_T4: lf_print_error("Federate %d received unexpected clock sync message from RTI on TCP socket.", _lf_my_fed_id); @@ -1941,8 +1986,12 @@ void lf_connect_to_rti(const char* hostname, int port) { } else if (response == MSG_TYPE_RESIGN) { lf_print_warning("RTI resigned. Will try again"); continue; + } else if (response == MSG_TYPE_UPSTREAM_CONNECTED) { + handle_upstream_connected_message(); + } else if (response == MSG_TYPE_UPSTREAM_DISCONNECTED) { + handle_upstream_disconnected_message(); } else { - lf_print_warning("RTI gave unexpect response %u. Will try again", response); + lf_print_warning("RTI on port %d gave unexpected response %u. 
Will try again", uport, response); continue; } } diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index 320ad039b..d745378e7 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -618,6 +618,23 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define MSG_TYPE_FAILED 25 +///////////////////////////////////////////// +//// Transient federate support + +/** + * A message the informs a downstream federate that a federate upstream of it + * is connected. The next 2 bytes are the federate ID of the upstream federate. + */ +#define MSG_TYPE_UPSTREAM_CONNECTED 26 +#define MSG_TYPE_UPSTREAM_CONNECTED_LENGTH (1 + sizeof(uint16_t)) + +/** + * A message the informs a downstream federate that a federate upstream of it + * is no longer connected. The next 2 bytes are the federate ID of the upstream federate. + */ +#define MSG_TYPE_UPSTREAM_DISCONNECTED 27 +#define MSG_TYPE_UPSTREAM_DISCONNECTED_LENGTH (1 + sizeof(uint16_t)) + /** * As an answer to MSG_TYPE_TIMESTAMP, the RTI broadcasts to all persistent * federates, or sends to newly joining transient federate, a message of @@ -625,17 +642,17 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * together with the effective starting logical tag. The latter is useful for * transient federates. */ -#define MSG_TYPE_TIMESTAMP_START 50 +#define MSG_TYPE_TIMESTAMP_START 28 #define MSG_TYPE_TIMESTAMP_START_LENGTH (1 + sizeof(instant_t) + sizeof(instant_t) + sizeof(microstep_t)) /** - * Byte sent by the RTI ordering the federate to stop. Upon receiving the meaasage, - * the federate will call lf_stop(), which will make him resign at its current_tag + * Byte sent by the RTI ordering the federate to stop. Upon receiving the message, + * the federate will call lf_stop(), which will make it resign at its current_tag * plus 1 microstep. 
- * The next 8 bytes will be the time at which the federates will stop. * + * The next 8 bytes will be the time at which the federates will stop. * The next 4 bytes will be the microstep at which the federates will stop.. */ -#define MSG_TYPE_STOP 30 +#define MSG_TYPE_STOP 29 #define MSG_TYPE_STOP_LENGTH 1 ///////////////////////////////////////////// From cf003cbf90aaf98c41195cda7d6e867d15cb4c68 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 15 Jul 2024 17:01:48 +0100 Subject: [PATCH 097/148] Fix the type of _lf_zero_delay_cycle_action_table_size iterator to be size_t + automatic formatter --- core/federated/federate.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 4e029fa10..68733679d 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -296,7 +296,15 @@ static void update_last_known_status_on_input_port(environment_t* env, tag_t tag } /** - * Set the status of network port with id portID. + * @brief Update the last known status tag of a network input action. + * + * This function is similar to update_last_known_status_on_input_port, but + * it is called when a PTAG is granted and an upstream transient federate is not + * connected. It updates the last known status tag of the network input action + * so that it will not wait for a message or absent message from the upstream federate. + * + * This function assumes the caller holds the mutex on the top-level environment, + * and, if the tag actually increases, it broadcasts on `lf_port_status_changed`. * * @param env The top-level environment, whose mutex is assumed to be held. * @param action The action associated with the network input port. 
@@ -1265,7 +1273,7 @@ static void handle_provisional_tag_advance_grant() { env->current_tag.time - start_time, env->current_tag.microstep, _fed.last_TAG.time - start_time, _fed.last_TAG.microstep); - for (int i = 0; i < _lf_zero_delay_cycle_action_table_size; i++) { + for (size_t i = 0; i < _lf_zero_delay_cycle_action_table_size; i++) { if (_lf_zero_delay_cycle_upstream_disconnected[i] == true) { update_last_known_status_on_action(env, _lf_zero_delay_cycle_action_table[i], PTAG); } @@ -1519,7 +1527,7 @@ static void handle_upstream_connected_message(void) { uint16_t connected = extract_uint16(buffer); lf_print("********* FIXME: Upstream %d connected *********\n", connected); // Mark the upstream as connected. - for (int i = 0; i < _lf_zero_delay_cycle_action_table_size; i++) { + for (size_t i = 0; i < _lf_zero_delay_cycle_action_table_size; i++) { if (_lf_zero_delay_cycle_upstream_ids[i] == connected) { _lf_zero_delay_cycle_upstream_disconnected[i] = false; } @@ -1538,7 +1546,7 @@ static void handle_upstream_disconnected_message(void) { uint16_t disconnected = extract_uint16(buffer); lf_print("********* FIXME: Upstream %d disconnected *********\n", disconnected); // Mark the upstream as disconnected. - for (int i = 0; i < _lf_zero_delay_cycle_action_table_size; i++) { + for (size_t i = 0; i < _lf_zero_delay_cycle_action_table_size; i++) { if (_lf_zero_delay_cycle_upstream_ids[i] == disconnected) { _lf_zero_delay_cycle_upstream_disconnected[i] = true; } From 1fde08f2c7c33a558e6579b4b842de67af2b4693 Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Mon, 15 Jul 2024 16:30:48 -0400 Subject: [PATCH 098/148] Removed unnecessary parens --- core/federated/federate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 68733679d..07b1aacd5 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2714,7 +2714,7 @@ char* lf_get_federates_bin_directory() { bin_directory_defined = true; #endif if (bin_directory_defined) { - return (LF_FEDERATES_BIN_DIRECTORY); + return LF_FEDERATES_BIN_DIRECTORY; } return NULL; } From 6aeb52a01058b3d19468ea964dfc9ee6b4e709db Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 17 Jul 2024 06:58:17 +0100 Subject: [PATCH 099/148] Run clang formatter --- core/federated/RTI/rti_remote.c | 1 + 1 file changed, 1 insertion(+) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 5265a5eef..d24645949 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -431,6 +431,7 @@ static int get_num_absent_upstream_transients(federate_info_t* fed) { /** * @brief Send MSG_TYPE_UPSTREAM_CONNECTED to the specified federate. * + * * This function assumes that the mutex lock is already held. * @param destination The destination federate. * @param disconnected The connected federate. From 4ed301ba83998d65ab90038157469dbe0e4e142f Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Wed, 17 Jul 2024 09:03:12 -0400 Subject: [PATCH 100/148] Fixed thread interactions --- core/federated/RTI/rti_remote.c | 54 +++++++++++++++------------------ core/federated/federate.c | 27 +++++++++++------ 2 files changed, 41 insertions(+), 40 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index d24645949..e3bc4dac6 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -41,13 +41,13 @@ extern instant_t start_time; static rti_remote_t* rti_remote; // Referance to the federate instance to support hot swap -federate_info_t* hot_swap_federate; +static federate_info_t* hot_swap_federate; // Indicates if a hot swap process is in progress -bool hot_swap_in_progress = false; +static bool hot_swap_in_progress = false; // Indicates that the old federate has stopped. -bool hot_swap_old_resigned = false; +static bool hot_swap_old_resigned = false; bool _lf_federate_reports_error = false; @@ -361,24 +361,15 @@ pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_dela } /** - * @brief Insert the delayed grant into the delayed_grants queue. - * - * The insertion will cause the broadcast to cause the delayed_grants_thread to - * account for the update. - * - * In case there is already a grant for that federate, keep the soonest one. - * FIXME: Is that correct? - * FIXME: Why not just add it to the queue? + * @brief Insert the delayed grant into the delayed_grants queue and notify. * + * This function assumes the caller holds the rti_mutex. * @param fed The federate. * @param tag The tag to grant. * @param is_provisional State whther the grant is provisional. */ static void notify_grant_delayed(federate_info_t* fed, tag_t tag, bool is_provisional) { - // Check wether there is already a pending grant, - // and check the pending provisional grant as well - // Iterate over the - LF_MUTEX_LOCK(&rti_mutex); + // Check wether there is already a pending grant. 
pqueue_delayed_grant_element_t* dge = pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, fed->enclave.id); if (dge == NULL) { @@ -393,22 +384,24 @@ static void notify_grant_delayed(federate_info_t* fed, tag_t tag, bool is_provis dge->base.tag.microstep, dge->fed_id); lf_cond_signal(&updated_delayed_grants); } else { - // FIXME: Decide what to do in this case... - // TODO: do it! - // FIXME: Add to the queue? - } - LF_MUTEX_UNLOCK(&rti_mutex); -} - -void notify_grant_canceled(federate_info_t* fed) { - LF_MUTEX_LOCK(&rti_mutex); - pqueue_delayed_grant_element_t* dge = - pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, fed->enclave.id); - if (dge != NULL) { - pqueue_delayed_grants_remove(rti_remote->delayed_grants, dge); - lf_cond_broadcast(&updated_delayed_grants); + // Note that there should never be more than one pending grant for a federate. + int compare = lf_tag_compare(dge->base.tag, tag); + if (compare > 0) { + // Update the pre-existing grant. + dge->base.tag = tag; + dge->is_provisional = is_provisional; + LF_PRINT_LOG("RTI: Updating a delayed grant of " PRINTF_TAG " for federate %d.", tag.time - start_time, + tag.microstep, dge->fed_id); + lf_cond_signal(&updated_delayed_grants); + } else if (compare == 0) { + if (dge->is_provisional != is_provisional) { + // Update the grant to keep the most recent is_provisional status. 
+ dge->is_provisional = is_provisional; + LF_PRINT_LOG("RTI: Changing status of a delayed grant of " PRINTF_TAG " for federate %d to provisional: %d.", + dge->base.tag.time - start_time, dge->base.tag.microstep, dge->fed_id, is_provisional); + } + } } - LF_MUTEX_UNLOCK(&rti_mutex); } /** @@ -2487,6 +2480,7 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { federate_info_t* fed = GET_FED_INFO(next->fed_id); if (next->is_provisional) { notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + // FIXME: Send port absent notification to all federates downstream of absent federates. } else { notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); } diff --git a/core/federated/federate.c b/core/federated/federate.c index 07b1aacd5..de8ed5fc8 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1274,7 +1274,7 @@ static void handle_provisional_tag_advance_grant() { _fed.last_TAG.microstep); for (size_t i = 0; i < _lf_zero_delay_cycle_action_table_size; i++) { - if (_lf_zero_delay_cycle_upstream_disconnected[i] == true) { + if (_lf_zero_delay_cycle_upstream_disconnected[i]) { update_last_known_status_on_action(env, _lf_zero_delay_cycle_action_table[i], PTAG); } } @@ -1525,7 +1525,7 @@ static void handle_upstream_connected_message(void) { read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, "Failed to read upstream connected message from RTI."); uint16_t connected = extract_uint16(buffer); - lf_print("********* FIXME: Upstream %d connected *********\n", connected); + LF_PRINT_DEBUG("Received notification that upstream federate %d has connected", connected); // Mark the upstream as connected. 
for (size_t i = 0; i < _lf_zero_delay_cycle_action_table_size; i++) { if (_lf_zero_delay_cycle_upstream_ids[i] == connected) { @@ -1544,7 +1544,7 @@ static void handle_upstream_disconnected_message(void) { read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, "Failed to read upstream disconnected message from RTI."); uint16_t disconnected = extract_uint16(buffer); - lf_print("********* FIXME: Upstream %d disconnected *********\n", disconnected); + LF_PRINT_DEBUG("Received notification that upstream federate %d has disconnected", disconnected); // Mark the upstream as disconnected. for (size_t i = 0; i < _lf_zero_delay_cycle_action_table_size; i++) { if (_lf_zero_delay_cycle_upstream_ids[i] == disconnected) { @@ -2666,13 +2666,20 @@ bool lf_update_max_level(tag_t tag, bool is_provisional) { _lf_action_delay_table[i])) <= 0)) { continue; } +#else + // For centralized coordination, if there is an upstream transient federate that is not + // connected, then we don't want to block on its action. + if (_lf_zero_delay_cycle_upstream_disconnected[i]) { + // Mark the action known up to and including the current tag. It is absent. + update_last_known_status_on_action(env, input_port_action, env->current_tag); + } #endif // FEDERATED_DECENTRALIZED - // If the current tag is greater than the last known status tag of the input port, - // and the input port is not physical, then block on that port by ensuring - // the MLAA is no greater than the level of that port. - // For centralized coordination, this is applied only to input ports coming from - // federates that are in a ZDC. For decentralized coordination, this is applied - // to all input ports. + // If the current tag is greater than the last known status tag of the input port, + // and the input port is not physical, then block on that port by ensuring + // the MLAA is no greater than the level of that port. 
+ // For centralized coordination, this is applied only to input ports coming from + // federates that are in a ZDC. For decentralized coordination, this is applied + // to all input ports. if (lf_tag_compare(env->current_tag, input_port_action->trigger->last_known_status_tag) > 0 && !input_port_action->trigger->is_physical) { max_level_allowed_to_advance = @@ -2697,7 +2704,7 @@ void lf_stop() { lf_set_stop_tag(&env[i], new_stop_tag); - lf_print("Setting the stop tag of env %d to " PRINTF_TAG ".", i, env[i].stop_tag.time - start_time, + LF_PRINT_LOG("Setting the stop tag of env %d to " PRINTF_TAG ".", i, env[i].stop_tag.time - start_time, env[i].stop_tag.microstep); if (env[i].barrier.requestors) From 5d2d472b113774da3e5053f9f8bec6c209f6eea3 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Wed, 17 Jul 2024 09:17:02 -0400 Subject: [PATCH 101/148] Format --- core/federated/RTI/rti_remote.c | 7 ++++--- core/federated/federate.c | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index e3bc4dac6..bdaefd05c 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -363,6 +363,7 @@ pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_dela /** * @brief Insert the delayed grant into the delayed_grants queue and notify. * + * * This function assumes the caller holds the rti_mutex. * @param fed The federate. * @param tag The tag to grant. @@ -2394,9 +2395,9 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { // Wait for the old federate to send MSG_TYPE_RESIGN LF_PRINT_LOG("RTI: Waiting for old federate %d to send resign.", fed_id); - // FIXME: Should this have a timeout? - while (!hot_swap_old_resigned) - ; + // FIXME: This is a busy wait! Need instead a lf_cond_wait on a condition variable. + while (!hot_swap_old_resigned) { + } // The latest LTC is the tag at which the old federate resigned. 
This is useful // for computing the effective_start_time of the new joining federate. diff --git a/core/federated/federate.c b/core/federated/federate.c index de8ed5fc8..62e835c78 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2705,7 +2705,7 @@ void lf_stop() { lf_set_stop_tag(&env[i], new_stop_tag); LF_PRINT_LOG("Setting the stop tag of env %d to " PRINTF_TAG ".", i, env[i].stop_tag.time - start_time, - env[i].stop_tag.microstep); + env[i].stop_tag.microstep); if (env[i].barrier.requestors) _lf_decrement_tag_barrier_locked(&env[i]); From 3554de72fd53ebeb0652aa6cdcd969087a4e8287 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Thu, 18 Jul 2024 14:09:25 +0100 Subject: [PATCH 102/148] Fix mutex access in lf_delayed_grants() --- core/federated/RTI/rti_remote.c | 63 +++++++++++++++------------------ 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index bdaefd05c..a1089a483 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -2459,48 +2459,43 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { */ void* lf_delayed_grants_thread(void* nothing) { initialize_lf_thread_id(); - - // Wait for the first condition signal - lf_cond_wait(&updated_delayed_grants); - - while (true) { - if (rti_remote->all_federates_exited) { - break; - } - if (pqueue_delayed_grants_size(rti_remote->delayed_grants) != 0) { - pqueue_delayed_grant_element_t* next; - - // Do not pop, but rather read - next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); + // Hold the mutex only when accessing rti_remote->delayed_grants pqueue + while (!rti_remote->all_federates_exited) { + if (pqueue_delayed_grants_size(rti_remote->delayed_grants) > 0) { + // Do not pop, but rather peek. 
+ LF_MUTEX_LOCK(&rti_mutex); + pqueue_delayed_grant_element_t* next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); instant_t next_time = next->base.tag.time; + LF_MUTEX_UNLOCK(&rti_mutex); // Wait for expiration, or a signal to stop or terminate. if (lf_clock_cond_timedwait(&updated_delayed_grants, next_time)) { - // Time reached to send the grant. Do it for delayed grants with the same tag + // Time reached to send the grant. + // However, the grant may have been canceled while we were waiting. LF_MUTEX_LOCK(&rti_mutex); - next = pqueue_delayed_grants_pop(rti_remote->delayed_grants); - federate_info_t* fed = GET_FED_INFO(next->fed_id); - if (next->is_provisional) { - notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); - // FIXME: Send port absent notification to all federates downstream of absent federates. - } else { - notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + pqueue_delayed_grant_element_t* new_next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); + if (next == new_next) { + pqueue_delayed_grants_pop(rti_remote->delayed_grants); + federate_info_t* fed = GET_FED_INFO(next->fed_id); + if (next->is_provisional) { + notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + // FIXME: Send port absent notification to all federates downstream of absent federates. + } else { + notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } + free(next); } - LF_MUTEX_UNLOCK(&rti_mutex); - } else { - // Waiting was interrupted, because of an update in the queue, or - // because this thread needs to terminate - lf_print("RTI: lf_delayed_grants_thread() did not send grant to %d at " PRINTF_TIME - ", but rather terminated!", - next->fed_id, next_time - start_time); } + LF_MUTEX_UNLOCK(&rti_mutex); + } else if (pqueue_delayed_grants_size(rti_remote->delayed_grants) == 0) { + // Wait for something to appear on the queue. 
+ lf_cond_wait(&updated_delayed_grants); } } - // The federation is at the shutdown phase. All persistent federates exited. - // We can do a sanity check that the delayed_grants queue is empty. - // FIXME: If there are still pending grants, what does that mean? Maybe that the - // federation stopped after a request to stop (not a timeout). Therefore, we need - // cleanup, and free the memory... - // TODO: do it! + // Free any delayed grants that are still on the queue. + while (pqueue_delayed_grants_size(rti_remote->delayed_grants) > 0) { + pqueue_delayed_grant_element_t* next = pqueue_delayed_grants_pop(rti_remote->delayed_grants); + free(next); + } return NULL; } From 00f8a302bcb446216b87c834c0e93c41b5880c43 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Thu, 18 Jul 2024 16:09:09 -0400 Subject: [PATCH 103/148] Made functions static and use free function for queue --- core/federated/RTI/rti_remote.c | 18 +++++++++--------- core/federated/RTI/rti_remote.h | 5 ----- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index a1089a483..364640ced 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -74,7 +74,7 @@ extern int lf_critical_section_exit(environment_t* env) { return lf_mutex_unlock * @param nbr_delayed_grants The size. * @return The dynamically allocated queue or NULL. */ -pqueue_delayed_grants_t* pqueue_delayed_grants_init(uint16_t nbr_delayed_grants) { +static pqueue_delayed_grants_t* pqueue_delayed_grants_init(uint16_t nbr_delayed_grants) { return (pqueue_delayed_grants_t*)pqueue_tag_init((size_t)nbr_delayed_grants); } @@ -84,7 +84,7 @@ pqueue_delayed_grants_t* pqueue_delayed_grants_init(uint16_t nbr_delayed_grants) * @param q The queue. * @return The size. 
*/ -size_t pqueue_delayed_grants_size(pqueue_delayed_grants_t* q) { return pqueue_tag_size((pqueue_tag_t*)q); } +static size_t pqueue_delayed_grants_size(pqueue_delayed_grants_t* q) { return pqueue_tag_size((pqueue_tag_t*)q); } /** * @brief Insert an\ delayed grant element into the queue. @@ -93,7 +93,7 @@ size_t pqueue_delayed_grants_size(pqueue_delayed_grants_t* q) { return pqueue_ta * @param e The delayed grant element to insert. * @return 0 on success */ -int pqueue_delayed_grants_insert(pqueue_delayed_grants_t* q, pqueue_delayed_grant_element_t* d) { +static int pqueue_delayed_grants_insert(pqueue_delayed_grants_t* q, pqueue_delayed_grant_element_t* d) { return pqueue_tag_insert((pqueue_tag_t*)q, (void*)d); } @@ -103,7 +103,7 @@ int pqueue_delayed_grants_insert(pqueue_delayed_grants_t* q, pqueue_delayed_gran * @param q The queue. * @return NULL on error, otherwise the entry */ -pqueue_delayed_grant_element_t* pqueue_delayed_grants_pop(pqueue_delayed_grants_t* q) { +static pqueue_delayed_grant_element_t* pqueue_delayed_grants_pop(pqueue_delayed_grants_t* q) { return (pqueue_delayed_grant_element_t*)pqueue_tag_pop((pqueue_tag_t*)q); } @@ -113,7 +113,7 @@ pqueue_delayed_grant_element_t* pqueue_delayed_grants_pop(pqueue_delayed_grants_ * @param q The queue. * @return NULL on if the queue is empty, otherwise the delayed grant element. */ -pqueue_delayed_grant_element_t* pqueue_delayed_grants_peek(pqueue_delayed_grants_t* q) { +static pqueue_delayed_grant_element_t* pqueue_delayed_grants_peek(pqueue_delayed_grants_t* q) { return (pqueue_delayed_grant_element_t*)pqueue_tag_peek((pqueue_tag_t*)q); } @@ -122,7 +122,7 @@ pqueue_delayed_grant_element_t* pqueue_delayed_grants_peek(pqueue_delayed_grants * * @param q The queue. 
*/ -void pqueue_delayed_grants_free(pqueue_delayed_grants_t* q) { pqueue_tag_free((pqueue_tag_t*)q); } +static void pqueue_delayed_grants_free(pqueue_delayed_grants_t* q) { pqueue_tag_free((pqueue_tag_t*)q); } /** * @brief Remove an item from the delayed grants queue. @@ -130,7 +130,7 @@ void pqueue_delayed_grants_free(pqueue_delayed_grants_t* q) { pqueue_tag_free((p * @param q The queue. * @param e The entry to remove. */ -void pqueue_delayed_grants_remove(pqueue_delayed_grants_t* q, pqueue_delayed_grant_element_t* e) { +static void pqueue_delayed_grants_remove(pqueue_delayed_grants_t* q, pqueue_delayed_grant_element_t* e) { pqueue_tag_remove((pqueue_tag_t*)q, (void*)e); } @@ -151,8 +151,8 @@ pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_with_tag(pqueue_delay * @param fed_id The federate id. * @return An entry with the specified federate if or NULL if there isn't one. */ - -pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_delayed_grants_t* q, uint16_t fed_id) { +static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_delayed_grants_t* q, + uint16_t fed_id) { pqueue_delayed_grant_element_t* dge; pqueue_t* _q = (pqueue_t*)q; if (!q || q->size == 1) diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index da0dc1832..62e75235b 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -408,11 +408,6 @@ void* lf_connect_to_persistent_transient_federates_thread(int socket_descriptor) */ void* lf_connect_to_transient_federates_thread(void* nothing); -/** - * Thread that manages the delayed grants using a priprity queue. - */ -void* lf_delayed_grants_thread(void* nothing); - /** * Thread to respond to new connections, which could be federates of other * federations who are attempting to join the wrong federation. From db1c7a2f5967f91eec160f4d965e3d908bca54ad Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Fri, 19 Jul 2024 09:42:08 -0400 Subject: [PATCH 104/148] Added max function --- core/utils/pqueue_tag.c | 11 +++++++++++ include/core/utils/pqueue_tag.h | 7 +++++++ test/general/utils/pqueue_test.c | 2 ++ 3 files changed, 20 insertions(+) diff --git a/core/utils/pqueue_tag.c b/core/utils/pqueue_tag.c index c1abe35ba..9a7491dd7 100644 --- a/core/utils/pqueue_tag.c +++ b/core/utils/pqueue_tag.c @@ -159,3 +159,14 @@ void pqueue_tag_remove_up_to(pqueue_tag_t* q, tag_t t) { } void pqueue_tag_dump(pqueue_tag_t* q) { pqueue_dump((pqueue_t*)q, pqueue_tag_print_element); } + +tag_t pqueue_tag_max_tag(pqueue_tag_t* q) { + tag_t result = NEVER_TAG; + for (int i = 1; i < q->size; i++) { + pqueue_tag_element_t* element = (pqueue_tag_element_t*)(q->d[i]); + if (lf_tag_compare(element->tag, result) > 0) { + result = element->tag; + } + } + return result; +} diff --git a/include/core/utils/pqueue_tag.h b/include/core/utils/pqueue_tag.h index e06e074be..dec61a5cd 100644 --- a/include/core/utils/pqueue_tag.h +++ b/include/core/utils/pqueue_tag.h @@ -216,4 +216,11 @@ void pqueue_tag_remove_up_to(pqueue_tag_t* q, tag_t t); */ void pqueue_tag_dump(pqueue_tag_t* q); +/** + * @brief Return the maximum tag in the queue or NEVER_TAG if the queue is empty. + * + * @param q The queue. + */ +tag_t pqueue_tag_max_tag(pqueue_tag_t* q); + #endif // PQUEUE_TAG_H diff --git a/test/general/utils/pqueue_test.c b/test/general/utils/pqueue_test.c index 665c4e13f..18b3009a8 100644 --- a/test/general/utils/pqueue_test.c +++ b/test/general/utils/pqueue_test.c @@ -23,6 +23,8 @@ static void insert_on_queue(pqueue_tag_t* q) { assert(!pqueue_tag_insert_tag(q, t2)); assert(!pqueue_tag_insert_tag(q, t3)); + assert(lf_tag_compare(pqueue_tag_max_tag(q), t1) == 0); + assert(!pqueue_tag_insert_if_no_match(q, t4)); assert(pqueue_tag_insert_if_no_match(q, t1)); assert(pqueue_tag_insert_if_no_match(q, t4)); From 76b9e2edcb4dc2a45662422dc0ec1a356c35becf Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Fri, 19 Jul 2024 09:53:30 -0400 Subject: [PATCH 105/148] Revert to holding mutex plus general cleanups --- core/federated/RTI/rti_remote.c | 146 +++++++++++++------- core/federated/RTI/rti_remote.h | 8 +- include/core/federated/network/net_common.h | 31 ++--- 3 files changed, 113 insertions(+), 72 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 364640ced..4d5eb86eb 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -56,9 +56,9 @@ bool _lf_federate_reports_error = false; #define GET_FED_INFO(_idx) (federate_info_t*)rti_remote->base.scheduling_nodes[_idx] lf_mutex_t rti_mutex; -lf_cond_t received_start_times; -lf_cond_t sent_start_time; -lf_cond_t updated_delayed_grants; +static lf_cond_t received_start_times; +static lf_cond_t sent_start_time; +static lf_cond_t updated_delayed_grants; extern int lf_critical_section_enter(environment_t* env) { return lf_mutex_lock(&rti_mutex); } @@ -1245,74 +1245,72 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { LF_MUTEX_LOCK(&rti_mutex); - // Processing the TIMESTAMP depends on whether it is the startup phase (all - // persistent federates joined) or not. - if (rti_remote->phase == - startup_phase) { // This is equivalent to: rti_remote->num_feds_proposed_start < - // (rti_remote->number_of_enclaves - rti_remote->number_of_transient_federates) + // Processing the TIMESTAMP depends on whether it is the startup phase. + if (rti_remote->phase == startup_phase) { + // Not all persistent federates have proposed a start time. if (timestamp > rti_remote->max_start_time) { rti_remote->max_start_time = timestamp; } - // Check that persistent federates did propose a start_time + // Note that if a transient federate's thread gets here during the startup phase, + // then it will be assigned the same global tag as its effective start tag and its + // timestamp will affect that start tag. 
if (!my_fed->is_transient) { rti_remote->num_feds_proposed_start++; } if (rti_remote->num_feds_proposed_start == (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { - // All federates have proposed a start time. + // This federate is the last persistent federate to proposed a start time. lf_cond_broadcast(&received_start_times); rti_remote->phase = execution_phase; } else { - // Some federates have not yet proposed a start time. - // wait for a notification. + // Wait until all persistent federates have proposed a start time. while (rti_remote->num_feds_proposed_start < (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { - // FIXME: Should have a timeout here? lf_cond_wait(&received_start_times); } } + // Add an offset to the maximum tag to get everyone starting together. + start_time = rti_remote->max_start_time + DELAY_START; + my_fed->effective_start_tag = (tag_t){.time = start_time, .microstep = 0u}; LF_MUTEX_UNLOCK(&rti_mutex); - // Send back to the federate the maximum time plus an offset on a TIMESTAMP - // message. - // Add an offset to this start time to get everyone starting together. - start_time = rti_remote->max_start_time + DELAY_START; - my_fed->effective_start_tag = (tag_t){.time = start_time, .microstep = 0u}; + // Notify the federate of its start tag. send_start_tag(my_fed, start_time, my_fed->effective_start_tag); - } else if (rti_remote->phase == shutdown_phase) { - // Do not answer the federate if the federation is in hsutdown phase - // Or maybe send and error message? 
+ } else if (rti_remote->phase == shutdown_phase || !my_fed->is_transient) { LF_MUTEX_UNLOCK(&rti_mutex); - return; - } else { // The federation is the execution phase - // A transient has joined after the startup phase - // At this point, we already hold the mutex - // This is rather a possible extreme corner case, where a transient sends its timestamp, and only - // enters the if section after all persistents have joined. - if (timestamp < start_time) { - timestamp = start_time; - } + // Send reject message if the federation is in shutdown phase or if + // it is in the execution phase but the federate is persistent. + send_reject(&my_fed->socket, JOINING_TOO_LATE); + return; + } else { + // The federation is transient and we are in the execution phase. + // At this point, we already hold the mutex. //// Algorithm for computing the effective_start_time of a joining transient // The effective_start_time will be the max among all the following tags: // 1. At tag: (joining time, 0 microstep) - // 2. The latest completed logical tag + 1 microstep - // 3. The latest granted (P)TAG + 1 microstep, of every downstream federate - // 4. The maximun tag of messages from the upstream federates + 1 microstep + // 2. (start_time, 1 microstep) + // 3. The latest completed logical tag + 1 microstep + // 4. The latest granted (P)TAG + 1 microstep, of every downstream federate + // 5. The maximun tag of messages from the upstream federates + 1 microstep // Condition 1. my_fed->effective_start_tag = (tag_t){.time = timestamp, .microstep = 0u}; // Condition 2. - // FIXME: Not sure if this corner case can happen, but better to be on the safe side. + if (timestamp < start_time) { + my_fed->effective_start_tag = (tag_t){.time = start_time, .microstep = 1u}; + } + + // Condition 3. if (lf_tag_compare(my_fed->enclave.completed, my_fed->effective_start_tag) >= 0) { my_fed->effective_start_tag = my_fed->enclave.completed; my_fed->effective_start_tag.microstep++; } - // Condition 3. 
Iterate over the downstream federates + // Condition 4. Iterate over the downstream federates for (int j = 0; j < my_fed->enclave.num_downstream; j++) { federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); @@ -1329,16 +1327,18 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { } } - // Condition 4. Iterate over the messages from the upstream federates + // Condition 5. + // This one is a bit subtle. Any messages from upstream federates that the RTI has + // not yet seen will be sent to this joining federate after the effective_start_tag + // because the effective_start_tag is sent while still holding the mutex. + + // Iterate over the messages from the upstream federates for (int j = 0; j < my_fed->enclave.num_upstream; j++) { federate_info_t* upstream = GET_FED_INFO(my_fed->enclave.upstream[j]); - // Get the max over the TAG of the upstreams size_t queue_size = pqueue_tag_size(upstream->in_transit_message_tags); if (queue_size != 0) { - pqueue_t* pq = (pqueue_t*)(upstream->in_transit_message_tags); - pqueue_tag_element_t* message_with_max_tag = (pqueue_tag_element_t*)(pq->d[queue_size]); - tag_t max_tag = message_with_max_tag->tag; + tag_t max_tag = pqueue_tag_max_tag(upstream->in_transit_message_tags); if (lf_tag_compare(max_tag, my_fed->effective_start_tag) >= 0) { my_fed->effective_start_tag = max_tag; @@ -1347,8 +1347,8 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { } } - // For every downstream that has a pending grant that is higher then the - // effective_start_time of the federate, cancel it + // For every downstream that has a pending grant that is higher than the + // effective_start_time of the federate, cancel it. 
for (int j = 0; j < my_fed->enclave.num_downstream; j++) { federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); @@ -1358,7 +1358,7 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { } // Check the pending grants, if any, and keep it only if it is - // sonner than the effective start tag + // sooner than the effective start tag. pqueue_delayed_grant_element_t* dge = pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, downstream->enclave.id); if (dge != NULL && lf_tag_compare(dge->base.tag, my_fed->effective_start_tag) > 0) { @@ -1366,13 +1366,14 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { } } - LF_MUTEX_UNLOCK(&rti_mutex); - // Once the effective start time set, sent it to the joining transient, // together with the start time of the federation. - // Send the start time + // Have to send the start tag while still holding the mutex to ensure that no message + // from an upstream federate is forwarded before the start tag. send_start_tag(my_fed, start_time, my_fed->effective_start_tag); + + LF_MUTEX_UNLOCK(&rti_mutex); } } @@ -1702,11 +1703,11 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { return NULL; } - void send_reject(int* socket_id, unsigned char error_code) { + void send_reject(int* socket_id, rejection_code_t error_code) { LF_PRINT_DEBUG("RTI sending MSG_TYPE_REJECT."); unsigned char response[2]; response[0] = MSG_TYPE_REJECT; - response[1] = error_code; + response[1] = (unsigned char)error_code; LF_MUTEX_LOCK(&rti_mutex); // NOTE: Ignore errors on this response. 
if (write_to_socket(*socket_id, 2, response)) { @@ -1718,7 +1719,6 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { *socket_id = -1; LF_MUTEX_UNLOCK(&rti_mutex); } - lf_print("handle_timestamp for transient 1157"); /** * Listen for a MSG_TYPE_FED_IDS message, which includes as a payload @@ -2450,6 +2450,54 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { return NULL; } + /** + * @brief Thread that manages the delayed grants using a priprity queue. + * + * This thread is responsible for managing the priority queue of delayed grants to be issued. + * It waits until the current time matches the highest priority tag time in the queue. + * If reached, it notifies the grant immediately. If, however, the current time has not yet + * reached the highest priority tag and the queue has been updated (either by inserting or + * canceling an entry), the thread stops waiting and restarts the process again. + */ + static void* lf_delayed_grants_thread(void* nothing) { + initialize_lf_thread_id(); + // Hold the mutex when not waiting. + LF_MUTEX_LOCK(&rti_mutex); + while (!rti_remote->all_federates_exited) { + if (pqueue_delayed_grants_size(rti_remote->delayed_grants) > 0) { + // Do not pop, but rather peek. + pqueue_delayed_grant_element_t* next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); + instant_t next_time = next->base.tag.time; + // Wait for expiration, or a signal to stop or terminate. + int ret = lf_clock_cond_timedwait(&updated_delayed_grants, next_time); + if (ret == LF_TIMEOUT) { + // Time reached to send the grant. + // However, the grant may have been canceled while we were waiting. 
+ pqueue_delayed_grant_element_t* new_next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); + if (next == new_next) { + pqueue_delayed_grants_pop(rti_remote->delayed_grants); + federate_info_t* fed = GET_FED_INFO(next->fed_id); + if (next->is_provisional) { + notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } else { + notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } + free(next); + } + } else if (ret != 0) { + // An error occurred. + lf_print_error_and_exit("lf_delayed_grants_thread: lf_clock_cond_timedwait failed with code %d.", ret); + } + } else if (pqueue_delayed_grants_size(rti_remote->delayed_grants) == 0) { + // Wait for something to appear on the queue. + lf_cond_wait(&updated_delayed_grants); + } + } + // Free any delayed grants that are still on the queue. + pqueue_delayed_grants_free(rti_remote->delayed_grants); + LF_MUTEX_UNLOCK(&rti_mutex); + return NULL; + } /** * This thread is responsible for managing the priority queue of delayed grants to be issued. * It waits until the current time matches the highest priority tag time in the queue. diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index 62e75235b..9bf6c5ca8 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -82,7 +82,11 @@ typedef enum clock_sync_stat { clock_sync_off, clock_sync_init, clock_sync_on } /** * The federation life cycle phases. */ -typedef enum federation_life_cycle_phase { startup_phase, execution_phase, shutdown_phase } federation_life_cycle_phase; +typedef enum federation_life_cycle_phase { + startup_phase, // Not all persistent federates have joined. + execution_phase, // All persistent federates have joined. + shutdown_phase // Federation is shutting down. +} federation_life_cycle_phase; /** * @brief The type for an element in a delayed grants priority queue that is sorted by tag. 
@@ -390,7 +394,7 @@ void* federate_info_thread_TCP(void* fed); * @param socket_id Pointer to the socket ID. * @param error_code An error code. */ -void send_reject(int* socket_id, unsigned char error_code); +void send_reject(int* socket_id, rejection_code_t error_code); /** * Wait for one incoming connection request from each (persistent) federate, diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index d745378e7..a58476a10 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -662,26 +662,15 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * These codes are sent in a MSG_TYPE_REJECT message. * They are limited to one byte (uchar). */ - -/** Federation ID does not match. */ -#define FEDERATION_ID_DOES_NOT_MATCH 1 - -/** Federate with the specified ID has already joined. */ -#define FEDERATE_ID_IN_USE 2 - -/** Federate ID out of range. */ -#define FEDERATE_ID_OUT_OF_RANGE 3 - -/** Incoming message is not expected. */ -#define UNEXPECTED_MESSAGE 4 - -/** Connected to the wrong server. */ -#define WRONG_SERVER 5 - -/** HMAC authentication failed. */ -#define HMAC_DOES_NOT_MATCH 6 - -/** RTI not executed using -a or --auth option. */ -#define RTI_NOT_EXECUTED_WITH_AUTH 7 +typedef enum { + FEDERATION_ID_DOES_NOT_MATCH = 1, + FEDERATE_ID_IN_USE = 2, + FEDERATE_ID_OUT_OF_RANGE = 3, + UNEXPECTED_MESSAGE = 4, + WRONG_SERVER = 5, + HMAC_DOES_NOT_MATCH = 6, + RTI_NOT_EXECUTED_WITH_AUTH = 7, + JOINING_TOO_LATE = 8 +} rejection_code_t; #endif /* NET_COMMON_H */ From 5a3cf7c18f4d50ca07f8940803df57c981df4859 Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Fri, 19 Jul 2024 17:47:52 -0400 Subject: [PATCH 106/148] Hold mutex and notify downstream not upstream --- core/federated/RTI/rti_remote.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 4d5eb86eb..29429df8c 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1193,11 +1193,13 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { * This will also notify federates downstream of my_fed that this federate is now * connected. This is important when there are zero-delay cycles. * + * This function assumes the caller holds the mutex. + * * @param my_fed the federate to send the start time to. * @param federation_start_time the federation start_time * @param federate_start_tag the federate effective start tag */ - void send_start_tag(federate_info_t * my_fed, instant_t federation_start_time, tag_t federate_start_tag) { + static void send_start_tag(federate_info_t * my_fed, instant_t federation_start_time, tag_t federate_start_tag) { // Send back to the federate the maximum time plus an offset on a TIMESTAMP_START // message. // In the startup phase, federates will receive identical start_time and @@ -1214,7 +1216,6 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); } - LF_MUTEX_LOCK(&rti_mutex); // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP // message has been sent. That MSG_TYPE_TIMESTAMP message grants time advance to // the federate to the start time. 
From 2117bd89734b3679348a1cc18ee8089abf89dbdf Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 24 Jul 2024 15:44:47 +0100 Subject: [PATCH 107/148] Fix the pqueue tag iterator type --- core/utils/pqueue_tag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/utils/pqueue_tag.c b/core/utils/pqueue_tag.c index 9a7491dd7..57e52ef5c 100644 --- a/core/utils/pqueue_tag.c +++ b/core/utils/pqueue_tag.c @@ -162,7 +162,7 @@ void pqueue_tag_dump(pqueue_tag_t* q) { pqueue_dump((pqueue_t*)q, pqueue_tag_pri tag_t pqueue_tag_max_tag(pqueue_tag_t* q) { tag_t result = NEVER_TAG; - for (int i = 1; i < q->size; i++) { + for (size_t i = 1; i < q->size; i++) { pqueue_tag_element_t* element = (pqueue_tag_element_t*)(q->d[i]); if (lf_tag_compare(element->tag, result) > 0) { result = element->tag; From b7544b3c7084c42dbe4a407647fb2f5c3e742fe6 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 24 Jul 2024 15:50:10 +0100 Subject: [PATCH 108/148] Fix condition 2 for computing the effective start time of a transient + set the start time in the trace file --- core/federated/RTI/rti_remote.c | 4565 ++++++++++++++++--------------- 1 file changed, 2286 insertions(+), 2279 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 29429df8c..9c1d02287 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -153,2661 +153,2668 @@ pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_with_tag(pqueue_delay */ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_delayed_grants_t* q, uint16_t fed_id) { - pqueue_delayed_grant_element_t* dge; - pqueue_t* _q = (pqueue_t*)q; - if (!q || q->size == 1) - return NULL; - for (int i = 1; i <= q->size; i++) { - dge = (pqueue_delayed_grant_element_t*)q->d[i]; - if (dge->fed_id == fed_id) { - return dge; + static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_delayed_grants_t * q, + uint16_t fed_id) { 
+ pqueue_delayed_grant_element_t* dge; + pqueue_t* _q = (pqueue_t*)q; + if (!q || q->size == 1) + return NULL; + for (int i = 1; i <= q->size; i++) { + dge = (pqueue_delayed_grant_element_t*)q->d[i]; + if (dge->fed_id == fed_id) { + return dge; + } } + return NULL; } - return NULL; -} -// Utility functions to simplify the call of pqueue_tag routines. -// These functions mainly do the casting. -// FIXME: Should we remove the queue parameter from the functions? + // Utility functions to simplify the call of pqueue_tag routines. + // These functions mainly do the casting. + // FIXME: Should we remove the queue parameter from the functions? -/** - * @brief Creates a priority queue of delayed grants that is sorted by tags. - * - * @param nbr_delayed_grants The size. - * @return The dynamically allocated queue or NULL. - */ -pqueue_delayed_grants_t* pqueue_delayed_grants_init(uint16_t nbr_delayed_grants) { - return (pqueue_delayed_grants_t*)pqueue_tag_init((size_t)nbr_delayed_grants); -} + /** + * @brief Creates a priority queue of delayed grants that is sorted by tags. + * + * @param nbr_delayed_grants The size. + * @return The dynamically allocated queue or NULL. + */ + pqueue_delayed_grants_t* pqueue_delayed_grants_init(uint16_t nbr_delayed_grants) { + return (pqueue_delayed_grants_t*)pqueue_tag_init((size_t)nbr_delayed_grants); + } -/** - * @brief Return the size of the queue. - * - * @param q The queue. - * @return The size. - */ -size_t pqueue_delayed_grants_size(pqueue_delayed_grants_t* q) { return pqueue_tag_size((pqueue_tag_t*)q); } + /** + * @brief Return the size of the queue. + * + * @param q The queue. + * @return The size. + */ + size_t pqueue_delayed_grants_size(pqueue_delayed_grants_t * q) { return pqueue_tag_size((pqueue_tag_t*)q); } -/** - * @brief Insert an\ delayed grant element into the queue. - * - * @param q The queue. - * @param e The delayed grant element to insert. 
- * @return 0 on success - */ -int pqueue_delayed_grants_insert(pqueue_delayed_grants_t* q, pqueue_delayed_grant_element_t* d) { - return pqueue_tag_insert((pqueue_tag_t*)q, (void*)d); -} + /** + * @brief Insert an\ delayed grant element into the queue. + * + * @param q The queue. + * @param e The delayed grant element to insert. + * @return 0 on success + */ + int pqueue_delayed_grants_insert(pqueue_delayed_grants_t * q, pqueue_delayed_grant_element_t * d) { + return pqueue_tag_insert((pqueue_tag_t*)q, (void*)d); + } -/** - * @brief Pop the least-tag element from the queue. - * - * @param q The queue. - * @return NULL on error, otherwise the entry - */ -pqueue_delayed_grant_element_t* pqueue_delayed_grants_pop(pqueue_delayed_grants_t* q) { - return (pqueue_delayed_grant_element_t*)pqueue_tag_pop((pqueue_tag_t*)q); -} + /** + * @brief Pop the least-tag element from the queue. + * + * @param q The queue. + * @return NULL on error, otherwise the entry + */ + pqueue_delayed_grant_element_t* pqueue_delayed_grants_pop(pqueue_delayed_grants_t * q) { + return (pqueue_delayed_grant_element_t*)pqueue_tag_pop((pqueue_tag_t*)q); + } -/** - * @brief Return highest-ranking element without removing it. - * - * @param q The queue. - * @return NULL on if the queue is empty, otherwise the delayed grant element. - */ -pqueue_delayed_grant_element_t* pqueue_delayed_grants_peek(pqueue_delayed_grants_t* q) { - return (pqueue_delayed_grant_element_t*)pqueue_tag_peek((pqueue_tag_t*)q); -} + /** + * @brief Return highest-ranking element without removing it. + * + * @param q The queue. + * @return NULL on if the queue is empty, otherwise the delayed grant element. + */ + pqueue_delayed_grant_element_t* pqueue_delayed_grants_peek(pqueue_delayed_grants_t * q) { + return (pqueue_delayed_grant_element_t*)pqueue_tag_peek((pqueue_tag_t*)q); + } -/** - * @brief Free all memory used by the queue including elements that are marked dynamic. - * - * @param q The queue. 
- */ -void pqueue_delayed_grants_free(pqueue_delayed_grants_t* q) { pqueue_tag_free((pqueue_tag_t*)q); } + /** + * @brief Free all memory used by the queue including elements that are marked dynamic. + * + * @param q The queue. + */ + void pqueue_delayed_grants_free(pqueue_delayed_grants_t * q) { pqueue_tag_free((pqueue_tag_t*)q); } -/** - * @brief Remove an item from the delayed grants queue. - * - * @param q The queue. - * @param e The entry to remove. - */ -void pqueue_delayed_grants_remove(pqueue_delayed_grants_t* q, pqueue_delayed_grant_element_t* e) { - pqueue_tag_remove((pqueue_tag_t*)q, (void*)e); -} + /** + * @brief Remove an item from the delayed grants queue. + * + * @param q The queue. + * @param e The entry to remove. + */ + void pqueue_delayed_grants_remove(pqueue_delayed_grants_t * q, pqueue_delayed_grant_element_t * e) { + pqueue_tag_remove((pqueue_tag_t*)q, (void*)e); + } -/** - * @brief Return the first item with the specified tag or NULL if there is none. - * @param q The queue. - * @param t The tag. - * @return An entry with the specified tag or NULL if there isn't one. - */ -pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_with_tag(pqueue_delayed_grants_t* q, tag_t t) { - return (pqueue_delayed_grant_element_t*)pqueue_tag_find_with_tag((pqueue_tag_t*)q, t); -} + /** + * @brief Return the first item with the specified tag or NULL if there is none. + * @param q The queue. + * @param t The tag. + * @return An entry with the specified tag or NULL if there isn't one. + */ + pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_with_tag(pqueue_delayed_grants_t * q, tag_t t) { + return (pqueue_delayed_grant_element_t*)pqueue_tag_find_with_tag((pqueue_tag_t*)q, t); + } -// Function that does not in pqueue_tag.c -/** - * @brief Return the first item with the specified federate id or NULL if there is none. - * @param q The queue. - * @param fed_id The federate id. 
- * @return An entry with the specified federate if or NULL if there isn't one. - */ + // Function that does not in pqueue_tag.c + /** + * @brief Return the first item with the specified federate id or NULL if there is none. + * @param q The queue. + * @param fed_id The federate id. + * @return An entry with the specified federate if or NULL if there isn't one. + */ -pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_delayed_grants_t* q, uint16_t fed_id) { - pqueue_delayed_grant_element_t* dge; - pqueue_t* _q = (pqueue_t*)q; - if (!q || q->size == 1) - return NULL; - for (int i = 1; i <= q->size; i++) { - dge = (pqueue_delayed_grant_element_t*)q->d[i]; - if (dge->fed_id == fed_id) { - return dge; + pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_delayed_grants_t * q, uint16_t fed_id) { + pqueue_delayed_grant_element_t* dge; + pqueue_t* _q = (pqueue_t*)q; + if (!q || q->size == 1) + return NULL; + for (int i = 1; i <= q->size; i++) { + dge = (pqueue_delayed_grant_element_t*)q->d[i]; + if (dge->fed_id == fed_id) { + return dge; + } } + return NULL; } - return NULL; -} -// Utility functions to simplify the call of pqueue_tag routines. -// These functions mainly do the casting. -// FIXME: Should we remove the queue parameter from the functions? + // Utility functions to simplify the call of pqueue_tag routines. + // These functions mainly do the casting. + // FIXME: Should we remove the queue parameter from the functions? -/** - * @brief Creates a priority queue of delayed grants that is sorted by tags. - * - * @param nbr_delayed_grants The size. - * @return The dynamically allocated queue or NULL. - */ -pqueue_delayed_grants_t* pqueue_delayed_grants_init(uint16_t nbr_delayed_grants) { - return (pqueue_delayed_grants_t*)pqueue_tag_init((size_t)nbr_delayed_grants); -} + /** + * @brief Creates a priority queue of delayed grants that is sorted by tags. + * + * @param nbr_delayed_grants The size. 
+ * @return The dynamically allocated queue or NULL. + */ + pqueue_delayed_grants_t* pqueue_delayed_grants_init(uint16_t nbr_delayed_grants) { + return (pqueue_delayed_grants_t*)pqueue_tag_init((size_t)nbr_delayed_grants); + } -/** - * @brief Return the size of the queue. - * - * @param q The queue. - * @return The size. - */ -size_t pqueue_delayed_grants_size(pqueue_delayed_grants_t* q) { return pqueue_tag_size((pqueue_tag_t*)q); } + /** + * @brief Return the size of the queue. + * + * @param q The queue. + * @return The size. + */ + size_t pqueue_delayed_grants_size(pqueue_delayed_grants_t * q) { return pqueue_tag_size((pqueue_tag_t*)q); } -/** - * @brief Insert an\ delayed grant element into the queue. - * - * @param q The queue. - * @param e The delayed grant element to insert. - * @return 0 on success - */ -int pqueue_delayed_grants_insert(pqueue_delayed_grants_t* q, pqueue_delayed_grant_element_t* d) { - return pqueue_tag_insert((pqueue_tag_t*)q, (void*)d); -} + /** + * @brief Insert an\ delayed grant element into the queue. + * + * @param q The queue. + * @param e The delayed grant element to insert. + * @return 0 on success + */ + int pqueue_delayed_grants_insert(pqueue_delayed_grants_t * q, pqueue_delayed_grant_element_t * d) { + return pqueue_tag_insert((pqueue_tag_t*)q, (void*)d); + } -/** - * @brief Pop the least-tag element from the queue. - * - * @param q The queue. - * @return NULL on error, otherwise the entry - */ -pqueue_delayed_grant_element_t* pqueue_delayed_grants_pop(pqueue_delayed_grants_t* q) { - return (pqueue_delayed_grant_element_t*)pqueue_tag_pop((pqueue_tag_t*)q); -} + /** + * @brief Pop the least-tag element from the queue. + * + * @param q The queue. 
+ * @return NULL on error, otherwise the entry + */ + pqueue_delayed_grant_element_t* pqueue_delayed_grants_pop(pqueue_delayed_grants_t * q) { + return (pqueue_delayed_grant_element_t*)pqueue_tag_pop((pqueue_tag_t*)q); + } -/** - * @brief Return highest-ranking element without removing it. - * - * @param q The queue. - * @return NULL on if the queue is empty, otherwise the delayed grant element. - */ -pqueue_delayed_grant_element_t* pqueue_delayed_grants_peek(pqueue_delayed_grants_t* q) { - return (pqueue_delayed_grant_element_t*)pqueue_tag_peek((pqueue_tag_t*)q); -} + /** + * @brief Return highest-ranking element without removing it. + * + * @param q The queue. + * @return NULL on if the queue is empty, otherwise the delayed grant element. + */ + pqueue_delayed_grant_element_t* pqueue_delayed_grants_peek(pqueue_delayed_grants_t * q) { + return (pqueue_delayed_grant_element_t*)pqueue_tag_peek((pqueue_tag_t*)q); + } -/** - * @brief Free all memory used by the queue including elements that are marked dynamic. - * - * @param q The queue. - */ -void pqueue_delayed_grants_free(pqueue_delayed_grants_t* q) { pqueue_tag_free((pqueue_tag_t*)q); } + /** + * @brief Free all memory used by the queue including elements that are marked dynamic. + * + * @param q The queue. + */ + void pqueue_delayed_grants_free(pqueue_delayed_grants_t * q) { pqueue_tag_free((pqueue_tag_t*)q); } -/** - * @brief Remove an item from the delayed grants queue. - * - * @param q The queue. - * @param e The entry to remove. - */ -void pqueue_delayed_grants_remove(pqueue_delayed_grants_t* q, pqueue_delayed_grant_element_t* e) { - pqueue_tag_remove((pqueue_tag_t*)q, (void*)e); -} + /** + * @brief Remove an item from the delayed grants queue. + * + * @param q The queue. + * @param e The entry to remove. 
+ */ + void pqueue_delayed_grants_remove(pqueue_delayed_grants_t * q, pqueue_delayed_grant_element_t * e) { + pqueue_tag_remove((pqueue_tag_t*)q, (void*)e); + } -// Function that does not exist in pqueue_tag.c -/** - * @brief Return the first item with the specified federate id or NULL if there is none. - * @param q The queue. - * @param fed_id The federate id. - * @return An entry with the specified federate if or NULL if there isn't one. - */ -pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_delayed_grants_t* q, uint16_t fed_id) { - pqueue_delayed_grant_element_t* dge; - if (!q || q->size == 1) - return NULL; - for (int i = 1; i < q->size; i++) { - dge = (pqueue_delayed_grant_element_t*)q->d[i]; - if (dge) { - if (dge->fed_id == fed_id) { - return dge; + // Function that does not exist in pqueue_tag.c + /** + * @brief Return the first item with the specified federate id or NULL if there is none. + * @param q The queue. + * @param fed_id The federate id. + * @return An entry with the specified federate if or NULL if there isn't one. + */ + pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_delayed_grants_t * q, uint16_t fed_id) { + pqueue_delayed_grant_element_t* dge; + if (!q || q->size == 1) + return NULL; + for (int i = 1; i < q->size; i++) { + dge = (pqueue_delayed_grant_element_t*)q->d[i]; + if (dge) { + if (dge->fed_id == fed_id) { + return dge; + } } } + return NULL; } - return NULL; -} -/** - * @brief Insert the delayed grant into the delayed_grants queue and notify. - * - * - * This function assumes the caller holds the rti_mutex. - * @param fed The federate. - * @param tag The tag to grant. - * @param is_provisional State whther the grant is provisional. - */ -static void notify_grant_delayed(federate_info_t* fed, tag_t tag, bool is_provisional) { - // Check wether there is already a pending grant. 
- pqueue_delayed_grant_element_t* dge = - pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, fed->enclave.id); - if (dge == NULL) { + /** + * @brief Insert the delayed grant into the delayed_grants queue and notify. + * + * + * This function assumes the caller holds the rti_mutex. + * @param fed The federate. + * @param tag The tag to grant. + * @param is_provisional State whther the grant is provisional. + */ + static void notify_grant_delayed(federate_info_t * fed, tag_t tag, bool is_provisional) { + // Check wether there is already a pending grant. pqueue_delayed_grant_element_t* dge = - (pqueue_delayed_grant_element_t*)malloc(sizeof(pqueue_delayed_grant_element_t)); - dge->base.is_dynamic = 1; - dge->base.tag = tag; - dge->fed_id = fed->enclave.id; - dge->is_provisional = is_provisional; - pqueue_delayed_grants_insert(rti_remote->delayed_grants, dge); - LF_PRINT_LOG("RTI: Inserting a delayed grant of " PRINTF_TAG " for federate %d.", dge->base.tag.time - start_time, - dge->base.tag.microstep, dge->fed_id); - lf_cond_signal(&updated_delayed_grants); - } else { - // Note that there should never be more than one pending grant for a federate. - int compare = lf_tag_compare(dge->base.tag, tag); - if (compare > 0) { - // Update the pre-existing grant. 
+ pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, fed->enclave.id); + if (dge == NULL) { + pqueue_delayed_grant_element_t* dge = + (pqueue_delayed_grant_element_t*)malloc(sizeof(pqueue_delayed_grant_element_t)); + dge->base.is_dynamic = 1; dge->base.tag = tag; + dge->fed_id = fed->enclave.id; dge->is_provisional = is_provisional; - LF_PRINT_LOG("RTI: Updating a delayed grant of " PRINTF_TAG " for federate %d.", tag.time - start_time, - tag.microstep, dge->fed_id); + pqueue_delayed_grants_insert(rti_remote->delayed_grants, dge); + LF_PRINT_LOG("RTI: Inserting a delayed grant of " PRINTF_TAG " for federate %d.", dge->base.tag.time - start_time, + dge->base.tag.microstep, dge->fed_id); lf_cond_signal(&updated_delayed_grants); - } else if (compare == 0) { - if (dge->is_provisional != is_provisional) { - // Update the grant to keep the most recent is_provisional status. + } else { + // Note that there should never be more than one pending grant for a federate. + int compare = lf_tag_compare(dge->base.tag, tag); + if (compare > 0) { + // Update the pre-existing grant. + dge->base.tag = tag; dge->is_provisional = is_provisional; - LF_PRINT_LOG("RTI: Changing status of a delayed grant of " PRINTF_TAG " for federate %d to provisional: %d.", - dge->base.tag.time - start_time, dge->base.tag.microstep, dge->fed_id, is_provisional); + LF_PRINT_LOG("RTI: Updating a delayed grant of " PRINTF_TAG " for federate %d.", tag.time - start_time, + tag.microstep, dge->fed_id); + lf_cond_signal(&updated_delayed_grants); + } else if (compare == 0) { + if (dge->is_provisional != is_provisional) { + // Update the grant to keep the most recent is_provisional status. 
+ dge->is_provisional = is_provisional; + LF_PRINT_LOG("RTI: Changing status of a delayed grant of " PRINTF_TAG " for federate %d to provisional: %d.", + dge->base.tag.time - start_time, dge->base.tag.microstep, dge->fed_id, is_provisional); + } } } } -} -/** - * Find the number of non connected upstream transients - * @param fed The federate - * @return the number of non connected upstream transients - */ -static int get_num_absent_upstream_transients(federate_info_t* fed) { - int num_absent_upstream_transients = 0; - for (int j = 0; j < fed->enclave.num_upstream; j++) { - federate_info_t* upstream = GET_FED_INFO(fed->enclave.upstream[j]); - // Ignore this enclave if it no longer connected. - if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { - num_absent_upstream_transients++; + /** + * Find the number of non connected upstream transients + * @param fed The federate + * @return the number of non connected upstream transients + */ + static int get_num_absent_upstream_transients(federate_info_t * fed) { + int num_absent_upstream_transients = 0; + for (int j = 0; j < fed->enclave.num_upstream; j++) { + federate_info_t* upstream = GET_FED_INFO(fed->enclave.upstream[j]); + // Ignore this enclave if it no longer connected. + if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { + num_absent_upstream_transients++; + } } + return num_absent_upstream_transients; } - return num_absent_upstream_transients; -} - -/** - * @brief Send MSG_TYPE_UPSTREAM_CONNECTED to the specified federate. - * - * - * This function assumes that the mutex lock is already held. - * @param destination The destination federate. - * @param disconnected The connected federate. - */ -static void send_upstream_connected_locked(federate_info_t* destination, federate_info_t* connected) { - if (!connected->is_transient) { - // No need to send connected message for persistent federates. 
- return; - } - unsigned char buffer[MSG_TYPE_UPSTREAM_CONNECTED_LENGTH]; - buffer[0] = MSG_TYPE_UPSTREAM_CONNECTED; - encode_uint16(connected->enclave.id, &buffer[1]); - if (write_to_socket_close_on_error(&destination->socket, MSG_TYPE_UPSTREAM_CONNECTED_LENGTH, buffer)) { - lf_print_warning("RTI: Failed to send upstream connected message to federate %d.", connected->enclave.id); - } -} -/** - * @brief Send MSG_TYPE_UPSTREAM_DISCONNECTED to the specified federate. - * - * This function assumes that the mutex lock is already held. - * @param destination The destination federate. - * @param disconnected The disconnected federate. - */ -static void send_upstream_disconnected_locked(federate_info_t* destination, federate_info_t* disconnected) { - unsigned char buffer[MSG_TYPE_UPSTREAM_DISCONNECTED_LENGTH]; - buffer[0] = MSG_TYPE_UPSTREAM_DISCONNECTED; - encode_uint16(disconnected->enclave.id, &buffer[1]); - if (write_to_socket_close_on_error(&destination->socket, MSG_TYPE_UPSTREAM_DISCONNECTED_LENGTH, buffer)) { - lf_print_warning("RTI: Failed to send upstream disconnected message to federate %d.", disconnected->enclave.id); + /** + * @brief Send MSG_TYPE_UPSTREAM_CONNECTED to the specified federate. + * + * + * This function assumes that the mutex lock is already held. + * @param destination The destination federate. + * @param disconnected The connected federate. + */ + static void send_upstream_connected_locked(federate_info_t * destination, federate_info_t * connected) { + if (!connected->is_transient) { + // No need to send connected message for persistent federates. 
+ return; + } + unsigned char buffer[MSG_TYPE_UPSTREAM_CONNECTED_LENGTH]; + buffer[0] = MSG_TYPE_UPSTREAM_CONNECTED; + encode_uint16(connected->enclave.id, &buffer[1]); + if (write_to_socket_close_on_error(&destination->socket, MSG_TYPE_UPSTREAM_CONNECTED_LENGTH, buffer)) { + lf_print_warning("RTI: Failed to send upstream connected message to federate %d.", connected->enclave.id); + } } -} -/** - * @brief Mark a federate as disconnected and inform downstream federates. - * @param e The enclave corresponding to the disconnected federate. - */ -static void notify_federate_disconnected(scheduling_node_t* e) { - e->state = NOT_CONNECTED; - // Notify downstream federates. Need to hold the mutex lock to do this. - LF_MUTEX_LOCK(&rti_mutex); - for (int j = 0; j < e->num_downstream; j++) { - federate_info_t* downstream = GET_FED_INFO(e->downstream[j]); - // Ignore this enclave if it no longer connected. - if (downstream->enclave.state != NOT_CONNECTED) { - // Notify the downstream enclave. - send_upstream_disconnected_locked(downstream, GET_FED_INFO(e->id)); + /** + * @brief Send MSG_TYPE_UPSTREAM_DISCONNECTED to the specified federate. + * + * This function assumes that the mutex lock is already held. + * @param destination The destination federate. + * @param disconnected The disconnected federate. + */ + static void send_upstream_disconnected_locked(federate_info_t * destination, federate_info_t * disconnected) { + unsigned char buffer[MSG_TYPE_UPSTREAM_DISCONNECTED_LENGTH]; + buffer[0] = MSG_TYPE_UPSTREAM_DISCONNECTED; + encode_uint16(disconnected->enclave.id, &buffer[1]); + if (write_to_socket_close_on_error(&destination->socket, MSG_TYPE_UPSTREAM_DISCONNECTED_LENGTH, buffer)) { + lf_print_warning("RTI: Failed to send upstream disconnected message to federate %d.", disconnected->enclave.id); } } - LF_MUTEX_UNLOCK(&rti_mutex); -} -/** - * Notify a tag advance grant (TAG) message to the specified federate immediately. 
- * - * This function will keep a record of this TAG in the enclave's last_granted - * field. - * - * @param e The enclave. - * @param tag The tag to grant. - */ -static void notify_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) { - size_t message_length = 1 + sizeof(int64_t) + sizeof(uint32_t); - unsigned char buffer[message_length]; - buffer[0] = MSG_TYPE_TAG_ADVANCE_GRANT; - encode_int64(tag.time, &(buffer[1])); - encode_int32((int32_t)tag.microstep, &(buffer[1 + sizeof(int64_t)])); - - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_TAG, e->id, &tag); - } - // This function is called in notify_advance_grant_if_safe(), which is a long - // function. During this call, the socket might close, causing the following write_to_socket - // to fail. Consider a failure here a soft failure and update the federate's status. - if (write_to_socket(((federate_info_t*)e)->socket, message_length, buffer)) { - lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); - notify_federate_disconnected(e); - } else { - e->last_granted = tag; - LF_PRINT_LOG("RTI sent to federate %d the tag advance grant (TAG) " PRINTF_TAG ".", e->id, tag.time - start_time, - tag.microstep); + /** + * @brief Mark a federate as disconnected and inform downstream federates. + * @param e The enclave corresponding to the disconnected federate. + */ + static void notify_federate_disconnected(scheduling_node_t * e) { + e->state = NOT_CONNECTED; + // Notify downstream federates. Need to hold the mutex lock to do this. + LF_MUTEX_LOCK(&rti_mutex); + for (int j = 0; j < e->num_downstream; j++) { + federate_info_t* downstream = GET_FED_INFO(e->downstream[j]); + // Ignore this enclave if it no longer connected. + if (downstream->enclave.state != NOT_CONNECTED) { + // Notify the downstream enclave. 
+ send_upstream_disconnected_locked(downstream, GET_FED_INFO(e->id)); + } + } + LF_MUTEX_UNLOCK(&rti_mutex); } -} -/** - * Notify a tag advance grant (TAG) message to the specified federate after - * the physical time reaches the tag. A thread is created to this end. - * - * If a provisionl tag advance grant is pending, cancel it. If there is another - * pending tag advance grant, do not proceed with the thread creation. - * - * @param fed The federate. - * @param tag The tag to grant. - */ -static void notify_tag_advance_grant_delayed(federate_info_t* fed, tag_t tag) { - // Check wether there is already a pending grant - // And check the pending provisional grant as well - lf_mutex_lock(&rti_mutex); - if (lf_tag_compare(fed->pending_grant, NEVER_TAG) == 0) { - // If a tag is issued, then stop any possible provisional tag grant - fed->pending_grant = tag; - fed->pending_provisional_grant = NEVER_TAG; - lf_thread_create(&(fed->pending_grant_thread_id), pending_grant_thread, fed); - } else { - // If there is already a pending tag grant, then let it be sent first - // FIXME: Is this correct? - } - lf_mutex_unlock(&rti_mutex); -} + /** + * Notify a tag advance grant (TAG) message to the specified federate immediately. + * + * This function will keep a record of this TAG in the enclave's last_granted + * field. + * + * @param e The enclave. + * @param tag The tag to grant. 
+ */ + static void notify_tag_advance_grant_immediate(scheduling_node_t * e, tag_t tag) { + size_t message_length = 1 + sizeof(int64_t) + sizeof(uint32_t); + unsigned char buffer[message_length]; + buffer[0] = MSG_TYPE_TAG_ADVANCE_GRANT; + encode_int64(tag.time, &(buffer[1])); + encode_int32((int32_t)tag.microstep, &(buffer[1 + sizeof(int64_t)])); -void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { - if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || - lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { - return; - } - // Need to make sure that the destination federate's thread has already - // sent the starting MSG_TYPE_TIMESTAMP message. - while (e->state == PENDING) { - // Need to wait here. - lf_cond_wait(&sent_start_time); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_TAG, e->id, &tag); + } + // This function is called in notify_advance_grant_if_safe(), which is a long + // function. During this call, the socket might close, causing the following write_to_socket + // to fail. Consider a failure here a soft failure and update the federate's status. + if (write_to_socket(((federate_info_t*)e)->socket, message_length, buffer)) { + lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); + notify_federate_disconnected(e); + } else { + e->last_granted = tag; + LF_PRINT_LOG("RTI sent to federate %d the tag advance grant (TAG) " PRINTF_TAG ".", e->id, tag.time - start_time, + tag.microstep); + } } - // Check if sending the tag advance grant needs to be delayed or not. - // Delay is needed when a federate has at least one absent upstream transient. 
- - // Check if sending the tag advance grant needs to be delayed or not - // Delay is needed when a federate has, at least one, absent upstream transient - federate_info_t* fed = GET_FED_INFO(e->id); - if (!fed->has_upstream_transient_federates) { - notify_tag_advance_grant_immediate(e, tag); - } else { - if (get_num_absent_upstream_transients(fed) > 0) { - notify_grant_delayed(fed, tag, false); + /** + * Notify a tag advance grant (TAG) message to the specified federate after + * the physical time reaches the tag. A thread is created to this end. + * + * If a provisionl tag advance grant is pending, cancel it. If there is another + * pending tag advance grant, do not proceed with the thread creation. + * + * @param fed The federate. + * @param tag The tag to grant. + */ + static void notify_tag_advance_grant_delayed(federate_info_t * fed, tag_t tag) { + // Check wether there is already a pending grant + // And check the pending provisional grant as well + lf_mutex_lock(&rti_mutex); + if (lf_tag_compare(fed->pending_grant, NEVER_TAG) == 0) { + // If a tag is issued, then stop any possible provisional tag grant + fed->pending_grant = tag; + fed->pending_provisional_grant = NEVER_TAG; + lf_thread_create(&(fed->pending_grant_thread_id), pending_grant_thread, fed); } else { - notify_tag_advance_grant_immediate(e, tag); + // If there is already a pending tag grant, then let it be sent first + // FIXME: Is this correct? } + lf_mutex_unlock(&rti_mutex); } -} -/** - * Notify a provisional tag advance grant (PTAG) message to the specified federate - * immediately. - * - * This function will keep a record of this TAG in the enclave's last_provisionally_granted - * field. - * - * @param e The scheduling node. - * @param tag The tag to grant. 
- */ -void notify_provisional_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) { - size_t message_length = 1 + sizeof(int64_t) + sizeof(uint32_t); - unsigned char buffer[message_length]; - buffer[0] = MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT; - encode_int64(tag.time, &(buffer[1])); - encode_int32((int32_t)tag.microstep, &(buffer[1 + sizeof(int64_t)])); - - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_PTAG, e->id, &tag); - } - // This function is called in notify_advance_grant_if_safe(), which is a long - // function. During this call, the socket might close, causing the following write_to_socket - // to fail. Consider a failure here a soft failure and update the federate's status. - if (write_to_socket(((federate_info_t*)e)->socket, message_length, buffer)) { - lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); - notify_federate_disconnected(e); - } else { - e->last_provisionally_granted = tag; - LF_PRINT_LOG("RTI sent to federate %d the Provisional Tag Advance Grant (PTAG) " PRINTF_TAG ".", e->id, - tag.time - start_time, tag.microstep); - - // Send PTAG to all upstream federates, if they have not had - // a later or equal PTAG or TAG sent previously and if their transitive - // NET is greater than or equal to the tag. - // This is needed to stimulate absent messages from upstream and break deadlocks. - // The scenario this deals with is illustrated in `test/C/src/federated/FeedbackDelay2.lf` - // and `test/C/src/federated/FeedbackDelay4.lf`. - // Note that this is transitive. - // NOTE: This is not needed for enclaves because zero-delay loops are prohibited. - // It's only needed for federates, which is why this is implemented here. - for (int j = 0; j < e->num_upstream; j++) { - scheduling_node_t* upstream = rti_remote->base.scheduling_nodes[e->upstream[j]]; - - // Ignore this federate if it has resigned. 
- if (upstream->state == NOT_CONNECTED) - continue; - - tag_t earliest = earliest_future_incoming_message_tag(upstream); - tag_t strict_earliest = eimt_strict(upstream); // Non-ZDC version. - - // If these tags are equal, then a TAG or PTAG should have already been granted, - // in which case, another will not be sent. But it may not have been already granted. - if (lf_tag_compare(earliest, tag) > 0) { - notify_tag_advance_grant(upstream, tag); - } else if (lf_tag_compare(earliest, tag) == 0 && lf_tag_compare(strict_earliest, tag) > 0) { - notify_provisional_tag_advance_grant(upstream, tag); - } + void notify_tag_advance_grant(scheduling_node_t * e, tag_t tag) { + if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || + lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { + return; + } + // Need to make sure that the destination federate's thread has already + // sent the starting MSG_TYPE_TIMESTAMP message. + while (e->state == PENDING) { + // Need to wait here. + lf_cond_wait(&sent_start_time); } - } -} -void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { - if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || - lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { - return; - } - // Need to make sure that the destination federate's thread has already - // sent the starting MSG_TYPE_TIMESTAMP message. - while (e->state == PENDING) { - // Need to wait here. - lf_cond_wait(&sent_start_time); - } + // Check if sending the tag advance grant needs to be delayed or not. + // Delay is needed when a federate has at least one absent upstream transient. 
- // Check if sending the tag advance grant needs to be delayed or not - // Delay is needed when a federate has, at least one, absent upstream transient - federate_info_t* fed = GET_FED_INFO(e->id); - if (!fed->has_upstream_transient_federates) { - notify_provisional_tag_advance_grant_immediate(e, tag); - } else { - if (get_num_absent_upstream_transients(fed) > 0) { - notify_provisional_tag_advance_grant_delayed(fed, tag); - } else { - notify_provisional_tag_advance_grant_immediate(e, tag); - } + // Check if sending the tag advance grant needs to be delayed or not + // Delay is needed when a federate has, at least one, absent upstream transient federate_info_t* fed = GET_FED_INFO(e->id); if (!fed->has_upstream_transient_federates) { - notify_provisional_tag_advance_grant_immediate(e, tag); + notify_tag_advance_grant_immediate(e, tag); } else { if (get_num_absent_upstream_transients(fed) > 0) { - notify_grant_delayed(fed, tag, true); + notify_grant_delayed(fed, tag, false); } else { - notify_provisional_tag_advance_grant_immediate(e, tag); + notify_tag_advance_grant_immediate(e, tag); } } } - void update_federate_next_event_tag_locked(uint16_t federate_id, tag_t next_event_tag) { - federate_info_t* fed = GET_FED_INFO(federate_id); - tag_t min_in_transit_tag = pqueue_tag_peek_tag(fed->in_transit_message_tags); - if (lf_tag_compare(min_in_transit_tag, next_event_tag) < 0) { - next_event_tag = min_in_transit_tag; - } - update_scheduling_node_next_event_tag_locked(&(fed->enclave), next_event_tag); - } + /** + * Notify a provisional tag advance grant (PTAG) message to the specified federate + * immediately. + * + * This function will keep a record of this TAG in the enclave's last_provisionally_granted + * field. + * + * @param e The scheduling node. + * @param tag The tag to grant. 
+ */ + void notify_provisional_tag_advance_grant_immediate(scheduling_node_t * e, tag_t tag) { + size_t message_length = 1 + sizeof(int64_t) + sizeof(uint32_t); + unsigned char buffer[message_length]; + buffer[0] = MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT; + encode_int64(tag.time, &(buffer[1])); + encode_int32((int32_t)tag.microstep, &(buffer[1 + sizeof(int64_t)])); - void handle_port_absent_message(federate_info_t * sending_federate, unsigned char* buffer) { - size_t message_size = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int64_t) + sizeof(uint32_t); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_PTAG, e->id, &tag); + } + // This function is called in notify_advance_grant_if_safe(), which is a long + // function. During this call, the socket might close, causing the following write_to_socket + // to fail. Consider a failure here a soft failure and update the federate's status. + if (write_to_socket(((federate_info_t*)e)->socket, message_length, buffer)) { + lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); + notify_federate_disconnected(e); + } else { + e->last_provisionally_granted = tag; + LF_PRINT_LOG("RTI sent to federate %d the Provisional Tag Advance Grant (PTAG) " PRINTF_TAG ".", e->id, + tag.time - start_time, tag.microstep); + + // Send PTAG to all upstream federates, if they have not had + // a later or equal PTAG or TAG sent previously and if their transitive + // NET is greater than or equal to the tag. + // This is needed to stimulate absent messages from upstream and break deadlocks. + // The scenario this deals with is illustrated in `test/C/src/federated/FeedbackDelay2.lf` + // and `test/C/src/federated/FeedbackDelay4.lf`. + // Note that this is transitive. + // NOTE: This is not needed for enclaves because zero-delay loops are prohibited. + // It's only needed for federates, which is why this is implemented here. 
+ for (int j = 0; j < e->num_upstream; j++) { + scheduling_node_t* upstream = rti_remote->base.scheduling_nodes[e->upstream[j]]; - read_from_socket_fail_on_error(&sending_federate->socket, message_size, &(buffer[1]), NULL, - " RTI failed to read port absent message from federate %u.", - sending_federate->enclave.id); + // Ignore this federate if it has resigned. + if (upstream->state == NOT_CONNECTED) + continue; - uint16_t reactor_port_id = extract_uint16(&(buffer[1])); - uint16_t federate_id = extract_uint16(&(buffer[1 + sizeof(uint16_t)])); - tag_t tag = extract_tag(&(buffer[1 + 2 * sizeof(uint16_t)])); + tag_t earliest = earliest_future_incoming_message_tag(upstream); + tag_t strict_earliest = eimt_strict(upstream); // Non-ZDC version. - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_PORT_ABS, sending_federate->enclave.id, &tag); + // If these tags are equal, then a TAG or PTAG should have already been granted, + // in which case, another will not be sent. But it may not have been already granted. + if (lf_tag_compare(earliest, tag) > 0) { + notify_tag_advance_grant(upstream, tag); + } else if (lf_tag_compare(earliest, tag) == 0 && lf_tag_compare(strict_earliest, tag) > 0) { + notify_provisional_tag_advance_grant(upstream, tag); + } + } } + } - // Need to acquire the mutex lock to ensure that the thread handling - // messages coming from the socket connected to the destination does not - // issue a TAG before this message has been forwarded. - LF_MUTEX_LOCK(&rti_mutex); - - // If the destination federate is no longer connected, issue a warning - // and return. - federate_info_t* fed = GET_FED_INFO(federate_id); - if (fed->enclave.state == NOT_CONNECTED) { - LF_MUTEX_UNLOCK(&rti_mutex); - lf_print_warning("RTI: Destination federate %d is no longer connected. 
Dropping message.", federate_id); - LF_PRINT_LOG("Fed status: next_event " PRINTF_TAG ", " - "completed " PRINTF_TAG ", " - "last_granted " PRINTF_TAG ", " - "last_provisionally_granted " PRINTF_TAG ".", - fed->enclave.next_event.time - start_time, fed->enclave.next_event.microstep, - fed->enclave.completed.time - start_time, fed->enclave.completed.microstep, - fed->enclave.last_granted.time - start_time, fed->enclave.last_granted.microstep, - fed->enclave.last_provisionally_granted.time - start_time, - fed->enclave.last_provisionally_granted.microstep); + void notify_provisional_tag_advance_grant(scheduling_node_t * e, tag_t tag) { + if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || + lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { return; } - - LF_PRINT_LOG("RTI forwarding port absent message for port %u to federate %u.", reactor_port_id, federate_id); - // Need to make sure that the destination federate's thread has already // sent the starting MSG_TYPE_TIMESTAMP message. - while (fed->enclave.state == PENDING) { + while (e->state == PENDING) { // Need to wait here. 
lf_cond_wait(&sent_start_time); } - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_PORT_ABS, federate_id, &tag); + // Check if sending the tag advance grant needs to be delayed or not + // Delay is needed when a federate has, at least one, absent upstream transient + federate_info_t* fed = GET_FED_INFO(e->id); + if (!fed->has_upstream_transient_federates) { + notify_provisional_tag_advance_grant_immediate(e, tag); + } else { + if (get_num_absent_upstream_transients(fed) > 0) { + notify_provisional_tag_advance_grant_delayed(fed, tag); + } else { + notify_provisional_tag_advance_grant_immediate(e, tag); + } + federate_info_t* fed = GET_FED_INFO(e->id); + if (!fed->has_upstream_transient_federates) { + notify_provisional_tag_advance_grant_immediate(e, tag); + } else { + if (get_num_absent_upstream_transients(fed) > 0) { + notify_grant_delayed(fed, tag, true); + } else { + notify_provisional_tag_advance_grant_immediate(e, tag); + } + } } - // Forward the message. - write_to_socket_fail_on_error(&fed->socket, message_size + 1, buffer, &rti_mutex, - "RTI failed to forward message to federate %d.", federate_id); - - LF_MUTEX_UNLOCK(&rti_mutex); - } + void update_federate_next_event_tag_locked(uint16_t federate_id, tag_t next_event_tag) { + federate_info_t* fed = GET_FED_INFO(federate_id); + tag_t min_in_transit_tag = pqueue_tag_peek_tag(fed->in_transit_message_tags); + if (lf_tag_compare(min_in_transit_tag, next_event_tag) < 0) { + next_event_tag = min_in_transit_tag; + } + update_scheduling_node_next_event_tag_locked(&(fed->enclave), next_event_tag); + } - void handle_timed_message(federate_info_t * sending_federate, unsigned char* buffer) { - size_t header_size = - 1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint32_t) + sizeof(int64_t) + sizeof(uint32_t); - // Read the header, minus the first byte which has already been read. 
- read_from_socket_fail_on_error(&sending_federate->socket, header_size - 1, &(buffer[1]), NULL, - "RTI failed to read the timed message header from remote federate."); - // Extract the header information. of the sender - uint16_t reactor_port_id; - uint16_t federate_id; - size_t length; - tag_t intended_tag; - // Extract information from the header. - extract_timed_header(&(buffer[1]), &reactor_port_id, &federate_id, &length, &intended_tag); - - size_t total_bytes_to_read = length + header_size; - size_t bytes_to_read = length; - - if (FED_COM_BUFFER_SIZE < header_size + 1) { - lf_print_error_and_exit("Buffer size (%d) is not large enough to " - "read the header plus one byte.", - FED_COM_BUFFER_SIZE); - } - - // Cut up the payload in chunks. - if (bytes_to_read > FED_COM_BUFFER_SIZE - header_size) { - bytes_to_read = FED_COM_BUFFER_SIZE - header_size; - } - - LF_PRINT_LOG("RTI received message from federate %d for federate %u port %u with intended tag " PRINTF_TAG - ". Forwarding.", - sending_federate->enclave.id, federate_id, reactor_port_id, intended_tag.time - lf_time_start(), - intended_tag.microstep); - - read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, &(buffer[header_size]), NULL, - "RTI failed to read timed message from federate %d.", federate_id); - size_t bytes_read = bytes_to_read + header_size; - // Following only works for string messages. 
- // LF_PRINT_DEBUG("Message received by RTI: %s.", buffer + header_size); + void handle_port_absent_message(federate_info_t * sending_federate, unsigned char* buffer) { + size_t message_size = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int64_t) + sizeof(uint32_t); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_TAGGED_MSG, sending_federate->enclave.id, &intended_tag); - } + read_from_socket_fail_on_error(&sending_federate->socket, message_size, &(buffer[1]), NULL, + " RTI failed to read port absent message from federate %u.", + sending_federate->enclave.id); - // Need to acquire the mutex lock to ensure that the thread handling - // messages coming from the socket connected to the destination does not - // issue a TAG before this message has been forwarded. - LF_MUTEX_LOCK(&rti_mutex); + uint16_t reactor_port_id = extract_uint16(&(buffer[1])); + uint16_t federate_id = extract_uint16(&(buffer[1 + sizeof(uint16_t)])); + tag_t tag = extract_tag(&(buffer[1 + 2 * sizeof(uint16_t)])); - // If the destination federate is no longer connected, issue a warning, - // remove the message from the socket and return. - federate_info_t* fed = GET_FED_INFO(federate_id); - if (fed->enclave.state == NOT_CONNECTED) { - lf_print_warning("RTI: Destination federate %d is no longer connected. Dropping message.", federate_id); - LF_PRINT_LOG("Fed status: next_event " PRINTF_TAG ", " - "completed " PRINTF_TAG ", " - "last_granted " PRINTF_TAG ", " - "last_provisionally_granted " PRINTF_TAG ".", - fed->enclave.next_event.time - start_time, fed->enclave.next_event.microstep, - fed->enclave.completed.time - start_time, fed->enclave.completed.microstep, - fed->enclave.last_granted.time - start_time, fed->enclave.last_granted.microstep, - fed->enclave.last_provisionally_granted.time - start_time, - fed->enclave.last_provisionally_granted.microstep); - // If the message was larger than the buffer, we must empty out the remainder also. 
- size_t total_bytes_read = bytes_read; - while (total_bytes_read < total_bytes_to_read) { - bytes_to_read = total_bytes_to_read - total_bytes_read; - if (bytes_to_read > FED_COM_BUFFER_SIZE) { - bytes_to_read = FED_COM_BUFFER_SIZE; - } - read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, - "RTI failed to clear message chunks."); - total_bytes_read += bytes_to_read; + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_PORT_ABS, sending_federate->enclave.id, &tag); } - LF_MUTEX_UNLOCK(&rti_mutex); - return; - } else { - if (lf_tag_compare(intended_tag, fed->effective_start_tag) < 0) { - // Do not forward the message if the federate is connected, but its - // start_time is not reached yet - lf_mutex_unlock(&rti_mutex); + + // Need to acquire the mutex lock to ensure that the thread handling + // messages coming from the socket connected to the destination does not + // issue a TAG before this message has been forwarded. + LF_MUTEX_LOCK(&rti_mutex); + + // If the destination federate is no longer connected, issue a warning + // and return. + federate_info_t* fed = GET_FED_INFO(federate_id); + if (fed->enclave.state == NOT_CONNECTED) { + LF_MUTEX_UNLOCK(&rti_mutex); + lf_print_warning("RTI: Destination federate %d is no longer connected. 
Dropping message.", federate_id); + LF_PRINT_LOG("Fed status: next_event " PRINTF_TAG ", " + "completed " PRINTF_TAG ", " + "last_granted " PRINTF_TAG ", " + "last_provisionally_granted " PRINTF_TAG ".", + fed->enclave.next_event.time - start_time, fed->enclave.next_event.microstep, + fed->enclave.completed.time - start_time, fed->enclave.completed.microstep, + fed->enclave.last_granted.time - start_time, fed->enclave.last_granted.microstep, + fed->enclave.last_provisionally_granted.time - start_time, + fed->enclave.last_provisionally_granted.microstep); return; } - } - LF_PRINT_DEBUG("RTI forwarding message to port %d of federate %hu of length %zu.", reactor_port_id, federate_id, - length); + LF_PRINT_LOG("RTI forwarding port absent message for port %u to federate %u.", reactor_port_id, federate_id); - // Need to make sure that the destination federate's thread has already - // sent the starting MSG_TYPE_TIMESTAMP message. - while (fed->enclave.state == PENDING) { - // Need to wait here. - lf_cond_wait(&sent_start_time); - } - - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_TAGGED_MSG, federate_id, &intended_tag); - } - - write_to_socket_fail_on_error(&fed->socket, bytes_read, buffer, &rti_mutex, - "RTI failed to forward message to federate %d.", federate_id); - - // The message length may be longer than the buffer, - // in which case we have to handle it in chunks. 
- size_t total_bytes_read = bytes_read; - while (total_bytes_read < total_bytes_to_read) { - LF_PRINT_DEBUG("Forwarding message in chunks."); - bytes_to_read = total_bytes_to_read - total_bytes_read; - if (bytes_to_read > FED_COM_BUFFER_SIZE) { - bytes_to_read = FED_COM_BUFFER_SIZE; - } - read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, - "RTI failed to read message chunks."); - total_bytes_read += bytes_to_read; - - // FIXME: a mutex needs to be held for this so that other threads - // do not write to destination_socket and cause interleaving. However, - // holding the rti_mutex might be very expensive. Instead, each outgoing - // socket should probably have its own mutex. - write_to_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, &rti_mutex, - "RTI failed to send message chunks."); - } - - // Record this in-transit message in federate's in-transit message queue. - if (lf_tag_compare(fed->enclave.completed, intended_tag) < 0) { - // Add a record of this message to the list of in-transit messages to this federate. - pqueue_tag_insert_if_no_match(fed->in_transit_message_tags, intended_tag); - LF_PRINT_DEBUG("RTI: Adding a message with tag " PRINTF_TAG - " to the list of in-transit messages for federate %d.", - intended_tag.time - lf_time_start(), intended_tag.microstep, federate_id); - } else { - lf_print_error("RTI: Federate %d has already completed tag " PRINTF_TAG - ", but there is an in-transit message with tag " PRINTF_TAG " from federate %hu. " - "This is going to cause an STP violation under centralized coordination.", - federate_id, fed->enclave.completed.time - lf_time_start(), fed->enclave.completed.microstep, - intended_tag.time - lf_time_start(), intended_tag.microstep, sending_federate->enclave.id); - // FIXME: Drop the federate? - } + // Need to make sure that the destination federate's thread has already + // sent the starting MSG_TYPE_TIMESTAMP message. 
+ while (fed->enclave.state == PENDING) { + // Need to wait here. + lf_cond_wait(&sent_start_time); + } - // If the message tag is less than the most recently received NET from the federate, - // then update the federate's next event tag to match the message tag. - if (lf_tag_compare(intended_tag, fed->enclave.next_event) < 0) { - update_federate_next_event_tag_locked(federate_id, intended_tag); - } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_PORT_ABS, federate_id, &tag); + } - LF_MUTEX_UNLOCK(&rti_mutex); - } + // Forward the message. + write_to_socket_fail_on_error(&fed->socket, message_size + 1, buffer, &rti_mutex, + "RTI failed to forward message to federate %d.", federate_id); - void handle_latest_tag_confirmed(federate_info_t * fed) { - unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; - read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, - "RTI failed to read the content of the logical tag complete from federate %d.", - fed->enclave.id); - tag_t completed = extract_tag(buffer); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_LTC, fed->enclave.id, &completed); + LF_MUTEX_UNLOCK(&rti_mutex); } - _logical_tag_complete(&(fed->enclave), completed); - - // FIXME: Should this function be in the enclave version? - LF_MUTEX_LOCK(&rti_mutex); - // See if we can remove any of the recorded in-transit messages for this. 
- pqueue_tag_remove_up_to(fed->in_transit_message_tags, completed); - LF_MUTEX_UNLOCK(&rti_mutex); - } - - void handle_next_event_tag(federate_info_t * fed) { - unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; - read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, - "RTI failed to read the content of the next event tag from federate %d.", - fed->enclave.id); - // Acquire a mutex lock to ensure that this state does not change while a - // message is in transport or being used to determine a TAG. - LF_MUTEX_LOCK(&rti_mutex); // FIXME: Instead of using a mutex, it might be more efficient to use a - // select() mechanism to read and process federates' buffers in an orderly fashion. + void handle_timed_message(federate_info_t * sending_federate, unsigned char* buffer) { + size_t header_size = + 1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint32_t) + sizeof(int64_t) + sizeof(uint32_t); + // Read the header, minus the first byte which has already been read. + read_from_socket_fail_on_error(&sending_federate->socket, header_size - 1, &(buffer[1]), NULL, + "RTI failed to read the timed message header from remote federate."); + // Extract the header information. of the sender + uint16_t reactor_port_id; + uint16_t federate_id; + size_t length; + tag_t intended_tag; + // Extract information from the header. 
+ extract_timed_header(&(buffer[1]), &reactor_port_id, &federate_id, &length, &intended_tag); + + size_t total_bytes_to_read = length + header_size; + size_t bytes_to_read = length; + + if (FED_COM_BUFFER_SIZE < header_size + 1) { + lf_print_error_and_exit("Buffer size (%d) is not large enough to " + "read the header plus one byte.", + FED_COM_BUFFER_SIZE); + } - tag_t intended_tag = extract_tag(buffer); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_NET, fed->enclave.id, &intended_tag); - } - LF_PRINT_LOG("RTI received from federate %d the Next Event Tag (NET) " PRINTF_TAG, fed->enclave.id, - intended_tag.time - start_time, intended_tag.microstep); - update_federate_next_event_tag_locked(fed->enclave.id, intended_tag); - LF_MUTEX_UNLOCK(&rti_mutex); - } + // Cut up the payload in chunks. + if (bytes_to_read > FED_COM_BUFFER_SIZE - header_size) { + bytes_to_read = FED_COM_BUFFER_SIZE - header_size; + } - /////////////////// STOP functions //////////////////// + LF_PRINT_LOG("RTI received message from federate %d for federate %u port %u with intended tag " PRINTF_TAG + ". Forwarding.", + sending_federate->enclave.id, federate_id, reactor_port_id, intended_tag.time - lf_time_start(), + intended_tag.microstep); - /** - * Boolean used to prevent the RTI from sending the - * MSG_TYPE_STOP_GRANTED message multiple times. - */ - bool stop_granted_already_sent_to_federates = false; + read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, &(buffer[header_size]), NULL, + "RTI failed to read timed message from federate %d.", federate_id); + size_t bytes_read = bytes_to_read + header_size; + // Following only works for string messages. + // LF_PRINT_DEBUG("Message received by RTI: %s.", buffer + header_size); - /** - * Once the RTI has seen proposed tags from all connected federates, - * it will broadcast a MSG_TYPE_STOP_GRANTED carrying the _RTI.max_stop_tag. 
- * This function also checks the most recently received NET from - * each federate and resets that be no greater than the _RTI.max_stop_tag. - * - * This function assumes the caller holds the rti_mutex lock. - */ - static void broadcast_stop_time_to_federates_locked() { - if (stop_granted_already_sent_to_federates == true) { - return; - } - stop_granted_already_sent_to_federates = true; + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_TAGGED_MSG, sending_federate->enclave.id, &intended_tag); + } - // Reply with a stop granted to all federates - unsigned char outgoing_buffer[MSG_TYPE_STOP_GRANTED_LENGTH]; - ENCODE_STOP_GRANTED(outgoing_buffer, rti_remote->base.max_stop_tag.time, rti_remote->base.max_stop_tag.microstep); + // Need to acquire the mutex lock to ensure that the thread handling + // messages coming from the socket connected to the destination does not + // issue a TAG before this message has been forwarded. + LF_MUTEX_LOCK(&rti_mutex); - // Iterate over federates and send each the message. - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); + // If the destination federate is no longer connected, issue a warning, + // remove the message from the socket and return. + federate_info_t* fed = GET_FED_INFO(federate_id); if (fed->enclave.state == NOT_CONNECTED) { - continue; + lf_print_warning("RTI: Destination federate %d is no longer connected. 
Dropping message.", federate_id); + LF_PRINT_LOG("Fed status: next_event " PRINTF_TAG ", " + "completed " PRINTF_TAG ", " + "last_granted " PRINTF_TAG ", " + "last_provisionally_granted " PRINTF_TAG ".", + fed->enclave.next_event.time - start_time, fed->enclave.next_event.microstep, + fed->enclave.completed.time - start_time, fed->enclave.completed.microstep, + fed->enclave.last_granted.time - start_time, fed->enclave.last_granted.microstep, + fed->enclave.last_provisionally_granted.time - start_time, + fed->enclave.last_provisionally_granted.microstep); + // If the message was larger than the buffer, we must empty out the remainder also. + size_t total_bytes_read = bytes_read; + while (total_bytes_read < total_bytes_to_read) { + bytes_to_read = total_bytes_to_read - total_bytes_read; + if (bytes_to_read > FED_COM_BUFFER_SIZE) { + bytes_to_read = FED_COM_BUFFER_SIZE; + } + read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, + "RTI failed to clear message chunks."); + total_bytes_read += bytes_to_read; + } + LF_MUTEX_UNLOCK(&rti_mutex); + return; + } else { + if (lf_tag_compare(intended_tag, fed->effective_start_tag) < 0) { + // Do not forward the message if the federate is connected, but its + // start_time is not reached yet + lf_mutex_unlock(&rti_mutex); + return; + } } - if (lf_tag_compare(fed->enclave.next_event, rti_remote->base.max_stop_tag) >= 0) { - // Need the next_event to be no greater than the stop tag. - fed->enclave.next_event = rti_remote->base.max_stop_tag; + + LF_PRINT_DEBUG("RTI forwarding message to port %d of federate %hu of length %zu.", reactor_port_id, federate_id, + length); + + // Need to make sure that the destination federate's thread has already + // sent the starting MSG_TYPE_TIMESTAMP message. + while (fed->enclave.state == PENDING) { + // Need to wait here. 
+ lf_cond_wait(&sent_start_time); } + if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_STOP_GRN, fed->enclave.id, &rti_remote->base.max_stop_tag); + tracepoint_rti_to_federate(send_TAGGED_MSG, federate_id, &intended_tag); } - write_to_socket_fail_on_error(&fed->socket, MSG_TYPE_STOP_GRANTED_LENGTH, outgoing_buffer, &rti_mutex, - "RTI failed to send MSG_TYPE_STOP_GRANTED message to federate %d.", - fed->enclave.id); - } - - LF_PRINT_LOG("RTI sent to federates MSG_TYPE_STOP_GRANTED with tag " PRINTF_TAG, - rti_remote->base.max_stop_tag.time - start_time, rti_remote->base.max_stop_tag.microstep); - } - /** - * Mark a federate requesting stop. If the number of federates handling stop reaches - * the number of persistent federates, broadcast MSG_TYPE_STOP_GRANTED to every federate. - * This function assumes the _RTI.mutex is already locked. - * @param fed The federate that has requested a stop. - * @return 1 if stop time has been sent to all federates and 0 otherwise. - */ - static int mark_federate_requesting_stop(federate_info_t * fed) { - if (!fed->requested_stop) { - // Increment the number of federates handling stop only if it is persistent - if (!fed->is_transient) - rti_remote->base.num_scheduling_nodes_handling_stop++; - fed->requested_stop = true; - } - if (rti_remote->base.num_scheduling_nodes_handling_stop == - (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { - // We now have information about the stop time of all - // federates. - broadcast_stop_time_to_federates_locked(); - return 1; - } - return 0; - } + write_to_socket_fail_on_error(&fed->socket, bytes_read, buffer, &rti_mutex, + "RTI failed to forward message to federate %d.", federate_id); - /** - * Thread to time out if federates do not reply to stop request. - */ - static void* wait_for_stop_request_reply(void* args) { - initialize_lf_thread_id(); - // Divide the time into small chunks and check periodically. 
- interval_t chunk = MAX_TIME_FOR_REPLY_TO_STOP_REQUEST / 30; - int count = 0; - while (count++ < 30) { - if (stop_granted_already_sent_to_federates) - return NULL; - lf_sleep(chunk); - } - // If we reach here, then error out. - lf_print_error_and_exit("Received only %d stop request replies within timeout " PRINTF_TIME "ns. RTI is exiting.", - rti_remote->base.num_scheduling_nodes_handling_stop, MAX_TIME_FOR_REPLY_TO_STOP_REQUEST); - return NULL; - } + // The message length may be longer than the buffer, + // in which case we have to handle it in chunks. + size_t total_bytes_read = bytes_read; + while (total_bytes_read < total_bytes_to_read) { + LF_PRINT_DEBUG("Forwarding message in chunks."); + bytes_to_read = total_bytes_to_read - total_bytes_read; + if (bytes_to_read > FED_COM_BUFFER_SIZE) { + bytes_to_read = FED_COM_BUFFER_SIZE; + } + read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, + "RTI failed to read message chunks."); + total_bytes_read += bytes_to_read; - void handle_stop_request_message(federate_info_t * fed) { - LF_PRINT_DEBUG("RTI handling stop_request from federate %d.", fed->enclave.id); + // FIXME: a mutex needs to be held for this so that other threads + // do not write to destination_socket and cause interleaving. However, + // holding the rti_mutex might be very expensive. Instead, each outgoing + // socket should probably have its own mutex. + write_to_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, &rti_mutex, + "RTI failed to send message chunks."); + } - size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_LENGTH - 1; - unsigned char buffer[bytes_to_read]; - read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, NULL, - "RTI failed to read the MSG_TYPE_STOP_REQUEST payload from federate %d.", - fed->enclave.id); + // Record this in-transit message in federate's in-transit message queue. 
+ if (lf_tag_compare(fed->enclave.completed, intended_tag) < 0) { + // Add a record of this message to the list of in-transit messages to this federate. + pqueue_tag_insert_if_no_match(fed->in_transit_message_tags, intended_tag); + LF_PRINT_DEBUG("RTI: Adding a message with tag " PRINTF_TAG + " to the list of in-transit messages for federate %d.", + intended_tag.time - lf_time_start(), intended_tag.microstep, federate_id); + } else { + lf_print_error("RTI: Federate %d has already completed tag " PRINTF_TAG + ", but there is an in-transit message with tag " PRINTF_TAG " from federate %hu. " + "This is going to cause an STP violation under centralized coordination.", + federate_id, fed->enclave.completed.time - lf_time_start(), fed->enclave.completed.microstep, + intended_tag.time - lf_time_start(), intended_tag.microstep, sending_federate->enclave.id); + // FIXME: Drop the federate? + } - // Extract the proposed stop tag for the federate - tag_t proposed_stop_tag = extract_tag(buffer); + // If the message tag is less than the most recently received NET from the federate, + // then update the federate's next event tag to match the message tag. + if (lf_tag_compare(intended_tag, fed->enclave.next_event) < 0) { + update_federate_next_event_tag_locked(federate_id, intended_tag); + } - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_STOP_REQ, fed->enclave.id, &proposed_stop_tag); + LF_MUTEX_UNLOCK(&rti_mutex); } - LF_PRINT_LOG("RTI received from federate %d a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", - fed->enclave.id, proposed_stop_tag.time - start_time, proposed_stop_tag.microstep); - - // Acquire a mutex lock to ensure that this state does change while a - // message is in transport or being used to determine a TAG. 
- LF_MUTEX_LOCK(&rti_mutex); - - // Check whether we have already received a stop_tag - // from this federate - if (fed->requested_stop) { - // If stop request messages have already been broadcast, treat this as if it were a reply. - if (rti_remote->stop_in_progress) { - mark_federate_requesting_stop(fed); + void handle_latest_tag_confirmed(federate_info_t * fed) { + unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; + read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, + "RTI failed to read the content of the logical tag complete from federate %d.", + fed->enclave.id); + tag_t completed = extract_tag(buffer); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_LTC, fed->enclave.id, &completed); } + _logical_tag_complete(&(fed->enclave), completed); + + // FIXME: Should this function be in the enclave version? + LF_MUTEX_LOCK(&rti_mutex); + // See if we can remove any of the recorded in-transit messages for this. + pqueue_tag_remove_up_to(fed->in_transit_message_tags, completed); LF_MUTEX_UNLOCK(&rti_mutex); - return; } - // Update the maximum stop tag received from federates - if (lf_tag_compare(proposed_stop_tag, rti_remote->base.max_stop_tag) > 0) { - rti_remote->base.max_stop_tag = proposed_stop_tag; - } + void handle_next_event_tag(federate_info_t * fed) { + unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; + read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, + "RTI failed to read the content of the next event tag from federate %d.", + fed->enclave.id); - // If all federates have replied, send stop request granted. - if (mark_federate_requesting_stop(fed)) { - // Have send stop request granted to all federates. Nothing more to do. + // Acquire a mutex lock to ensure that this state does not change while a + // message is in transport or being used to determine a TAG. 
+ LF_MUTEX_LOCK(&rti_mutex); // FIXME: Instead of using a mutex, it might be more efficient to use a + // select() mechanism to read and process federates' buffers in an orderly fashion. + + tag_t intended_tag = extract_tag(buffer); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_NET, fed->enclave.id, &intended_tag); + } + LF_PRINT_LOG("RTI received from federate %d the Next Event Tag (NET) " PRINTF_TAG, fed->enclave.id, + intended_tag.time - start_time, intended_tag.microstep); + update_federate_next_event_tag_locked(fed->enclave.id, intended_tag); LF_MUTEX_UNLOCK(&rti_mutex); - return; } - // Forward the stop request to all other federates that have not - // also issued a stop request. - unsigned char stop_request_buffer[MSG_TYPE_STOP_REQUEST_LENGTH]; - ENCODE_STOP_REQUEST(stop_request_buffer, rti_remote->base.max_stop_tag.time, - rti_remote->base.max_stop_tag.microstep); + /////////////////// STOP functions //////////////////// + + /** + * Boolean used to prevent the RTI from sending the + * MSG_TYPE_STOP_GRANTED message multiple times. + */ + bool stop_granted_already_sent_to_federates = false; + + /** + * Once the RTI has seen proposed tags from all connected federates, + * it will broadcast a MSG_TYPE_STOP_GRANTED carrying the _RTI.max_stop_tag. + * This function also checks the most recently received NET from + * each federate and resets that be no greater than the _RTI.max_stop_tag. + * + * This function assumes the caller holds the rti_mutex lock. + */ + static void broadcast_stop_time_to_federates_locked() { + if (stop_granted_already_sent_to_federates == true) { + return; + } + stop_granted_already_sent_to_federates = true; - // Iterate over federates and send each the MSG_TYPE_STOP_REQUEST message - // if we do not have a stop_time already for them. Do not do this more than once. 
- if (rti_remote->stop_in_progress) { - LF_MUTEX_UNLOCK(&rti_mutex); - return; - } - rti_remote->stop_in_progress = true; - // Need a timeout here in case a federate never replies. - lf_thread_t timeout_thread; - lf_thread_create(&timeout_thread, wait_for_stop_request_reply, NULL); + // Reply with a stop granted to all federates + unsigned char outgoing_buffer[MSG_TYPE_STOP_GRANTED_LENGTH]; + ENCODE_STOP_GRANTED(outgoing_buffer, rti_remote->base.max_stop_tag.time, rti_remote->base.max_stop_tag.microstep); - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* f = GET_FED_INFO(i); - if (f->enclave.id != fed->enclave.id && f->requested_stop == false) { - if (f->enclave.state == NOT_CONNECTED) { - mark_federate_requesting_stop(f); + // Iterate over federates and send each the message. + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->enclave.state == NOT_CONNECTED) { continue; } + if (lf_tag_compare(fed->enclave.next_event, rti_remote->base.max_stop_tag) >= 0) { + // Need the next_event to be no greater than the stop tag. 
+ fed->enclave.next_event = rti_remote->base.max_stop_tag; + } if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_STOP_REQ, f->enclave.id, &rti_remote->base.max_stop_tag); + tracepoint_rti_to_federate(send_STOP_GRN, fed->enclave.id, &rti_remote->base.max_stop_tag); } - write_to_socket_fail_on_error(&f->socket, MSG_TYPE_STOP_REQUEST_LENGTH, stop_request_buffer, &rti_mutex, - "RTI failed to forward MSG_TYPE_STOP_REQUEST message to federate %d.", - f->enclave.id); + write_to_socket_fail_on_error(&fed->socket, MSG_TYPE_STOP_GRANTED_LENGTH, outgoing_buffer, &rti_mutex, + "RTI failed to send MSG_TYPE_STOP_GRANTED message to federate %d.", + fed->enclave.id); + } + + LF_PRINT_LOG("RTI sent to federates MSG_TYPE_STOP_GRANTED with tag " PRINTF_TAG, + rti_remote->base.max_stop_tag.time - start_time, rti_remote->base.max_stop_tag.microstep); + } + + /** + * Mark a federate requesting stop. If the number of federates handling stop reaches + * the number of persistent federates, broadcast MSG_TYPE_STOP_GRANTED to every federate. + * This function assumes the _RTI.mutex is already locked. + * @param fed The federate that has requested a stop. + * @return 1 if stop time has been sent to all federates and 0 otherwise. + */ + static int mark_federate_requesting_stop(federate_info_t * fed) { + if (!fed->requested_stop) { + // Increment the number of federates handling stop only if it is persistent + if (!fed->is_transient) + rti_remote->base.num_scheduling_nodes_handling_stop++; + fed->requested_stop = true; + } + if (rti_remote->base.num_scheduling_nodes_handling_stop == + (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { + // We now have information about the stop time of all + // federates. 
+ broadcast_stop_time_to_federates_locked(); + return 1; } + return 0; } - LF_PRINT_LOG("RTI forwarded to federates MSG_TYPE_STOP_REQUEST with tag (" PRINTF_TIME ", %u).", - rti_remote->base.max_stop_tag.time - start_time, rti_remote->base.max_stop_tag.microstep); - LF_MUTEX_UNLOCK(&rti_mutex); - } - void handle_stop_request_reply(federate_info_t * fed) { - size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_REPLY_LENGTH - 1; - unsigned char buffer_stop_time[bytes_to_read]; - read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer_stop_time, NULL, - "RTI failed to read the reply to MSG_TYPE_STOP_REQUEST message from federate %d.", - fed->enclave.id); + /** + * Thread to time out if federates do not reply to stop request. + */ + static void* wait_for_stop_request_reply(void* args) { + initialize_lf_thread_id(); + // Divide the time into small chunks and check periodically. + interval_t chunk = MAX_TIME_FOR_REPLY_TO_STOP_REQUEST / 30; + int count = 0; + while (count++ < 30) { + if (stop_granted_already_sent_to_federates) + return NULL; + lf_sleep(chunk); + } + // If we reach here, then error out. + lf_print_error_and_exit("Received only %d stop request replies within timeout " PRINTF_TIME "ns. 
RTI is exiting.", + rti_remote->base.num_scheduling_nodes_handling_stop, MAX_TIME_FOR_REPLY_TO_STOP_REQUEST); + return NULL; + } - tag_t federate_stop_tag = extract_tag(buffer_stop_time); + void handle_stop_request_message(federate_info_t * fed) { + LF_PRINT_DEBUG("RTI handling stop_request from federate %d.", fed->enclave.id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_STOP_REQ_REP, fed->enclave.id, &federate_stop_tag); - } + size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_LENGTH - 1; + unsigned char buffer[bytes_to_read]; + read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, NULL, + "RTI failed to read the MSG_TYPE_STOP_REQUEST payload from federate %d.", + fed->enclave.id); - LF_PRINT_LOG("RTI received from federate %d STOP reply tag " PRINTF_TAG ".", fed->enclave.id, - federate_stop_tag.time - start_time, federate_stop_tag.microstep); + // Extract the proposed stop tag for the federate + tag_t proposed_stop_tag = extract_tag(buffer); - // Acquire the mutex lock so that we can change the state of the RTI - LF_MUTEX_LOCK(&rti_mutex); - // If the federate has not requested stop before, count the reply - if (lf_tag_compare(federate_stop_tag, rti_remote->base.max_stop_tag) > 0) { - rti_remote->base.max_stop_tag = federate_stop_tag; - } - mark_federate_requesting_stop(fed); - LF_MUTEX_UNLOCK(&rti_mutex); - } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_STOP_REQ, fed->enclave.id, &proposed_stop_tag); + } - ////////////////////////////////////////////////// + LF_PRINT_LOG("RTI received from federate %d a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", + fed->enclave.id, proposed_stop_tag.time - start_time, proposed_stop_tag.microstep); - void handle_address_query(uint16_t fed_id) { - federate_info_t* fed = GET_FED_INFO(fed_id); - // Use buffer both for reading and constructing the reply. - // The length is what is needed for the reply. 
- unsigned char buffer[1 + sizeof(int32_t)]; - read_from_socket_fail_on_error(&fed->socket, sizeof(uint16_t), (unsigned char*)buffer, NULL, - "Failed to read address query."); - uint16_t remote_fed_id = extract_uint16(buffer); + // Acquire a mutex lock to ensure that this state does change while a + // message is in transport or being used to determine a TAG. + LF_MUTEX_LOCK(&rti_mutex); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_ADR_QR, fed_id, NULL); + // Check whether we have already received a stop_tag + // from this federate + if (fed->requested_stop) { + // If stop request messages have already been broadcast, treat this as if it were a reply. + if (rti_remote->stop_in_progress) { + mark_federate_requesting_stop(fed); + } + LF_MUTEX_UNLOCK(&rti_mutex); + return; + } + + // Update the maximum stop tag received from federates + if (lf_tag_compare(proposed_stop_tag, rti_remote->base.max_stop_tag) > 0) { + rti_remote->base.max_stop_tag = proposed_stop_tag; + } + + // If all federates have replied, send stop request granted. + if (mark_federate_requesting_stop(fed)) { + // Have send stop request granted to all federates. Nothing more to do. + LF_MUTEX_UNLOCK(&rti_mutex); + return; + } + + // Forward the stop request to all other federates that have not + // also issued a stop request. + unsigned char stop_request_buffer[MSG_TYPE_STOP_REQUEST_LENGTH]; + ENCODE_STOP_REQUEST(stop_request_buffer, rti_remote->base.max_stop_tag.time, + rti_remote->base.max_stop_tag.microstep); + + // Iterate over federates and send each the MSG_TYPE_STOP_REQUEST message + // if we do not have a stop_time already for them. Do not do this more than once. + if (rti_remote->stop_in_progress) { + LF_MUTEX_UNLOCK(&rti_mutex); + return; + } + rti_remote->stop_in_progress = true; + // Need a timeout here in case a federate never replies. 
+ lf_thread_t timeout_thread; + lf_thread_create(&timeout_thread, wait_for_stop_request_reply, NULL); + + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* f = GET_FED_INFO(i); + if (f->enclave.id != fed->enclave.id && f->requested_stop == false) { + if (f->enclave.state == NOT_CONNECTED) { + mark_federate_requesting_stop(f); + continue; + } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_STOP_REQ, f->enclave.id, &rti_remote->base.max_stop_tag); + } + write_to_socket_fail_on_error(&f->socket, MSG_TYPE_STOP_REQUEST_LENGTH, stop_request_buffer, &rti_mutex, + "RTI failed to forward MSG_TYPE_STOP_REQUEST message to federate %d.", + f->enclave.id); + } + } + LF_PRINT_LOG("RTI forwarded to federates MSG_TYPE_STOP_REQUEST with tag (" PRINTF_TIME ", %u).", + rti_remote->base.max_stop_tag.time - start_time, rti_remote->base.max_stop_tag.microstep); + LF_MUTEX_UNLOCK(&rti_mutex); } - LF_PRINT_DEBUG("RTI received address query from %d for %d.", fed_id, remote_fed_id); + void handle_stop_request_reply(federate_info_t * fed) { + size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_REPLY_LENGTH - 1; + unsigned char buffer_stop_time[bytes_to_read]; + read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer_stop_time, NULL, + "RTI failed to read the reply to MSG_TYPE_STOP_REQUEST message from federate %d.", + fed->enclave.id); - // NOTE: server_port initializes to -1, which means the RTI does not know - // the port number because it has not yet received an MSG_TYPE_ADDRESS_ADVERTISEMENT message - // from this federate. In that case, it will respond by sending -1. + tag_t federate_stop_tag = extract_tag(buffer_stop_time); - // Response message is MSG_TYPE_ADDRESS_QUERY_REPLY. - buffer[0] = MSG_TYPE_ADDRESS_QUERY_REPLY; + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_STOP_REQ_REP, fed->enclave.id, &federate_stop_tag); + } - // Encode the port number. 
- federate_info_t* remote_fed = GET_FED_INFO(remote_fed_id); + LF_PRINT_LOG("RTI received from federate %d STOP reply tag " PRINTF_TAG ".", fed->enclave.id, + federate_stop_tag.time - start_time, federate_stop_tag.microstep); - // Send the port number (which could be -1). - LF_MUTEX_LOCK(&rti_mutex); - encode_int32(remote_fed->server_port, (unsigned char*)&buffer[1]); - write_to_socket_fail_on_error(&fed->socket, sizeof(int32_t) + 1, (unsigned char*)buffer, &rti_mutex, - "Failed to write port number to socket of federate %d.", fed_id); - - // Send the server IP address to federate. - write_to_socket_fail_on_error(&fed->socket, sizeof(remote_fed->server_ip_addr), - (unsigned char*)&remote_fed->server_ip_addr, &rti_mutex, - "Failed to write ip address to socket of federate %d.", fed_id); - LF_MUTEX_UNLOCK(&rti_mutex); + // Acquire the mutex lock so that we can change the state of the RTI + LF_MUTEX_LOCK(&rti_mutex); + // If the federate has not requested stop before, count the reply + if (lf_tag_compare(federate_stop_tag, rti_remote->base.max_stop_tag) > 0) { + rti_remote->base.max_stop_tag = federate_stop_tag; + } + mark_federate_requesting_stop(fed); + LF_MUTEX_UNLOCK(&rti_mutex); + } - LF_PRINT_DEBUG("Replied to address query from federate %d with address %s:%d.", fed_id, remote_fed->server_hostname, - remote_fed->server_port); - } + ////////////////////////////////////////////////// - void handle_address_ad(uint16_t federate_id) { - federate_info_t* fed = GET_FED_INFO(federate_id); - // Read the port number of the federate that can be used for physical - // connections to other federates - int32_t server_port = -1; - unsigned char buffer[sizeof(int32_t)]; - read_from_socket_fail_on_error(&fed->socket, sizeof(int32_t), (unsigned char*)buffer, NULL, - "Error reading port data from federate %d.", federate_id); + void handle_address_query(uint16_t fed_id) { + federate_info_t* fed = GET_FED_INFO(fed_id); + // Use buffer both for reading and constructing the reply. 
+ // The length is what is needed for the reply. + unsigned char buffer[1 + sizeof(int32_t)]; + read_from_socket_fail_on_error(&fed->socket, sizeof(uint16_t), (unsigned char*)buffer, NULL, + "Failed to read address query."); + uint16_t remote_fed_id = extract_uint16(buffer); - server_port = extract_int32(buffer); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_ADR_QR, fed_id, NULL); + } - assert(server_port < 65536); + LF_PRINT_DEBUG("RTI received address query from %d for %d.", fed_id, remote_fed_id); - LF_MUTEX_LOCK(&rti_mutex); - fed->server_port = server_port; - LF_MUTEX_UNLOCK(&rti_mutex); + // NOTE: server_port initializes to -1, which means the RTI does not know + // the port number because it has not yet received an MSG_TYPE_ADDRESS_ADVERTISEMENT message + // from this federate. In that case, it will respond by sending -1. - LF_PRINT_LOG("Received address advertisement with port %d from federate %d.", server_port, federate_id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_ADR_AD, federate_id, NULL); - } - } + // Response message is MSG_TYPE_ADDRESS_QUERY_REPLY. + buffer[0] = MSG_TYPE_ADDRESS_QUERY_REPLY; - /** - * Send to the start time to the federate my_fed. - * This function assumes the caller does not hold the mutex. - * - * If it is the startup phase, the start_time will be the maximum received timestamps - * plus an offset. The federate will then receive identical federation_start_time - * and federate_start_tag.time (the federate_start_tag.microstep will be 0). - * If, however, the startup phase is passed, the federate will receive different - * values than stated above. - * - * This will also notify federates downstream of my_fed that this federate is now - * connected. This is important when there are zero-delay cycles. - * - * This function assumes the caller holds the mutex. - * - * @param my_fed the federate to send the start time to. 
- * @param federation_start_time the federation start_time - * @param federate_start_tag the federate effective start tag - */ - static void send_start_tag(federate_info_t * my_fed, instant_t federation_start_time, tag_t federate_start_tag) { - // Send back to the federate the maximum time plus an offset on a TIMESTAMP_START - // message. - // In the startup phase, federates will receive identical start_time and - // effective_start_tag - unsigned char start_time_buffer[MSG_TYPE_TIMESTAMP_START_LENGTH]; - start_time_buffer[0] = MSG_TYPE_TIMESTAMP_START; - encode_int64(swap_bytes_if_big_endian_int64(start_time), &start_time_buffer[1]); - encode_tag(&(start_time_buffer[1 + sizeof(instant_t)]), federate_start_tag); + // Encode the port number. + federate_info_t* remote_fed = GET_FED_INFO(remote_fed_id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_TIMESTAMP, my_fed->enclave.id, &federate_start_tag); - } - if (write_to_socket(my_fed->socket, MSG_TYPE_TIMESTAMP_START_LENGTH, start_time_buffer)) { - lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); + // Send the port number (which could be -1). + LF_MUTEX_LOCK(&rti_mutex); + encode_int32(remote_fed->server_port, (unsigned char*)&buffer[1]); + write_to_socket_fail_on_error(&fed->socket, sizeof(int32_t) + 1, (unsigned char*)buffer, &rti_mutex, + "Failed to write port number to socket of federate %d.", fed_id); + + // Send the server IP address to federate. + write_to_socket_fail_on_error(&fed->socket, sizeof(remote_fed->server_ip_addr), + (unsigned char*)&remote_fed->server_ip_addr, &rti_mutex, + "Failed to write ip address to socket of federate %d.", fed_id); + LF_MUTEX_UNLOCK(&rti_mutex); + + LF_PRINT_DEBUG("Replied to address query from federate %d with address %s:%d.", fed_id, + remote_fed->server_hostname, remote_fed->server_port); } - // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP - // message has been sent. 
That MSG_TYPE_TIMESTAMP message grants time advance to - // the federate to the start time. - my_fed->enclave.state = GRANTED; - lf_cond_broadcast(&sent_start_time); - LF_PRINT_LOG("RTI sent start time " PRINTF_TIME " to federate %d.", start_time, my_fed->enclave.id); + void handle_address_ad(uint16_t federate_id) { + federate_info_t* fed = GET_FED_INFO(federate_id); + // Read the port number of the federate that can be used for physical + // connections to other federates + int32_t server_port = -1; + unsigned char buffer[sizeof(int32_t)]; + read_from_socket_fail_on_error(&fed->socket, sizeof(int32_t), (unsigned char*)buffer, NULL, + "Error reading port data from federate %d.", federate_id); - // Notify downstream federates of this now connected transient. - for (int i = 0; i < my_fed->enclave.num_upstream; i++) { - send_upstream_connected_locked(GET_FED_INFO(my_fed->enclave.upstream[i]), my_fed); - } + server_port = extract_int32(buffer); - LF_MUTEX_UNLOCK(&rti_mutex); - } + assert(server_port < 65536); - void handle_timestamp(federate_info_t * my_fed) { - unsigned char buffer[sizeof(int64_t)]; - // Read bytes from the socket. We need 8 bytes. 
- read_from_socket_fail_on_error(&my_fed->socket, sizeof(int64_t), (unsigned char*)&buffer, NULL, - "ERROR reading timestamp from federate %d.\n", my_fed->enclave.id); + LF_MUTEX_LOCK(&rti_mutex); + fed->server_port = server_port; + LF_MUTEX_UNLOCK(&rti_mutex); - int64_t timestamp = swap_bytes_if_big_endian_int64(*((int64_t*)(&buffer))); - if (rti_remote->base.tracing_enabled) { - tag_t tag = {.time = timestamp, .microstep = 0}; - tracepoint_rti_from_federate(receive_TIMESTAMP, my_fed->enclave.id, &tag); + LF_PRINT_LOG("Received address advertisement with port %d from federate %d.", server_port, federate_id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_ADR_AD, federate_id, NULL); + } } - LF_PRINT_DEBUG("RTI received timestamp message with time: " PRINTF_TIME ".", timestamp); - LF_MUTEX_LOCK(&rti_mutex); + /** + * Send to the start time to the federate my_fed. + * This function assumes the caller does not hold the mutex. + * + * If it is the startup phase, the start_time will be the maximum received timestamps + * plus an offset. The federate will then receive identical federation_start_time + * and federate_start_tag.time (the federate_start_tag.microstep will be 0). + * If, however, the startup phase is passed, the federate will receive different + * values than stated above. + * + * This will also notify federates downstream of my_fed that this federate is now + * connected. This is important when there are zero-delay cycles. + * + * This function assumes the caller holds the mutex. + * + * @param my_fed the federate to send the start time to. + * @param federation_start_time the federation start_time + * @param federate_start_tag the federate effective start tag + */ + static void send_start_tag(federate_info_t * my_fed, instant_t federation_start_time, tag_t federate_start_tag) { + // Send back to the federate the maximum time plus an offset on a TIMESTAMP_START + // message. 
+ // In the startup phase, federates will receive identical start_time and + // effective_start_tag + unsigned char start_time_buffer[MSG_TYPE_TIMESTAMP_START_LENGTH]; + start_time_buffer[0] = MSG_TYPE_TIMESTAMP_START; + encode_int64(swap_bytes_if_big_endian_int64(start_time), &start_time_buffer[1]); + encode_tag(&(start_time_buffer[1 + sizeof(instant_t)]), federate_start_tag); - // Processing the TIMESTAMP depends on whether it is the startup phase. - if (rti_remote->phase == startup_phase) { - // Not all persistent federates have proposed a start time. - if (timestamp > rti_remote->max_start_time) { - rti_remote->max_start_time = timestamp; - } - // Note that if a transient federate's thread gets here during the startup phase, - // then it will be assigned the same global tag as its effective start tag and its - // timestamp will affect that start tag. - if (!my_fed->is_transient) { - rti_remote->num_feds_proposed_start++; + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_TIMESTAMP, my_fed->enclave.id, &federate_start_tag); } - if (rti_remote->num_feds_proposed_start == - (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { - // This federate is the last persistent federate to proposed a start time. - lf_cond_broadcast(&received_start_times); - rti_remote->phase = execution_phase; - } else { - // Wait until all persistent federates have proposed a start time. - while (rti_remote->num_feds_proposed_start < - (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { - lf_cond_wait(&received_start_times); - } + if (write_to_socket(my_fed->socket, MSG_TYPE_TIMESTAMP_START_LENGTH, start_time_buffer)) { + lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); } - // Add an offset to the maximum tag to get everyone starting together. 
- start_time = rti_remote->max_start_time + DELAY_START; - my_fed->effective_start_tag = (tag_t){.time = start_time, .microstep = 0u}; - LF_MUTEX_UNLOCK(&rti_mutex); + // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP + // message has been sent. That MSG_TYPE_TIMESTAMP message grants time advance to + // the federate to the start time. + my_fed->enclave.state = GRANTED; + lf_cond_broadcast(&sent_start_time); + LF_PRINT_LOG("RTI sent start time " PRINTF_TIME " to federate %d.", start_time, my_fed->enclave.id); + + // Notify downstream federates of this now connected transient. + for (int i = 0; i < my_fed->enclave.num_upstream; i++) { + send_upstream_connected_locked(GET_FED_INFO(my_fed->enclave.upstream[i]), my_fed); + } - // Notify the federate of its start tag. - send_start_tag(my_fed, start_time, my_fed->effective_start_tag); - } else if (rti_remote->phase == shutdown_phase || !my_fed->is_transient) { LF_MUTEX_UNLOCK(&rti_mutex); + } - // Send reject message if the federation is in shutdown phase or if - // it is in the execution phase but the federate is persistent. - send_reject(&my_fed->socket, JOINING_TOO_LATE); - return; - } else { - // The federation is transient and we are in the execution phase. - // At this point, we already hold the mutex. + void handle_timestamp(federate_info_t * my_fed) { + unsigned char buffer[sizeof(int64_t)]; + // Read bytes from the socket. We need 8 bytes. + read_from_socket_fail_on_error(&my_fed->socket, sizeof(int64_t), (unsigned char*)&buffer, NULL, + "ERROR reading timestamp from federate %d.\n", my_fed->enclave.id); - //// Algorithm for computing the effective_start_time of a joining transient - // The effective_start_time will be the max among all the following tags: - // 1. At tag: (joining time, 0 microstep) - // 2. (start_time, 1 microstep) - // 3. The latest completed logical tag + 1 microstep - // 4. The latest granted (P)TAG + 1 microstep, of every downstream federate - // 5. 
The maximun tag of messages from the upstream federates + 1 microstep + int64_t timestamp = swap_bytes_if_big_endian_int64(*((int64_t*)(&buffer))); + if (rti_remote->base.tracing_enabled) { + tag_t tag = {.time = timestamp, .microstep = 0}; + tracepoint_rti_from_federate(receive_TIMESTAMP, my_fed->enclave.id, &tag); + } + LF_PRINT_DEBUG("RTI received timestamp message with time: " PRINTF_TIME ".", timestamp); - // Condition 1. - my_fed->effective_start_tag = (tag_t){.time = timestamp, .microstep = 0u}; + LF_MUTEX_LOCK(&rti_mutex); - // Condition 2. - if (timestamp < start_time) { - my_fed->effective_start_tag = (tag_t){.time = start_time, .microstep = 1u}; - } + // Processing the TIMESTAMP depends on whether it is the startup phase. + if (rti_remote->phase == startup_phase) { + // Not all persistent federates have proposed a start time. + if (timestamp > rti_remote->max_start_time) { + rti_remote->max_start_time = timestamp; + } + // Note that if a transient federate's thread gets here during the startup phase, + // then it will be assigned the same global tag as its effective start tag and its + // timestamp will affect that start tag. + if (!my_fed->is_transient) { + rti_remote->num_feds_proposed_start++; + } + if (rti_remote->num_feds_proposed_start == + (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { + // This federate is the last persistent federate to proposed a start time. + lf_cond_broadcast(&received_start_times); + rti_remote->phase = execution_phase; + } else { + // Wait until all persistent federates have proposed a start time. + while (rti_remote->num_feds_proposed_start < + (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { + lf_cond_wait(&received_start_times); + } + } + // Add an offset to the maximum tag to get everyone starting together. 
+ start_time = rti_remote->max_start_time + DELAY_START; + my_fed->effective_start_tag = (tag_t){.time = start_time, .microstep = 0u}; - // Condition 3. - if (lf_tag_compare(my_fed->enclave.completed, my_fed->effective_start_tag) >= 0) { - my_fed->effective_start_tag = my_fed->enclave.completed; - my_fed->effective_start_tag.microstep++; - } + LF_MUTEX_UNLOCK(&rti_mutex); - // Condition 4. Iterate over the downstream federates - for (int j = 0; j < my_fed->enclave.num_downstream; j++) { - federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); + // Notify the federate of its start tag. + send_start_tag(my_fed, start_time, my_fed->effective_start_tag); + } else if (rti_remote->phase == shutdown_phase || !my_fed->is_transient) { + LF_MUTEX_UNLOCK(&rti_mutex); - // Get the max over the TAG of the downstreams - if (lf_tag_compare(downstream->enclave.last_granted, my_fed->effective_start_tag) >= 0) { - my_fed->effective_start_tag = downstream->enclave.last_granted; - my_fed->effective_start_tag.microstep++; + // Send reject message if the federation is in shutdown phase or if + // it is in the execution phase but the federate is persistent. + send_reject(&my_fed->socket, JOINING_TOO_LATE); + return; + } else { + // The federate is transient and we are in the execution phase. + // At this point, we already hold the mutex. + + //// Algorithm for computing the effective_start_time of a joining transient + // The effective_start_time will be the max among all the following tags: + // 1. At tag: (joining time, 0 microstep) + // 2. (start_time, 0 microstep) + // 3. The latest completed logical tag + 1 microstep + // 4. The latest granted (P)TAG + 1 microstep, of every downstream federate + // 5. The maximun tag of messages from the upstream federates + 1 microstep + + // Condition 1. + my_fed->effective_start_tag = (tag_t){.time = timestamp, .microstep = 0u}; + + // Condition 2. 
+ if (timestamp < start_time) { + my_fed->effective_start_tag = (tag_t){.time = start_time, .microstep = 0u}; } - // Get the max over the PTAG of the downstreams - if (lf_tag_compare(downstream->enclave.last_provisionally_granted, my_fed->effective_start_tag) >= 0) { - my_fed->effective_start_tag = downstream->enclave.last_provisionally_granted; + // Condition 3. + if (lf_tag_compare(my_fed->enclave.completed, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = my_fed->enclave.completed; my_fed->effective_start_tag.microstep++; } - } - // Condition 5. - // This one is a bit subtle. Any messages from upstream federates that the RTI has - // not yet seen will be sent to this joining federate after the effective_start_tag - // because the effective_start_tag is sent while still holding the mutex. + // Condition 4. Iterate over the downstream federates + for (int j = 0; j < my_fed->enclave.num_downstream; j++) { + federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); - // Iterate over the messages from the upstream federates - for (int j = 0; j < my_fed->enclave.num_upstream; j++) { - federate_info_t* upstream = GET_FED_INFO(my_fed->enclave.upstream[j]); - - size_t queue_size = pqueue_tag_size(upstream->in_transit_message_tags); - if (queue_size != 0) { - tag_t max_tag = pqueue_tag_max_tag(upstream->in_transit_message_tags); + // Get the max over the TAG of the downstreams + if (lf_tag_compare(downstream->enclave.last_granted, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = downstream->enclave.last_granted; + my_fed->effective_start_tag.microstep++; + } - if (lf_tag_compare(max_tag, my_fed->effective_start_tag) >= 0) { - my_fed->effective_start_tag = max_tag; + // Get the max over the PTAG of the downstreams + if (lf_tag_compare(downstream->enclave.last_provisionally_granted, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = downstream->enclave.last_provisionally_granted; 
my_fed->effective_start_tag.microstep++; } } - } - // For every downstream that has a pending grant that is higher than the - // effective_start_time of the federate, cancel it. - for (int j = 0; j < my_fed->enclave.num_downstream; j++) { - federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); + // Condition 5. + // This one is a bit subtle. Any messages from upstream federates that the RTI has + // not yet seen will be sent to this joining federate after the effective_start_tag + // because the effective_start_tag is sent while still holding the mutex. - // Ignore this federate if it has resigned. - if (downstream->enclave.state == NOT_CONNECTED) { - continue; + // Iterate over the messages from the upstream federates + for (int j = 0; j < my_fed->enclave.num_upstream; j++) { + federate_info_t* upstream = GET_FED_INFO(my_fed->enclave.upstream[j]); + + size_t queue_size = pqueue_tag_size(upstream->in_transit_message_tags); + if (queue_size != 0) { + tag_t max_tag = pqueue_tag_max_tag(upstream->in_transit_message_tags); + + if (lf_tag_compare(max_tag, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = max_tag; + my_fed->effective_start_tag.microstep++; + } + } } - // Check the pending grants, if any, and keep it only if it is - // sooner than the effective start tag. - pqueue_delayed_grant_element_t* dge = - pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, downstream->enclave.id); - if (dge != NULL && lf_tag_compare(dge->base.tag, my_fed->effective_start_tag) > 0) { - pqueue_delayed_grants_remove(rti_remote->delayed_grants, dge); + // For every downstream that has a pending grant that is higher than the + // effective_start_time of the federate, cancel it. + for (int j = 0; j < my_fed->enclave.num_downstream; j++) { + federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); + + // Ignore this federate if it has resigned. 
+ if (downstream->enclave.state == NOT_CONNECTED) { + continue; + } + + // Check the pending grants, if any, and keep it only if it is + // sooner than the effective start tag. + pqueue_delayed_grant_element_t* dge = + pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, downstream->enclave.id); + if (dge != NULL && lf_tag_compare(dge->base.tag, my_fed->effective_start_tag) > 0) { + pqueue_delayed_grants_remove(rti_remote->delayed_grants, dge); + } } - } - // Once the effective start time set, sent it to the joining transient, - // together with the start time of the federation. + // Once the effective start time set, sent it to the joining transient, + // together with the start time of the federation. - // Have to send the start tag while still holding the mutex to ensure that no message - // from an upstream federate is forwarded before the start tag. - send_start_tag(my_fed, start_time, my_fed->effective_start_tag); + // Have to send the start tag while still holding the mutex to ensure that no message + // from an upstream federate is forwarded before the start tag. + send_start_tag(my_fed, start_time, my_fed->effective_start_tag); - LF_MUTEX_UNLOCK(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); + } } - } - void send_physical_clock(unsigned char message_type, federate_info_t* fed, socket_type_t socket_type) { - if (fed->enclave.state == NOT_CONNECTED) { - lf_print_warning("Clock sync: RTI failed to send physical time to federate %d. Socket not connected.\n", - fed->enclave.id); - return; - } - unsigned char buffer[sizeof(int64_t) + 1]; - buffer[0] = message_type; - int64_t current_physical_time = lf_time_physical(); - encode_int64(current_physical_time, &(buffer[1])); - - // Send the message - if (socket_type == UDP) { - // FIXME: UDP_addr is never initialized. 
- LF_PRINT_DEBUG("Clock sync: RTI sending UDP message type %u.", buffer[0]); - ssize_t bytes_written = sendto(rti_remote->socket_descriptor_UDP, buffer, 1 + sizeof(int64_t), 0, - (struct sockaddr*)&fed->UDP_addr, sizeof(fed->UDP_addr)); - if (bytes_written < (ssize_t)sizeof(int64_t) + 1) { - lf_print_warning("Clock sync: RTI failed to send physical time to federate %d: %s\n", fed->enclave.id, - strerror(errno)); + void send_physical_clock(unsigned char message_type, federate_info_t* fed, socket_type_t socket_type) { + if (fed->enclave.state == NOT_CONNECTED) { + lf_print_warning("Clock sync: RTI failed to send physical time to federate %d. Socket not connected.\n", + fed->enclave.id); return; } - } else if (socket_type == TCP) { - LF_PRINT_DEBUG("Clock sync: RTI sending TCP message type %u.", buffer[0]); - LF_MUTEX_LOCK(&rti_mutex); - write_to_socket_fail_on_error(&fed->socket, 1 + sizeof(int64_t), buffer, &rti_mutex, - "Clock sync: RTI failed to send physical time to federate %d.", fed->enclave.id); - LF_MUTEX_UNLOCK(&rti_mutex); + unsigned char buffer[sizeof(int64_t) + 1]; + buffer[0] = message_type; + int64_t current_physical_time = lf_time_physical(); + encode_int64(current_physical_time, &(buffer[1])); + + // Send the message + if (socket_type == UDP) { + // FIXME: UDP_addr is never initialized. 
+ LF_PRINT_DEBUG("Clock sync: RTI sending UDP message type %u.", buffer[0]); + ssize_t bytes_written = sendto(rti_remote->socket_descriptor_UDP, buffer, 1 + sizeof(int64_t), 0, + (struct sockaddr*)&fed->UDP_addr, sizeof(fed->UDP_addr)); + if (bytes_written < (ssize_t)sizeof(int64_t) + 1) { + lf_print_warning("Clock sync: RTI failed to send physical time to federate %d: %s\n", fed->enclave.id, + strerror(errno)); + return; + } + } else if (socket_type == TCP) { + LF_PRINT_DEBUG("Clock sync: RTI sending TCP message type %u.", buffer[0]); + LF_MUTEX_LOCK(&rti_mutex); + write_to_socket_fail_on_error(&fed->socket, 1 + sizeof(int64_t), buffer, &rti_mutex, + "Clock sync: RTI failed to send physical time to federate %d.", fed->enclave.id); + LF_MUTEX_UNLOCK(&rti_mutex); + } + LF_PRINT_DEBUG("Clock sync: RTI sent PHYSICAL_TIME_SYNC_MESSAGE with timestamp " PRINTF_TIME " to federate %d.", + current_physical_time, fed->enclave.id); } - LF_PRINT_DEBUG("Clock sync: RTI sent PHYSICAL_TIME_SYNC_MESSAGE with timestamp " PRINTF_TIME " to federate %d.", - current_physical_time, fed->enclave.id); - } - void handle_physical_clock_sync_message(federate_info_t * my_fed, socket_type_t socket_type) { - // Lock the mutex to prevent interference between sending the two - // coded probe messages. - LF_MUTEX_LOCK(&rti_mutex); - // Reply with a T4 type message - send_physical_clock(MSG_TYPE_CLOCK_SYNC_T4, my_fed, socket_type); - // Send the corresponding coded probe immediately after, - // but only if this is a UDP channel. - if (socket_type == UDP) { - send_physical_clock(MSG_TYPE_CLOCK_SYNC_CODED_PROBE, my_fed, socket_type); + void handle_physical_clock_sync_message(federate_info_t * my_fed, socket_type_t socket_type) { + // Lock the mutex to prevent interference between sending the two + // coded probe messages. 
+ LF_MUTEX_LOCK(&rti_mutex); + // Reply with a T4 type message + send_physical_clock(MSG_TYPE_CLOCK_SYNC_T4, my_fed, socket_type); + // Send the corresponding coded probe immediately after, + // but only if this is a UDP channel. + if (socket_type == UDP) { + send_physical_clock(MSG_TYPE_CLOCK_SYNC_CODED_PROBE, my_fed, socket_type); + } + LF_MUTEX_UNLOCK(&rti_mutex); } - LF_MUTEX_UNLOCK(&rti_mutex); - } - void* clock_synchronization_thread(void* noargs) { - initialize_lf_thread_id(); - // Wait until all federates have been notified of the start time. - // FIXME: Use lf_ version of this when merged with master. - LF_MUTEX_LOCK(&rti_mutex); - while (rti_remote->num_feds_proposed_start < rti_remote->base.number_of_scheduling_nodes) { - lf_cond_wait(&received_start_times); - } - LF_MUTEX_UNLOCK(&rti_mutex); + void* clock_synchronization_thread(void* noargs) { + initialize_lf_thread_id(); + // Wait until all federates have been notified of the start time. + // FIXME: Use lf_ version of this when merged with master. + LF_MUTEX_LOCK(&rti_mutex); + while (rti_remote->num_feds_proposed_start < rti_remote->base.number_of_scheduling_nodes) { + lf_cond_wait(&received_start_times); + } + LF_MUTEX_UNLOCK(&rti_mutex); - // Wait until the start time before starting clock synchronization. - // The above wait ensures that start_time has been set. - interval_t ns_to_wait = start_time - lf_time_physical(); + // Wait until the start time before starting clock synchronization. + // The above wait ensures that start_time has been set. 
+ interval_t ns_to_wait = start_time - lf_time_physical(); - if (ns_to_wait > 0LL) { - lf_sleep(ns_to_wait); - } + if (ns_to_wait > 0LL) { + lf_sleep(ns_to_wait); + } - // Initiate a clock synchronization every rti->clock_sync_period_ns - bool any_federates_connected = true; - while (any_federates_connected) { - // Sleep - lf_sleep(rti_remote->clock_sync_period_ns); // Can be interrupted - any_federates_connected = false; - for (int fed_id = 0; fed_id < rti_remote->base.number_of_scheduling_nodes; fed_id++) { - federate_info_t* fed = GET_FED_INFO(fed_id); - if (fed->enclave.state == NOT_CONNECTED) { - // FIXME: We need better error handling here, but clock sync failure - // should not stop execution. - lf_print_error("Clock sync failed with federate %d. Not connected.", fed_id); - continue; - } else if (!fed->clock_synchronization_enabled) { - continue; - } - // Send the RTI's current physical time to the federate - // Send on UDP. - LF_PRINT_DEBUG("RTI sending T1 message to initiate clock sync round."); - send_physical_clock(MSG_TYPE_CLOCK_SYNC_T1, fed, UDP); - - // Listen for reply message, which should be T3. - size_t message_size = 1 + sizeof(uint16_t); - unsigned char buffer[message_size]; - // Maximum number of messages that we discard before giving up on this cycle. - // If the T3 message from this federate does not arrive and we keep receiving - // other messages, then give up on this federate and move to the next federate. - int remaining_attempts = 5; - while (remaining_attempts > 0) { - remaining_attempts--; - int read_failed = read_from_socket(rti_remote->socket_descriptor_UDP, message_size, buffer); - // If any errors occur, either discard the message or the clock sync round. - if (!read_failed) { - if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { - uint16_t fed_id_2 = extract_uint16(&(buffer[1])); - // Check that this message came from the correct federate. - if (fed_id_2 != fed->enclave.id) { - // Message is from the wrong federate. Discard the message. 
- lf_print_warning("Clock sync: Received T3 message from federate %d, " - "but expected one from %d. Discarding message.", - fed_id_2, fed->enclave.id); + // Initiate a clock synchronization every rti->clock_sync_period_ns + bool any_federates_connected = true; + while (any_federates_connected) { + // Sleep + lf_sleep(rti_remote->clock_sync_period_ns); // Can be interrupted + any_federates_connected = false; + for (int fed_id = 0; fed_id < rti_remote->base.number_of_scheduling_nodes; fed_id++) { + federate_info_t* fed = GET_FED_INFO(fed_id); + if (fed->enclave.state == NOT_CONNECTED) { + // FIXME: We need better error handling here, but clock sync failure + // should not stop execution. + lf_print_error("Clock sync failed with federate %d. Not connected.", fed_id); + continue; + } else if (!fed->clock_synchronization_enabled) { + continue; + } + // Send the RTI's current physical time to the federate + // Send on UDP. + LF_PRINT_DEBUG("RTI sending T1 message to initiate clock sync round."); + send_physical_clock(MSG_TYPE_CLOCK_SYNC_T1, fed, UDP); + + // Listen for reply message, which should be T3. + size_t message_size = 1 + sizeof(uint16_t); + unsigned char buffer[message_size]; + // Maximum number of messages that we discard before giving up on this cycle. + // If the T3 message from this federate does not arrive and we keep receiving + // other messages, then give up on this federate and move to the next federate. + int remaining_attempts = 5; + while (remaining_attempts > 0) { + remaining_attempts--; + int read_failed = read_from_socket(rti_remote->socket_descriptor_UDP, message_size, buffer); + // If any errors occur, either discard the message or the clock sync round. + if (!read_failed) { + if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { + uint16_t fed_id_2 = extract_uint16(&(buffer[1])); + // Check that this message came from the correct federate. + if (fed_id_2 != fed->enclave.id) { + // Message is from the wrong federate. Discard the message. 
+ lf_print_warning("Clock sync: Received T3 message from federate %d, " + "but expected one from %d. Discarding message.", + fed_id_2, fed->enclave.id); + continue; + } + LF_PRINT_DEBUG("Clock sync: RTI received T3 message from federate %d.", fed_id_2); + handle_physical_clock_sync_message(GET_FED_INFO(fed_id_2), UDP); + break; + } else { + // The message is not a T3 message. Discard the message and + // continue waiting for the T3 message. This is possibly a message + // from a previous cycle that was discarded. + lf_print_warning("Clock sync: Unexpected UDP message %u. Expected %u from federate %d. " + "Discarding message.", + buffer[0], MSG_TYPE_CLOCK_SYNC_T3, fed->enclave.id); continue; } - LF_PRINT_DEBUG("Clock sync: RTI received T3 message from federate %d.", fed_id_2); - handle_physical_clock_sync_message(GET_FED_INFO(fed_id_2), UDP); - break; } else { - // The message is not a T3 message. Discard the message and - // continue waiting for the T3 message. This is possibly a message - // from a previous cycle that was discarded. - lf_print_warning("Clock sync: Unexpected UDP message %u. Expected %u from federate %d. " - "Discarding message.", - buffer[0], MSG_TYPE_CLOCK_SYNC_T3, fed->enclave.id); - continue; + lf_print_warning("Clock sync: Read from UDP socket failed: %s. " + "Skipping clock sync round for federate %d.", + strerror(errno), fed->enclave.id); + remaining_attempts = -1; } - } else { - lf_print_warning("Clock sync: Read from UDP socket failed: %s. " - "Skipping clock sync round for federate %d.", - strerror(errno), fed->enclave.id); - remaining_attempts = -1; } - } - if (remaining_attempts > 0) { - any_federates_connected = true; + if (remaining_attempts > 0) { + any_federates_connected = true; + } } } - } - return NULL; - } - - /** - * Handle MSG_TYPE_FAILED sent by a federate. This message is sent by a federate - * that is exiting in failure. In this case, the RTI will - * also terminate abnormally, returning a non-zero exit code when it exits. 
- * - * This function assumes the caller does not hold the mutex. - * - * @param my_fed The federate sending a MSG_TYPE_FAILED message. - */ - static void handle_federate_failed(federate_info_t * my_fed) { - // Nothing more to do. Close the socket and exit. - LF_MUTEX_LOCK(&rti_mutex); - - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_FAILED, my_fed->enclave.id, NULL); - } - - // Set the flag telling the RTI to exit with an error code when it exits. - _lf_federate_reports_error = true; - lf_print_error("RTI: Federate %d reports an error and has exited.", my_fed->enclave.id); + return NULL; + } + + /** + * Handle MSG_TYPE_FAILED sent by a federate. This message is sent by a federate + * that is exiting in failure. In this case, the RTI will + * also terminate abnormally, returning a non-zero exit code when it exits. + * + * This function assumes the caller does not hold the mutex. + * + * @param my_fed The federate sending a MSG_TYPE_FAILED message. + */ + static void handle_federate_failed(federate_info_t * my_fed) { + // Nothing more to do. Close the socket and exit. + LF_MUTEX_LOCK(&rti_mutex); - notify_federate_disconnected(&my_fed->enclave); - my_fed->enclave.state = NOT_CONNECTED; + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_FAILED, my_fed->enclave.id, NULL); + } - // Indicate that there will no further events from this federate. - my_fed->enclave.next_event = FOREVER_TAG; + // Set the flag telling the RTI to exit with an error code when it exits. + _lf_federate_reports_error = true; + lf_print_error("RTI: Federate %d reports an error and has exited.", my_fed->enclave.id); - // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, - // the close should happen when receiving a 0 length message from the other end. 
- // Here, we just signal the other side that no further writes to the socket are - // forthcoming, which should result in the other end getting a zero-length reception. - shutdown(my_fed->socket, SHUT_RDWR); + notify_federate_disconnected(&my_fed->enclave); + my_fed->enclave.state = NOT_CONNECTED; - // We can now safely close the socket. - close(my_fed->socket); // from unistd.h + // Indicate that there will no further events from this federate. + my_fed->enclave.next_event = FOREVER_TAG; - // Check downstream federates to see whether they should now be granted a TAG. - // To handle cycles, need to create a boolean array to keep - // track of which upstream federates have been visited. - bool* visited = (bool*)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. - notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); - free(visited); + // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, + // the close should happen when receiving a 0 length message from the other end. + // Here, we just signal the other side that no further writes to the socket are + // forthcoming, which should result in the other end getting a zero-length reception. + shutdown(my_fed->socket, SHUT_RDWR); - LF_MUTEX_UNLOCK(&rti_mutex); - } + // We can now safely close the socket. + close(my_fed->socket); // from unistd.h - /** - * Handle MSG_TYPE_RESIGN sent by a federate. This message is sent at the time of termination - * after all shutdown events are processed on the federate. - * - * This function assumes the caller does not hold the mutex. - * - * @note At this point, the RTI might have outgoing messages to the federate. This - * function thus first performs a shutdown on the socket, which sends an EOF. It then - * waits for the remote socket to be closed before closing the socket itself. - * - * @param my_fed The federate sending a MSG_TYPE_RESIGN message. 
- */ - static void handle_federate_resign(federate_info_t * my_fed) { - // Nothing more to do. Close the socket and exit. - LF_MUTEX_LOCK(&rti_mutex); + // Check downstream federates to see whether they should now be granted a TAG. + // To handle cycles, need to create a boolean array to keep + // track of which upstream federates have been visited. + bool* visited = (bool*)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. + notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); + free(visited); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_RESIGN, my_fed->enclave.id, NULL); + LF_MUTEX_UNLOCK(&rti_mutex); } - lf_print("RTI: Federate %d has resigned.", my_fed->enclave.id); - - my_fed->enclave.state = NOT_CONNECTED; - - // Indicate that there will no further events from this federate. - my_fed->enclave.next_event = FOREVER_TAG; + /** + * Handle MSG_TYPE_RESIGN sent by a federate. This message is sent at the time of termination + * after all shutdown events are processed on the federate. + * + * This function assumes the caller does not hold the mutex. + * + * @note At this point, the RTI might have outgoing messages to the federate. This + * function thus first performs a shutdown on the socket, which sends an EOF. It then + * waits for the remote socket to be closed before closing the socket itself. + * + * @param my_fed The federate sending a MSG_TYPE_RESIGN message. + */ + static void handle_federate_resign(federate_info_t * my_fed) { + // Nothing more to do. Close the socket and exit. + LF_MUTEX_LOCK(&rti_mutex); - // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, - // the close should happen when receiving a 0 length message from the other end. - // Here, we just signal the other side that no further writes to the socket are - // forthcoming, which should result in the other end getting a zero-length reception. 
- shutdown(my_fed->socket, SHUT_WR); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_RESIGN, my_fed->enclave.id, NULL); + } - // Wait for the federate to send an EOF or a socket error to occur. - // Discard any incoming bytes. Normally, this read should return 0 because - // the federate is resigning and should itself invoke shutdown. - unsigned char buffer[10]; - while (read(my_fed->socket, buffer, 10) > 0) - ; + lf_print("RTI: Federate %d has resigned.", my_fed->enclave.id); - // We can now safely close the socket. - close(my_fed->socket); // from unistd.h + my_fed->enclave.state = NOT_CONNECTED; - // Check downstream federates to see whether they should now be granted a TAG. - // To handle cycles, need to create a boolean array to keep - // track of which upstream federates have been visited. - bool* visited = (bool*)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. - notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); - free(visited); + // Indicate that there will no further events from this federate. + my_fed->enclave.next_event = FOREVER_TAG; - LF_MUTEX_UNLOCK(&rti_mutex); - } + // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, + // the close should happen when receiving a 0 length message from the other end. + // Here, we just signal the other side that no further writes to the socket are + // forthcoming, which should result in the other end getting a zero-length reception. + shutdown(my_fed->socket, SHUT_WR); - void* federate_info_thread_TCP(void* fed) { - initialize_lf_thread_id(); - federate_info_t* my_fed = (federate_info_t*)fed; - - // Buffer for incoming messages. - // This does not constrain the message size because messages - // are forwarded piece by piece. - unsigned char buffer[FED_COM_BUFFER_SIZE]; - - // Listen for messages from the federate. 
- while (my_fed->enclave.state != NOT_CONNECTED) { - // Read no more than one byte to get the message type. - int read_failed = read_from_socket(my_fed->socket, 1, buffer); - if (read_failed) { - // Socket is closed - lf_print_error("RTI: Socket to federate %d is closed. Exiting the thread.", my_fed->enclave.id); - my_fed->enclave.state = NOT_CONNECTED; - my_fed->socket = -1; - // FIXME: We need better error handling here, but do not stop execution here. - break; - } - LF_PRINT_DEBUG("RTI: Received message type %u from federate %d.", buffer[0], my_fed->enclave.id); - switch (buffer[0]) { - case MSG_TYPE_TIMESTAMP: - handle_timestamp(my_fed); - break; - case MSG_TYPE_ADDRESS_QUERY: - handle_address_query(my_fed->enclave.id); - break; - case MSG_TYPE_ADDRESS_ADVERTISEMENT: - handle_address_ad(my_fed->enclave.id); - break; - case MSG_TYPE_TAGGED_MESSAGE: - handle_timed_message(my_fed, buffer); - break; - case MSG_TYPE_RESIGN: - handle_federate_resign(my_fed); - break; - case MSG_TYPE_NEXT_EVENT_TAG: - handle_next_event_tag(my_fed); - break; - case MSG_TYPE_LATEST_TAG_COMPLETE: - handle_latest_tag_complete(my_fed); - break; - case MSG_TYPE_STOP_REQUEST: - handle_stop_request_message(my_fed); // FIXME: Reviewed until here. - // Need to also look at - // notify_advance_grant_if_safe() - // and notify_downstream_advance_grant_if_safe() - break; - case MSG_TYPE_STOP_REQUEST_REPLY: - handle_stop_request_reply(my_fed); - break; - case MSG_TYPE_PORT_ABSENT: - handle_port_absent_message(my_fed, buffer); - break; - case MSG_TYPE_FAILED: - handle_federate_failed(my_fed); - return NULL; - default: - lf_print_error("RTI received from federate %d an unrecognized TCP message type: %u.", my_fed->enclave.id, - buffer[0]); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_UNIDENTIFIED, my_fed->enclave.id, NULL); - } - } - } + // Wait for the federate to send an EOF or a socket error to occur. + // Discard any incoming bytes. 
Normally, this read should return 0 because + // the federate is resigning and should itself invoke shutdown. + unsigned char buffer[10]; + while (read(my_fed->socket, buffer, 10) > 0) + ; - // Nothing more to do. Close the socket and exit. - // Prevent multiple threads from closing the same socket at the same time. - LF_MUTEX_LOCK(&rti_mutex); - close(my_fed->socket); // from unistd.h - // Manual clean, in case of a transient federate - if (my_fed->is_transient) { - // FIXME: Aren't there transit messages anymore??? - // free_in_transit_message_q(my_fed->in_transit_message_tags); - lf_print("RTI: Transient Federate %d thread exited.", my_fed->enclave.id); + // We can now safely close the socket. + close(my_fed->socket); // from unistd.h - // Update the number of connected transient federates - rti_remote->number_of_connected_transient_federates--; + // Check downstream federates to see whether they should now be granted a TAG. + // To handle cycles, need to create a boolean array to keep + // track of which upstream federates have been visited. + bool* visited = (bool*)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. + notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); + free(visited); - // Reset the status of the leaving federate - reset_transient_federate(my_fed); - } - // Signal the hot swap mechanism, if needed - if (hot_swap_in_progress && hot_swap_federate->enclave.id == my_fed->enclave.id) { - hot_swap_old_resigned = true; + LF_MUTEX_UNLOCK(&rti_mutex); } - LF_MUTEX_UNLOCK(&rti_mutex); - return NULL; - } - - void send_reject(int* socket_id, rejection_code_t error_code) { - LF_PRINT_DEBUG("RTI sending MSG_TYPE_REJECT."); - unsigned char response[2]; - response[0] = MSG_TYPE_REJECT; - response[1] = (unsigned char)error_code; - LF_MUTEX_LOCK(&rti_mutex); - // NOTE: Ignore errors on this response. 
- if (write_to_socket(*socket_id, 2, response)) { - lf_print_warning("RTI failed to write MSG_TYPE_REJECT message on the socket."); - } - // Close the socket. - shutdown(*socket_id, SHUT_RDWR); - close(*socket_id); - *socket_id = -1; - LF_MUTEX_UNLOCK(&rti_mutex); - } - /** - * Listen for a MSG_TYPE_FED_IDS message, which includes as a payload - * a federate ID and a federation ID. If the federation ID - * matches this federation, send an MSG_TYPE_ACK and otherwise send - * a MSG_TYPE_REJECT message. - * @param socket_id Pointer to the socket on which to listen. - * @param client_fd The socket address. - * @return The federate ID for success or -1 for failure. - */ - static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_in* client_fd) { - // Buffer for message ID, federate ID, type (persistent or transient), and federation ID length. - size_t length = 1 + sizeof(uint16_t) + 1 + 1; // Message ID, federate ID, length of fedration ID, type. - unsigned char buffer[length]; + void* federate_info_thread_TCP(void* fed) { + initialize_lf_thread_id(); + federate_info_t* my_fed = (federate_info_t*)fed; + + // Buffer for incoming messages. + // This does not constrain the message size because messages + // are forwarded piece by piece. + unsigned char buffer[FED_COM_BUFFER_SIZE]; + + // Listen for messages from the federate. + while (my_fed->enclave.state != NOT_CONNECTED) { + // Read no more than one byte to get the message type. + int read_failed = read_from_socket(my_fed->socket, 1, buffer); + if (read_failed) { + // Socket is closed + lf_print_error("RTI: Socket to federate %d is closed. Exiting the thread.", my_fed->enclave.id); + my_fed->enclave.state = NOT_CONNECTED; + my_fed->socket = -1; + // FIXME: We need better error handling here, but do not stop execution here. 
+ break; + } + LF_PRINT_DEBUG("RTI: Received message type %u from federate %d.", buffer[0], my_fed->enclave.id); + switch (buffer[0]) { + case MSG_TYPE_TIMESTAMP: + handle_timestamp(my_fed); + break; + case MSG_TYPE_ADDRESS_QUERY: + handle_address_query(my_fed->enclave.id); + break; + case MSG_TYPE_ADDRESS_ADVERTISEMENT: + handle_address_ad(my_fed->enclave.id); + break; + case MSG_TYPE_TAGGED_MESSAGE: + handle_timed_message(my_fed, buffer); + break; + case MSG_TYPE_RESIGN: + handle_federate_resign(my_fed); + break; + case MSG_TYPE_NEXT_EVENT_TAG: + handle_next_event_tag(my_fed); + break; + case MSG_TYPE_LATEST_TAG_COMPLETE: + handle_latest_tag_complete(my_fed); + break; + case MSG_TYPE_STOP_REQUEST: + handle_stop_request_message(my_fed); // FIXME: Reviewed until here. + // Need to also look at + // notify_advance_grant_if_safe() + // and notify_downstream_advance_grant_if_safe() + break; + case MSG_TYPE_STOP_REQUEST_REPLY: + handle_stop_request_reply(my_fed); + break; + case MSG_TYPE_PORT_ABSENT: + handle_port_absent_message(my_fed, buffer); + break; + case MSG_TYPE_FAILED: + handle_federate_failed(my_fed); + return NULL; + default: + lf_print_error("RTI received from federate %d an unrecognized TCP message type: %u.", my_fed->enclave.id, + buffer[0]); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_UNIDENTIFIED, my_fed->enclave.id, NULL); + } + } + } - // Read bytes from the socket. We need 4 bytes. - if (read_from_socket_close_on_error(socket_id, length, buffer)) { - lf_print_error("RTI failed to read from accepted socket."); - return -1; + // Nothing more to do. Close the socket and exit. + // Prevent multiple threads from closing the same socket at the same time. + LF_MUTEX_LOCK(&rti_mutex); + close(my_fed->socket); // from unistd.h + // Manual clean, in case of a transient federate + if (my_fed->is_transient) { + // FIXME: Aren't there transit messages anymore??? 
+ // free_in_transit_message_q(my_fed->in_transit_message_tags); + lf_print("RTI: Transient Federate %d thread exited.", my_fed->enclave.id); + + // Update the number of connected transient federates + rti_remote->number_of_connected_transient_federates--; + + // Reset the status of the leaving federate + reset_transient_federate(my_fed); + } + // Signal the hot swap mechanism, if needed + if (hot_swap_in_progress && hot_swap_federate->enclave.id == my_fed->enclave.id) { + hot_swap_old_resigned = true; + } + LF_MUTEX_UNLOCK(&rti_mutex); + return NULL; } - uint16_t fed_id = rti_remote->base.number_of_scheduling_nodes; // Initialize to an invalid value. - bool is_transient = false; - - // First byte received is the message type. - if (buffer[0] != MSG_TYPE_FED_IDS) { - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); - } - if (buffer[0] == MSG_TYPE_P2P_SENDING_FED_ID || buffer[0] == MSG_TYPE_P2P_TAGGED_MESSAGE) { - // The federate is trying to connect to a peer, not to the RTI. - // It has connected to the RTI instead. - // FIXME: This should not happen, but apparently has been observed. - // It should not happen because the peers get the port and IP address - // of the peer they want to connect to from the RTI. - // If the connection is a peer-to-peer connection between two - // federates, reject the connection with the WRONG_SERVER error. - send_reject(socket_id, WRONG_SERVER); - } else if (buffer[0] == MSG_TYPE_FED_NONCE) { - send_reject(socket_id, RTI_NOT_EXECUTED_WITH_AUTH); - lf_print_error("RTI not executed with HMAC authentication option using -a or --auth."); - } else { - send_reject(socket_id, UNEXPECTED_MESSAGE); - } - lf_print_error("RTI expected a MSG_TYPE_FED_IDS message. Got %u (see net_common.h).", buffer[0]); - return -1; - } else { - // Received federate ID. - fed_id = extract_uint16(buffer + 1); - is_transient = (buffer[sizeof(uint16_t) + 1] == 1) ? 
true : false; - if (is_transient) { - LF_PRINT_LOG("RTI received federate ID: %d, which is transient.", fed_id); - } else { - LF_PRINT_LOG("RTI received federate ID: %d, which is persistent.", fed_id); + void send_reject(int* socket_id, rejection_code_t error_code) { + LF_PRINT_DEBUG("RTI sending MSG_TYPE_REJECT."); + unsigned char response[2]; + response[0] = MSG_TYPE_REJECT; + response[1] = (unsigned char)error_code; + LF_MUTEX_LOCK(&rti_mutex); + // NOTE: Ignore errors on this response. + if (write_to_socket(*socket_id, 2, response)) { + lf_print_warning("RTI failed to write MSG_TYPE_REJECT message on the socket."); } + // Close the socket. + shutdown(*socket_id, SHUT_RDWR); + close(*socket_id); + *socket_id = -1; + LF_MUTEX_UNLOCK(&rti_mutex); + } - // Read the federation ID. First read the length, which is one byte. - size_t federation_id_length = (size_t)buffer[sizeof(uint16_t) + 2]; - char federation_id_received[federation_id_length + 1]; // One extra for null terminator. - // Next read the actual federation ID. - if (read_from_socket_close_on_error(socket_id, federation_id_length, (unsigned char*)federation_id_received)) { - lf_print_error("RTI failed to read federation id from federate %d.", fed_id); + /** + * Listen for a MSG_TYPE_FED_IDS message, which includes as a payload + * a federate ID and a federation ID. If the federation ID + * matches this federation, send an MSG_TYPE_ACK and otherwise send + * a MSG_TYPE_REJECT message. + * @param socket_id Pointer to the socket on which to listen. + * @param client_fd The socket address. + * @return The federate ID for success or -1 for failure. + */ + static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_in* client_fd) { + // Buffer for message ID, federate ID, type (persistent or transient), and federation ID length. + size_t length = 1 + sizeof(uint16_t) + 1 + 1; // Message ID, federate ID, length of fedration ID, type. 
+ unsigned char buffer[length]; + + // Read bytes from the socket. We need 4 bytes. + if (read_from_socket_close_on_error(socket_id, length, buffer)) { + lf_print_error("RTI failed to read from accepted socket."); return -1; } - // Terminate the string with a null. - federation_id_received[federation_id_length] = 0; - - LF_PRINT_DEBUG("RTI received federation ID: %s.", federation_id_received); + uint16_t fed_id = rti_remote->base.number_of_scheduling_nodes; // Initialize to an invalid value. + bool is_transient = false; - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(rti_remote->base.trace, receive_FED_ID, fed_id, NULL); - } - // Compare the received federation ID to mine. - if (strncmp(rti_remote->federation_id, federation_id_received, federation_id_length) != 0) { - // Federation IDs do not match. Send back a MSG_TYPE_REJECT message. - lf_print_warning("Federate from another federation %s attempted to connect to RTI in federation %s.", - federation_id_received, rti_remote->federation_id); + // First byte received is the message type. + if (buffer[0] != MSG_TYPE_FED_IDS) { if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); } - send_reject(socket_id, FEDERATION_ID_DOES_NOT_MATCH); + if (buffer[0] == MSG_TYPE_P2P_SENDING_FED_ID || buffer[0] == MSG_TYPE_P2P_TAGGED_MESSAGE) { + // The federate is trying to connect to a peer, not to the RTI. + // It has connected to the RTI instead. + // FIXME: This should not happen, but apparently has been observed. + // It should not happen because the peers get the port and IP address + // of the peer they want to connect to from the RTI. + // If the connection is a peer-to-peer connection between two + // federates, reject the connection with the WRONG_SERVER error. 
+ send_reject(socket_id, WRONG_SERVER); + } else if (buffer[0] == MSG_TYPE_FED_NONCE) { + send_reject(socket_id, RTI_NOT_EXECUTED_WITH_AUTH); + lf_print_error("RTI not executed with HMAC authentication option using -a or --auth."); + } else { + send_reject(socket_id, UNEXPECTED_MESSAGE); + } + lf_print_error("RTI expected a MSG_TYPE_FED_IDS message. Got %u (see net_common.h).", buffer[0]); return -1; } else { - if (fed_id >= rti_remote->base.number_of_scheduling_nodes) { - // Federate ID is out of range. - lf_print_error("RTI received federate ID %d, which is out of range.", fed_id); + // Received federate ID. + fed_id = extract_uint16(buffer + 1); + is_transient = (buffer[sizeof(uint16_t) + 1] == 1) ? true : false; + if (is_transient) { + LF_PRINT_LOG("RTI received federate ID: %d, which is transient.", fed_id); + } else { + LF_PRINT_LOG("RTI received federate ID: %d, which is persistent.", fed_id); + } + + // Read the federation ID. First read the length, which is one byte. + size_t federation_id_length = (size_t)buffer[sizeof(uint16_t) + 2]; + char federation_id_received[federation_id_length + 1]; // One extra for null terminator. + // Next read the actual federation ID. + if (read_from_socket_close_on_error(socket_id, federation_id_length, (unsigned char*)federation_id_received)) { + lf_print_error("RTI failed to read federation id from federate %d.", fed_id); + return -1; + } + + // Terminate the string with a null. + federation_id_received[federation_id_length] = 0; + + LF_PRINT_DEBUG("RTI received federation ID: %s.", federation_id_received); + + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(rti_remote->base.trace, receive_FED_ID, fed_id, NULL); + } + // Compare the received federation ID to mine. + if (strncmp(rti_remote->federation_id, federation_id_received, federation_id_length) != 0) { + // Federation IDs do not match. Send back a MSG_TYPE_REJECT message. 
+ lf_print_warning("Federate from another federation %s attempted to connect to RTI in federation %s.", + federation_id_received, rti_remote->federation_id); if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); } - send_reject(socket_id, FEDERATE_ID_OUT_OF_RANGE); + send_reject(socket_id, FEDERATION_ID_DOES_NOT_MATCH); return -1; } else { - // Find out if it is a new connection or a hot swap. - // Reject if: - // - duplicate of a connected persistent federate - // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that - // particular federate - // - or it is a hot swap, but it is not the execution phase yet - // Find out if it is a new connection or a hot swap. - // Reject if: - // - duplicate of a connected persistent federate - // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that - // particular federate - // - or it is a hot swap, but it is not the execution phase yet - if ((rti_remote->base.scheduling_nodes[fed_id])->state != NOT_CONNECTED) { - if (!is_transient) { - lf_print_error("RTI received duplicate federate ID: %d.", fed_id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); - } - send_reject(socket_id, FEDERATE_ID_IN_USE); - return -1; - } else if (hot_swap_in_progress || rti_remote->phase != execution_phase) { - lf_print_warning("RTI rejects the connection of transient federate %d, \ + if (fed_id >= rti_remote->base.number_of_scheduling_nodes) { + // Federate ID is out of range. + lf_print_error("RTI received federate ID %d, which is out of range.", fed_id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); + } + send_reject(socket_id, FEDERATE_ID_OUT_OF_RANGE); + return -1; + } else { + // Find out if it is a new connection or a hot swap. 
+ // Reject if: + // - duplicate of a connected persistent federate + // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that + // particular federate + // - or it is a hot swap, but it is not the execution phase yet + // Find out if it is a new connection or a hot swap. + // Reject if: + // - duplicate of a connected persistent federate + // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that + // particular federate + // - or it is a hot swap, but it is not the execution phase yet + if ((rti_remote->base.scheduling_nodes[fed_id])->state != NOT_CONNECTED) { + if (!is_transient) { + lf_print_error("RTI received duplicate federate ID: %d.", fed_id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + } + send_reject(socket_id, FEDERATE_ID_IN_USE); + return -1; + } else if (hot_swap_in_progress || rti_remote->phase != execution_phase) { + lf_print_warning("RTI rejects the connection of transient federate %d, \ because a hot swap is already in progress for federate %d. \n\ Only one hot swap operation is allowed at a time.", - fed_id, hot_swap_federate->enclave.id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + fed_id, hot_swap_federate->enclave.id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + } + send_reject(socket_id, FEDERATE_ID_IN_USE); + return -1; } - send_reject(socket_id, FEDERATE_ID_IN_USE); - return -1; } } } } - } - - federate_info_t* fed_twin = GET_FED_INFO(fed_id); - federate_info_t* fed; - // If the federate is already connected (making the request a duplicate), and that - // the federate is transient, and it is the execution phase, then mark that a hot - // swap is in progreass and initialize the hot_swap_federate. 
- // Otherwise, proceed with a normal transinet connection - if (fed_twin->enclave.state != NOT_CONNECTED && is_transient && fed_twin->is_transient && - rti_remote->phase == execution_phase && !hot_swap_in_progress) { - // Allocate memory for the new federate and initilize it - hot_swap_federate = (federate_info_t*)malloc(sizeof(federate_info_t)); - initialize_federate(hot_swap_federate, fed_id); - - // Set that hot swap is in progress - hot_swap_in_progress = true; - // free(fed); // Free the old memory to prevent memory leak - fed = hot_swap_federate; - lf_print("RTI: Hot Swap starting for federate %d.", fed_id); - } else { - fed = fed_twin; - fed->is_transient = is_transient; - } - - // The MSG_TYPE_FED_IDS message has the right federation ID. - // Get the peer address from the connected socket_id. Then assign it as the federate's socket server. - struct sockaddr_in peer_addr; - socklen_t addr_len = sizeof(peer_addr); - if (getpeername(*socket_id, (struct sockaddr*)&peer_addr, &addr_len) != 0) { - lf_print_error("RTI failed to get peer address."); - } - fed->server_ip_addr = peer_addr.sin_addr; - -#if LOG_LEVEL >= LOG_LEVEL_DEBUG - // Create the human readable format and copy that into - // the .server_hostname field of the federate. - char str[INET_ADDRSTRLEN + 1]; - inet_ntop(AF_INET, &fed->server_ip_addr, str, INET_ADDRSTRLEN); - strncpy(fed->server_hostname, str, INET_ADDRSTRLEN); - - LF_PRINT_DEBUG("RTI got address %s from federate %d.", fed->server_hostname, fed_id); -#endif - fed->socket = *socket_id; - - // Set the federate's state as pending - // because it is waiting for the start time to be - // sent by the RTI before beginning its execution. - fed->enclave.state = PENDING; - - LF_PRINT_DEBUG("RTI responding with MSG_TYPE_ACK to federate %d.", fed_id); - // Send an MSG_TYPE_ACK message. 
- unsigned char ack_message = MSG_TYPE_ACK; - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_ACK, fed_id, NULL); - } - LF_MUTEX_LOCK(&rti_mutex); - if (write_to_socket_close_on_error(&fed->socket, 1, &ack_message)) { - LF_MUTEX_UNLOCK(&rti_mutex); - lf_print_error("RTI failed to write MSG_TYPE_ACK message to federate %d.", fed_id); - return -1; - } - LF_MUTEX_UNLOCK(&rti_mutex); - - LF_PRINT_DEBUG("RTI sent MSG_TYPE_ACK to federate %d.", fed_id); + federate_info_t* fed_twin = GET_FED_INFO(fed_id); + federate_info_t* fed; + // If the federate is already connected (making the request a duplicate), and that + // the federate is transient, and it is the execution phase, then mark that a hot + // swap is in progreass and initialize the hot_swap_federate. + // Otherwise, proceed with a normal transinet connection + if (fed_twin->enclave.state != NOT_CONNECTED && is_transient && fed_twin->is_transient && + rti_remote->phase == execution_phase && !hot_swap_in_progress) { + // Allocate memory for the new federate and initilize it + hot_swap_federate = (federate_info_t*)malloc(sizeof(federate_info_t)); + initialize_federate(hot_swap_federate, fed_id); + + // Set that hot swap is in progress + hot_swap_in_progress = true; + // free(fed); // Free the old memory to prevent memory leak + fed = hot_swap_federate; + lf_print("RTI: Hot Swap starting for federate %d.", fed_id); + } else { + fed = fed_twin; + fed->is_transient = is_transient; + } - return (int32_t)fed_id; - } + // The MSG_TYPE_FED_IDS message has the right federation ID. - /** - * Listen for a MSG_TYPE_NEIGHBOR_STRUCTURE message, and upon receiving it, fill - * out the relevant information in the federate's struct. - * - * In case of a hot swap, check that no changes were made to the connections, compared - * to the first instance that joigned. This means that the first instance to join - * __is__ the reference. - * - * @return 1 on success and 0 on failure. 
- */ - static int receive_connection_information(int* socket_id, uint16_t fed_id) { - LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_NEIGHBOR_STRUCTURE from federate %d.", fed_id); - unsigned char connection_info_header[MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE]; - read_from_socket_fail_on_error(socket_id, MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE, connection_info_header, NULL, - "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message header from federate %d.", - fed_id); - - if (connection_info_header[0] != MSG_TYPE_NEIGHBOR_STRUCTURE) { - lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " - "Rejecting federate.", - fed_id, connection_info_header[0]); - send_reject(socket_id, UNEXPECTED_MESSAGE); - return 0; - } else { - // In case of a transient federate that is joining again, or a hot swap, then - // check that the connection information did not change. - federate_info_t* fed = GET_FED_INFO(fed_id); - federate_info_t* temp_fed = NULL; - if (lf_tag_compare(fed->effective_start_tag, NEVER_TAG) != 0) { - if (hot_swap_in_progress) { - fed = hot_swap_federate; - } else { - temp_fed = (federate_info_t*)calloc(1, sizeof(federate_info_t)); - initialize_federate(temp_fed, fed_id); - fed = temp_fed; - } - } - // Read the number of upstream and downstream connections - fed->enclave.num_upstream = extract_int32(&(connection_info_header[1])); - fed->enclave.num_downstream = extract_int32(&(connection_info_header[1 + sizeof(int32_t)])); - LF_PRINT_DEBUG("RTI got %d upstreams and %d downstreams from federate %d.", fed->enclave.num_upstream, - fed->enclave.num_downstream, fed_id); - - // Allocate memory for the upstream and downstream pointers - if (fed->enclave.num_upstream > 0) { - fed->enclave.upstream = (uint16_t*)malloc(sizeof(uint16_t) * fed->enclave.num_upstream); - LF_ASSERT_NON_NULL(fed->enclave.upstream); - // Allocate memory for the upstream delay pointers - fed->enclave.upstream_delay = (interval_t*)malloc(sizeof(interval_t) * 
fed->enclave.num_upstream); - LF_ASSERT_NON_NULL(fed->enclave.upstream_delay); - } else { - fed->enclave.upstream = (uint16_t*)NULL; - fed->enclave.upstream_delay = (interval_t*)NULL; + // Get the peer address from the connected socket_id. Then assign it as the federate's socket server. + struct sockaddr_in peer_addr; + socklen_t addr_len = sizeof(peer_addr); + if (getpeername(*socket_id, (struct sockaddr*)&peer_addr, &addr_len) != 0) { + lf_print_error("RTI failed to get peer address."); } - if (fed->enclave.num_downstream > 0) { - fed->enclave.downstream = (uint16_t*)malloc(sizeof(uint16_t) * fed->enclave.num_downstream); - LF_ASSERT_NON_NULL(fed->enclave.downstream); - } else { - fed->enclave.downstream = (uint16_t*)NULL; - } - - size_t connections_info_body_size = ((sizeof(uint16_t) + sizeof(int64_t)) * fed->enclave.num_upstream) + - (sizeof(uint16_t) * fed->enclave.num_downstream); - unsigned char* connections_info_body = NULL; - if (connections_info_body_size > 0) { - connections_info_body = (unsigned char*)malloc(connections_info_body_size); - LF_ASSERT_NON_NULL(connections_info_body); - read_from_socket_fail_on_error(socket_id, connections_info_body_size, connections_info_body, NULL, - "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message body from federate %d.", - fed_id); - // Keep track of where we are in the buffer - size_t message_head = 0; - // First, read the info about upstream federates - for (int i = 0; i < fed->enclave.num_upstream; i++) { - fed->enclave.upstream[i] = extract_uint16(&(connections_info_body[message_head])); - message_head += sizeof(uint16_t); - fed->enclave.upstream_delay[i] = extract_int64(&(connections_info_body[message_head])); - message_head += sizeof(int64_t); - } + fed->server_ip_addr = peer_addr.sin_addr; - // Next, read the info about downstream federates - for (int i = 0; i < fed->enclave.num_downstream; i++) { - fed->enclave.downstream[i] = extract_uint16(&(connections_info_body[message_head])); - message_head += 
sizeof(uint16_t); - } +#if LOG_LEVEL >= LOG_LEVEL_DEBUG + // Create the human readable format and copy that into + // the .server_hostname field of the federate. + char str[INET_ADDRSTRLEN + 1]; + inet_ntop(AF_INET, &fed->server_ip_addr, str, INET_ADDRSTRLEN); + strncpy(fed->server_hostname, str, INET_ADDRSTRLEN); - free(connections_info_body); + LF_PRINT_DEBUG("RTI got address %s from federate %d.", fed->server_hostname, fed_id); +#endif + fed->socket = *socket_id; + + // Set the federate's state as pending + // because it is waiting for the start time to be + // sent by the RTI before beginning its execution. + fed->enclave.state = PENDING; + + LF_PRINT_DEBUG("RTI responding with MSG_TYPE_ACK to federate %d.", fed_id); + // Send an MSG_TYPE_ACK message. + unsigned char ack_message = MSG_TYPE_ACK; + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_ACK, fed_id, NULL); } + LF_MUTEX_LOCK(&rti_mutex); + if (write_to_socket_close_on_error(&fed->socket, 1, &ack_message)) { + LF_MUTEX_UNLOCK(&rti_mutex); + lf_print_error("RTI failed to write MSG_TYPE_ACK message to federate %d.", fed_id); + return -1; + } + LF_MUTEX_UNLOCK(&rti_mutex); - // NOTE: In this design, changes in the connections are not allowed. This means that the first - // instance to join __is__ the reference. If this policy is to be changed, then it is in - // the following lines will be updated accordingly. - if (hot_swap_in_progress || temp_fed != NULL) { - if (temp_fed == NULL) { - temp_fed = hot_swap_federate; + LF_PRINT_DEBUG("RTI sent MSG_TYPE_ACK to federate %d.", fed_id); + + return (int32_t)fed_id; + } + + /** + * Listen for a MSG_TYPE_NEIGHBOR_STRUCTURE message, and upon receiving it, fill + * out the relevant information in the federate's struct. + * + * In case of a hot swap, check that no changes were made to the connections, compared + * to the first instance that joigned. This means that the first instance to join + * __is__ the reference. 
+ * + * @return 1 on success and 0 on failure. + */ + static int receive_connection_information(int* socket_id, uint16_t fed_id) { + LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_NEIGHBOR_STRUCTURE from federate %d.", fed_id); + unsigned char connection_info_header[MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE]; + read_from_socket_fail_on_error(socket_id, MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE, connection_info_header, NULL, + "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message header from federate %d.", + fed_id); + + if (connection_info_header[0] != MSG_TYPE_NEIGHBOR_STRUCTURE) { + lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " + "Rejecting federate.", + fed_id, connection_info_header[0]); + send_reject(socket_id, UNEXPECTED_MESSAGE); + return 0; + } else { + // In case of a transient federate that is joining again, or a hot swap, then + // check that the connection information did not change. + federate_info_t* fed = GET_FED_INFO(fed_id); + federate_info_t* temp_fed = NULL; + if (lf_tag_compare(fed->effective_start_tag, NEVER_TAG) != 0) { + if (hot_swap_in_progress) { + fed = hot_swap_federate; + } else { + temp_fed = (federate_info_t*)calloc(1, sizeof(federate_info_t)); + initialize_federate(temp_fed, fed_id); + fed = temp_fed; + } + } + // Read the number of upstream and downstream connections + fed->enclave.num_upstream = extract_int32(&(connection_info_header[1])); + fed->enclave.num_downstream = extract_int32(&(connection_info_header[1 + sizeof(int32_t)])); + LF_PRINT_DEBUG("RTI got %d upstreams and %d downstreams from federate %d.", fed->enclave.num_upstream, + fed->enclave.num_downstream, fed_id); + + // Allocate memory for the upstream and downstream pointers + if (fed->enclave.num_upstream > 0) { + fed->enclave.upstream = (uint16_t*)malloc(sizeof(uint16_t) * fed->enclave.num_upstream); + LF_ASSERT_NON_NULL(fed->enclave.upstream); + // Allocate memory for the upstream delay pointers + 
fed->enclave.upstream_delay = (interval_t*)malloc(sizeof(interval_t) * fed->enclave.num_upstream); + LF_ASSERT_NON_NULL(fed->enclave.upstream_delay); + } else { + fed->enclave.upstream = (uint16_t*)NULL; + fed->enclave.upstream_delay = (interval_t*)NULL; } - // Now, compare the previous and the new neighberhood structure - // Start with the number of upstreams and downstreams - bool reject = false; - if ((fed->enclave.num_upstream != temp_fed->enclave.num_upstream) || - (fed->enclave.num_downstream != temp_fed->enclave.num_downstream)) { - reject = true; + if (fed->enclave.num_downstream > 0) { + fed->enclave.downstream = (uint16_t*)malloc(sizeof(uint16_t) * fed->enclave.num_downstream); + LF_ASSERT_NON_NULL(fed->enclave.downstream); } else { - // Then check all upstreams and their delays + fed->enclave.downstream = (uint16_t*)NULL; + } + + size_t connections_info_body_size = ((sizeof(uint16_t) + sizeof(int64_t)) * fed->enclave.num_upstream) + + (sizeof(uint16_t) * fed->enclave.num_downstream); + unsigned char* connections_info_body = NULL; + if (connections_info_body_size > 0) { + connections_info_body = (unsigned char*)malloc(connections_info_body_size); + LF_ASSERT_NON_NULL(connections_info_body); + read_from_socket_fail_on_error( + socket_id, connections_info_body_size, connections_info_body, NULL, + "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message body from federate %d.", fed_id); + // Keep track of where we are in the buffer + size_t message_head = 0; + // First, read the info about upstream federates for (int i = 0; i < fed->enclave.num_upstream; i++) { - if ((fed->enclave.upstream[i] != temp_fed->enclave.upstream[i]) || - (fed->enclave.upstream_delay[i] != temp_fed->enclave.upstream_delay[i])) { - reject = true; - break; - } + fed->enclave.upstream[i] = extract_uint16(&(connections_info_body[message_head])); + message_head += sizeof(uint16_t); + fed->enclave.upstream_delay[i] = extract_int64(&(connections_info_body[message_head])); + message_head += 
sizeof(int64_t); + } + + // Next, read the info about downstream federates + for (int i = 0; i < fed->enclave.num_downstream; i++) { + fed->enclave.downstream[i] = extract_uint16(&(connections_info_body[message_head])); + message_head += sizeof(uint16_t); + } + + free(connections_info_body); + } + + // NOTE: In this design, changes in the connections are not allowed. This means that the first + // instance to join __is__ the reference. If this policy is to be changed, then it is in + // the following lines will be updated accordingly. + if (hot_swap_in_progress || temp_fed != NULL) { + if (temp_fed == NULL) { + temp_fed = hot_swap_federate; } - if (!reject) { - // Finally, check all downstream federates - for (int i = 0; i < fed->enclave.num_downstream; i++) { - if (fed->enclave.downstream[i] != temp_fed->enclave.downstream[i]) { + // Now, compare the previous and the new neighberhood structure + // Start with the number of upstreams and downstreams + bool reject = false; + if ((fed->enclave.num_upstream != temp_fed->enclave.num_upstream) || + (fed->enclave.num_downstream != temp_fed->enclave.num_downstream)) { + reject = true; + } else { + // Then check all upstreams and their delays + for (int i = 0; i < fed->enclave.num_upstream; i++) { + if ((fed->enclave.upstream[i] != temp_fed->enclave.upstream[i]) || + (fed->enclave.upstream_delay[i] != temp_fed->enclave.upstream_delay[i])) { reject = true; break; } } + if (!reject) { + // Finally, check all downstream federates + for (int i = 0; i < fed->enclave.num_downstream; i++) { + if (fed->enclave.downstream[i] != temp_fed->enclave.downstream[i]) { + reject = true; + break; + } + } + } } - } - if (reject) { - if (temp_fed != hot_swap_federate) { - free(temp_fed); + if (reject) { + if (temp_fed != hot_swap_federate) { + free(temp_fed); + } + return 0; } - return 0; } } + LF_PRINT_DEBUG("RTI received neighbor structure from federate %d.", fed_id); + return 1; } - LF_PRINT_DEBUG("RTI received neighbor structure from 
federate %d.", fed_id); - return 1; - } - /** - * Listen for a MSG_TYPE_UDP_PORT message, and upon receiving it, set up - * clock synchronization and perform the initial clock synchronization. - * Initial clock synchronization is performed only if the MSG_TYPE_UDP_PORT message - * payload is not UINT16_MAX. If it is also not 0, then this function sets - * up to perform runtime clock synchronization using the UDP port number - * specified in the payload to communicate with the federate's clock - * synchronization logic. - * @param socket_id The socket on which to listen. - * @param fed_id The federate ID. - * @return 1 for success, 0 for failure. - */ - static int receive_udp_message_and_set_up_clock_sync(int* socket_id, uint16_t fed_id) { - // Read the MSG_TYPE_UDP_PORT message from the federate regardless of the status of - // clock synchronization. This message will tell the RTI whether the federate - // is doing clock synchronization, and if it is, what port to use for UDP. - LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_UDP_PORT from federate %d.", fed_id); - unsigned char response[1 + sizeof(uint16_t)]; - read_from_socket_fail_on_error(socket_id, 1 + sizeof(uint16_t), response, NULL, - "RTI failed to read MSG_TYPE_UDP_PORT message from federate %d.", fed_id); - if (response[0] != MSG_TYPE_UDP_PORT) { - lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " - "Rejecting federate.", - fed_id, response[0]); - send_reject(socket_id, UNEXPECTED_MESSAGE); - return 0; - } else { - federate_info_t* fed; - if (hot_swap_in_progress) { - fed = hot_swap_federate; + /** + * Listen for a MSG_TYPE_UDP_PORT message, and upon receiving it, set up + * clock synchronization and perform the initial clock synchronization. + * Initial clock synchronization is performed only if the MSG_TYPE_UDP_PORT message + * payload is not UINT16_MAX. 
If it is also not 0, then this function sets + * up to perform runtime clock synchronization using the UDP port number + * specified in the payload to communicate with the federate's clock + * synchronization logic. + * @param socket_id The socket on which to listen. + * @param fed_id The federate ID. + * @return 1 for success, 0 for failure. + */ + static int receive_udp_message_and_set_up_clock_sync(int* socket_id, uint16_t fed_id) { + // Read the MSG_TYPE_UDP_PORT message from the federate regardless of the status of + // clock synchronization. This message will tell the RTI whether the federate + // is doing clock synchronization, and if it is, what port to use for UDP. + LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_UDP_PORT from federate %d.", fed_id); + unsigned char response[1 + sizeof(uint16_t)]; + read_from_socket_fail_on_error(socket_id, 1 + sizeof(uint16_t), response, NULL, + "RTI failed to read MSG_TYPE_UDP_PORT message from federate %d.", fed_id); + if (response[0] != MSG_TYPE_UDP_PORT) { + lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " + "Rejecting federate.", + fed_id, response[0]); + send_reject(socket_id, UNEXPECTED_MESSAGE); + return 0; } else { - fed = GET_FED_INFO(fed_id); - } - if (rti_remote->clock_sync_global_status >= clock_sync_init) { - // If no initial clock sync, no need perform initial clock sync. - uint16_t federate_UDP_port_number = extract_uint16(&(response[1])); - - LF_PRINT_DEBUG("RTI got MSG_TYPE_UDP_PORT %u from federate %d.", federate_UDP_port_number, fed_id); - - // A port number of UINT16_MAX means initial clock sync should not be performed. - if (federate_UDP_port_number != UINT16_MAX) { - // Perform the initialization clock synchronization with the federate. - // Send the required number of messages for clock synchronization - for (int i = 0; i < rti_remote->clock_sync_exchanges_per_interval; i++) { - // Send the RTI's current physical time T1 to the federate. 
- send_physical_clock(MSG_TYPE_CLOCK_SYNC_T1, fed, TCP); - - // Listen for reply message, which should be T3. - size_t message_size = 1 + sizeof(uint16_t); - unsigned char buffer[message_size]; - read_from_socket_fail_on_error(socket_id, message_size, buffer, NULL, - "Socket to federate %d unexpectedly closed.", fed_id); - if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { - uint16_t fed_id = extract_uint16(&(buffer[1])); - LF_PRINT_DEBUG("RTI received T3 clock sync message from federate %d.", fed_id); - handle_physical_clock_sync_message(fed, TCP); - } else { - lf_print_error("Unexpected message %u from federate %d.", buffer[0], fed_id); - send_reject(socket_id, UNEXPECTED_MESSAGE); - return 0; + federate_info_t* fed; + if (hot_swap_in_progress) { + fed = hot_swap_federate; + } else { + fed = GET_FED_INFO(fed_id); + } + if (rti_remote->clock_sync_global_status >= clock_sync_init) { + // If no initial clock sync, no need perform initial clock sync. + uint16_t federate_UDP_port_number = extract_uint16(&(response[1])); + + LF_PRINT_DEBUG("RTI got MSG_TYPE_UDP_PORT %u from federate %d.", federate_UDP_port_number, fed_id); + + // A port number of UINT16_MAX means initial clock sync should not be performed. + if (federate_UDP_port_number != UINT16_MAX) { + // Perform the initialization clock synchronization with the federate. + // Send the required number of messages for clock synchronization + for (int i = 0; i < rti_remote->clock_sync_exchanges_per_interval; i++) { + // Send the RTI's current physical time T1 to the federate. + send_physical_clock(MSG_TYPE_CLOCK_SYNC_T1, fed, TCP); + + // Listen for reply message, which should be T3. 
+ size_t message_size = 1 + sizeof(uint16_t); + unsigned char buffer[message_size]; + read_from_socket_fail_on_error(socket_id, message_size, buffer, NULL, + "Socket to federate %d unexpectedly closed.", fed_id); + if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { + uint16_t fed_id = extract_uint16(&(buffer[1])); + LF_PRINT_DEBUG("RTI received T3 clock sync message from federate %d.", fed_id); + handle_physical_clock_sync_message(fed, TCP); + } else { + lf_print_error("Unexpected message %u from federate %d.", buffer[0], fed_id); + send_reject(socket_id, UNEXPECTED_MESSAGE); + return 0; + } } + LF_PRINT_DEBUG("RTI finished initial clock synchronization with federate %d.", fed_id); } - LF_PRINT_DEBUG("RTI finished initial clock synchronization with federate %d.", fed_id); - } - if (rti_remote->clock_sync_global_status >= clock_sync_on) { - // If no runtime clock sync, no need to set up the UDP port. - if (federate_UDP_port_number > 0) { - // Initialize the UDP_addr field of the federate struct - fed->UDP_addr.sin_family = AF_INET; - fed->UDP_addr.sin_port = htons(federate_UDP_port_number); - fed->UDP_addr.sin_addr = fed->server_ip_addr; + if (rti_remote->clock_sync_global_status >= clock_sync_on) { + // If no runtime clock sync, no need to set up the UDP port. + if (federate_UDP_port_number > 0) { + // Initialize the UDP_addr field of the federate struct + fed->UDP_addr.sin_family = AF_INET; + fed->UDP_addr.sin_port = htons(federate_UDP_port_number); + fed->UDP_addr.sin_addr = fed->server_ip_addr; + } + } else { + // Disable clock sync after initial round. + fed->clock_synchronization_enabled = false; } } else { - // Disable clock sync after initial round. + // No clock synchronization at all. + LF_PRINT_DEBUG("RTI: No clock synchronization for federate %d.", fed_id); + // Clock synchronization is universally disabled via the clock-sync command-line parameter + // (-c off was passed to the RTI). 
+ // Note that the federates are still going to send a + // MSG_TYPE_UDP_PORT message but with a payload (port) of -1. fed->clock_synchronization_enabled = false; } - } else { - // No clock synchronization at all. - LF_PRINT_DEBUG("RTI: No clock synchronization for federate %d.", fed_id); - // Clock synchronization is universally disabled via the clock-sync command-line parameter - // (-c off was passed to the RTI). - // Note that the federates are still going to send a - // MSG_TYPE_UDP_PORT message but with a payload (port) of -1. - fed->clock_synchronization_enabled = false; } + return 1; } - return 1; - } #ifdef __RTI_AUTH__ - /** - * Authenticate incoming federate by performing HMAC-based authentication. - * - * @param socket Socket for the incoming federate tryting to authenticate. - * @return True if authentication is successful and false otherwise. - */ - static bool authenticate_federate(int* socket) { - // Wait for MSG_TYPE_FED_NONCE from federate. - size_t fed_id_length = sizeof(uint16_t); - unsigned char buffer[1 + fed_id_length + NONCE_LENGTH]; - read_from_socket_fail_on_error(socket, 1 + fed_id_length + NONCE_LENGTH, buffer, NULL, - "Failed to read MSG_TYPE_FED_NONCE"); - if (buffer[0] != MSG_TYPE_FED_NONCE) { - lf_print_error_and_exit("Received unexpected response %u from the FED (see net_common.h).", buffer[0]); - } - unsigned int hmac_length = SHA256_HMAC_LENGTH; - size_t federation_id_length = strnlen(rti_remote->federation_id, 255); - // HMAC tag is created with MSG_TYPE, federate ID, received federate nonce. 
- unsigned char mac_buf[1 + fed_id_length + NONCE_LENGTH]; - mac_buf[0] = MSG_TYPE_RTI_RESPONSE; - memcpy(&mac_buf[1], &buffer[1], fed_id_length); - memcpy(&mac_buf[1 + fed_id_length], &buffer[1 + fed_id_length], NONCE_LENGTH); - unsigned char hmac_tag[hmac_length]; - unsigned char* ret = HMAC(EVP_sha256(), rti_remote->federation_id, federation_id_length, mac_buf, - 1 + fed_id_length + NONCE_LENGTH, hmac_tag, &hmac_length); - if (ret == NULL) { - lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_RTI_RESPONSE."); - } - // Make buffer for message type, RTI's nonce, and HMAC tag. - unsigned char sender[1 + NONCE_LENGTH + hmac_length]; - sender[0] = MSG_TYPE_RTI_RESPONSE; - unsigned char rti_nonce[NONCE_LENGTH]; - RAND_bytes(rti_nonce, NONCE_LENGTH); - memcpy(&sender[1], rti_nonce, NONCE_LENGTH); - memcpy(&sender[1 + NONCE_LENGTH], hmac_tag, hmac_length); - if (write_to_socket(*socket, 1 + NONCE_LENGTH + hmac_length, sender)) { - lf_print_error("Failed to send nonce to federate."); - } - - // Wait for MSG_TYPE_FED_RESPONSE - unsigned char received[1 + hmac_length]; - read_from_socket_fail_on_error(socket, 1 + hmac_length, received, NULL, "Failed to read federate response."); - if (received[0] != MSG_TYPE_FED_RESPONSE) { - lf_print_error_and_exit("Received unexpected response %u from the federate (see net_common.h).", received[0]); - return false; - } - // HMAC tag is created with MSG_TYPE_FED_RESPONSE and RTI's nonce. - unsigned char mac_buf2[1 + NONCE_LENGTH]; - mac_buf2[0] = MSG_TYPE_FED_RESPONSE; - memcpy(&mac_buf2[1], rti_nonce, NONCE_LENGTH); - unsigned char rti_tag[hmac_length]; - ret = HMAC(EVP_sha256(), rti_remote->federation_id, federation_id_length, mac_buf2, 1 + NONCE_LENGTH, rti_tag, - &hmac_length); - if (ret == NULL) { - lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_FED_RESPONSE."); - } - // Compare received tag and created tag. - if (memcmp(&received[1], rti_tag, hmac_length) != 0) { - // Federation IDs do not match. 
Send back a HMAC_DOES_NOT_MATCH message. - lf_print_warning("HMAC authentication failed. Rejecting the federate."); - send_reject(socket, HMAC_DOES_NOT_MATCH); - return false; - } else { - LF_PRINT_LOG("Federate's HMAC verified."); - return true; + /** + * Authenticate incoming federate by performing HMAC-based authentication. + * + * @param socket Socket for the incoming federate tryting to authenticate. + * @return True if authentication is successful and false otherwise. + */ + static bool authenticate_federate(int* socket) { + // Wait for MSG_TYPE_FED_NONCE from federate. + size_t fed_id_length = sizeof(uint16_t); + unsigned char buffer[1 + fed_id_length + NONCE_LENGTH]; + read_from_socket_fail_on_error(socket, 1 + fed_id_length + NONCE_LENGTH, buffer, NULL, + "Failed to read MSG_TYPE_FED_NONCE"); + if (buffer[0] != MSG_TYPE_FED_NONCE) { + lf_print_error_and_exit("Received unexpected response %u from the FED (see net_common.h).", buffer[0]); + } + unsigned int hmac_length = SHA256_HMAC_LENGTH; + size_t federation_id_length = strnlen(rti_remote->federation_id, 255); + // HMAC tag is created with MSG_TYPE, federate ID, received federate nonce. + unsigned char mac_buf[1 + fed_id_length + NONCE_LENGTH]; + mac_buf[0] = MSG_TYPE_RTI_RESPONSE; + memcpy(&mac_buf[1], &buffer[1], fed_id_length); + memcpy(&mac_buf[1 + fed_id_length], &buffer[1 + fed_id_length], NONCE_LENGTH); + unsigned char hmac_tag[hmac_length]; + unsigned char* ret = HMAC(EVP_sha256(), rti_remote->federation_id, federation_id_length, mac_buf, + 1 + fed_id_length + NONCE_LENGTH, hmac_tag, &hmac_length); + if (ret == NULL) { + lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_RTI_RESPONSE."); + } + // Make buffer for message type, RTI's nonce, and HMAC tag. 
+ unsigned char sender[1 + NONCE_LENGTH + hmac_length]; + sender[0] = MSG_TYPE_RTI_RESPONSE; + unsigned char rti_nonce[NONCE_LENGTH]; + RAND_bytes(rti_nonce, NONCE_LENGTH); + memcpy(&sender[1], rti_nonce, NONCE_LENGTH); + memcpy(&sender[1 + NONCE_LENGTH], hmac_tag, hmac_length); + if (write_to_socket(*socket, 1 + NONCE_LENGTH + hmac_length, sender)) { + lf_print_error("Failed to send nonce to federate."); + } + + // Wait for MSG_TYPE_FED_RESPONSE + unsigned char received[1 + hmac_length]; + read_from_socket_fail_on_error(socket, 1 + hmac_length, received, NULL, "Failed to read federate response."); + if (received[0] != MSG_TYPE_FED_RESPONSE) { + lf_print_error_and_exit("Received unexpected response %u from the federate (see net_common.h).", received[0]); + return false; + } + // HMAC tag is created with MSG_TYPE_FED_RESPONSE and RTI's nonce. + unsigned char mac_buf2[1 + NONCE_LENGTH]; + mac_buf2[0] = MSG_TYPE_FED_RESPONSE; + memcpy(&mac_buf2[1], rti_nonce, NONCE_LENGTH); + unsigned char rti_tag[hmac_length]; + ret = HMAC(EVP_sha256(), rti_remote->federation_id, federation_id_length, mac_buf2, 1 + NONCE_LENGTH, rti_tag, + &hmac_length); + if (ret == NULL) { + lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_FED_RESPONSE."); + } + // Compare received tag and created tag. + if (memcmp(&received[1], rti_tag, hmac_length) != 0) { + // Federation IDs do not match. Send back a HMAC_DOES_NOT_MATCH message. + lf_print_warning("HMAC authentication failed. Rejecting the federate."); + send_reject(socket, HMAC_DOES_NOT_MATCH); + return false; + } else { + LF_PRINT_LOG("Federate's HMAC verified."); + return true; + } } - } #endif - // FIXME: The socket descriptor here (parameter) is not used. Should be removed? - void lf_connect_to_persistent_federates(int socket_descriptor) { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes - rti_remote.number_of_transient_federates; i++) { - // Wait for an incoming connection request. 
- struct sockaddr client_fd; - uint32_t client_length = sizeof(client_fd); - // The following blocks until a federate connects. - int socket_id = -1; - while (1) { - socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); - if (socket_id >= 0) { - // Got a socket - break; - } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { - lf_print_error_system_failure("RTI failed to accept the socket."); - } else { - // Try again - lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); - continue; + // FIXME: The socket descriptor here (parameter) is not used. Should be removed? + void lf_connect_to_persistent_federates(int socket_descriptor) { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes - rti_remote.number_of_transient_federates; i++) { + // Wait for an incoming connection request. + struct sockaddr client_fd; + uint32_t client_length = sizeof(client_fd); + // The following blocks until a federate connects. + int socket_id = -1; + while (1) { + socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); + if (socket_id >= 0) { + // Got a socket + break; + } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { + lf_print_error_system_failure("RTI failed to accept the socket."); + } else { + // Try again + lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); + continue; + } } - } // Wait for the first message from the federate when RTI -a option is on. #ifdef __RTI_AUTH__ - if (rti_remote->authentication_enabled) { - if (!authenticate_federate(&socket_id)) { - lf_print_warning("RTI failed to authenticate the incoming federate."); - // Close the socket. - shutdown(socket_id, SHUT_RDWR); - close(socket_id); - socket_id = -1; - // Ignore the federate that failed authentication. 
- i--; - continue; + if (rti_remote->authentication_enabled) { + if (!authenticate_federate(&socket_id)) { + lf_print_warning("RTI failed to authenticate the incoming federate."); + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); + socket_id = -1; + // Ignore the federate that failed authentication. + i--; + continue; + } } - } #endif - // The first message from the federate should contain its ID and the federation ID. - int32_t fed_id = receive_and_check_fed_id_message(&socket_id); - if (fed_id >= 0 && socket_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && - receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { + // The first message from the federate should contain its ID and the federation ID. + int32_t fed_id = receive_and_check_fed_id_message(&socket_id); + if (fed_id >= 0 && socket_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && + receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { - // Create a thread to communicate with the federate. - // This has to be done after clock synchronization is finished - // or that thread may end up attempting to handle incoming clock - // synchronization messages. - federate_info_t* fed = GET_FED_INFO(fed_id); - lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. + federate_info_t* fed = GET_FED_INFO(fed_id); + lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); - // If the federate is transient, then do not count it. 
- if (fed->is_transient) { - rti_remote->number_of_connected_transient_federates++; - assert(rti_remote->number_of_connected_transient_federates <= rti_remote->number_of_transient_federates); + // If the federate is transient, then do not count it. + if (fed->is_transient) { + rti_remote->number_of_connected_transient_federates++; + assert(rti_remote->number_of_connected_transient_federates <= rti_remote->number_of_transient_federates); + i--; + lf_print("RTI: Transient federate %d joined.", fed->enclave.id); + } + } else { + // Received message was rejected. Try again. i--; - lf_print("RTI: Transient federate %d joined.", fed->enclave.id); } - } else { - // Received message was rejected. Try again. - i--; } - } - // All federates have connected. - LF_PRINT_DEBUG("All persistent federates have connected to RTI."); - - if (rti_remote->clock_sync_global_status >= clock_sync_on) { - // Create the thread that performs periodic PTP clock synchronization sessions - // over the UDP channel, but only if the UDP channel is open and at least one - // federate is performing runtime clock synchronization. - bool clock_sync_enabled = false; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed_info = GET_FED_INFO(i); - if (fed_info->clock_synchronization_enabled) { - clock_sync_enabled = true; - break; + // All federates have connected. + LF_PRINT_DEBUG("All persistent federates have connected to RTI."); + + if (rti_remote->clock_sync_global_status >= clock_sync_on) { + // Create the thread that performs periodic PTP clock synchronization sessions + // over the UDP channel, but only if the UDP channel is open and at least one + // federate is performing runtime clock synchronization. 
+ bool clock_sync_enabled = false; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed_info = GET_FED_INFO(i); + if (fed_info->clock_synchronization_enabled) { + clock_sync_enabled = true; + break; + } + } + if (rti_remote->final_port_UDP != UINT16_MAX && clock_sync_enabled) { + lf_thread_create(&rti_remote->clock_thread, clock_synchronization_thread, NULL); } } - if (rti_remote->final_port_UDP != UINT16_MAX && clock_sync_enabled) { - lf_thread_create(&rti_remote->clock_thread, clock_synchronization_thread, NULL); - } - } - } - - /** - * @brief A request for immediate stop to the federate - * - * @param fed: the deferate to stop - */ - void send_stop(federate_info_t * fed) { - // Reply with a stop granted to all federates - unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; - outgoing_buffer[0] = MSG_TYPE_STOP; - lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); - - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP, fed->enclave.id, NULL); } - write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, - "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); - LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); - } + /** + * @brief A request for immediate stop to the federate + * + * @param fed: the deferate to stop + */ + void send_stop(federate_info_t * fed) { + // Reply with a stop granted to all federates + unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; + outgoing_buffer[0] = MSG_TYPE_STOP; + lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); - /** - * @brief A request for immediate stop to the federate - * - * @param fed: the deferate to stop - */ - void send_stop(federate_info_t * fed) { - // Reply with a stop granted to all federates - unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; - outgoing_buffer[0] = MSG_TYPE_STOP; - lf_print("RTI 
sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP, fed->enclave.id, NULL); + } + write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, + "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP, fed->enclave.id, NULL); + LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); } - write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, - "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); - LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); - } + /** + * @brief A request for immediate stop to the federate + * + * @param fed: the deferate to stop + */ + void send_stop(federate_info_t * fed) { + // Reply with a stop granted to all federates + unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; + outgoing_buffer[0] = MSG_TYPE_STOP; + lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); - void* lf_connect_to_transient_federates_thread(void* nothing) { - // This loop will continue to accept connections of transient federates, as - // soon as there is room, or enable hot swap - - while (!rti_remote->all_persistent_federates_exited) { - // Continue waiting for an incoming connection requests from transients - // to join, or for hot swap. - // Wait for an incoming connection request. - struct sockaddr client_fd; - uint32_t client_length = sizeof(client_fd); - // The following blocks until a federate connects. 
- int socket_id = -1; - while (1) { - if (!rti_remote->all_persistent_federates_exited) { - return NULL; - } - socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); - if (socket_id >= 0) { - // Got a socket - break; - } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { - lf_print_error_system_failure("RTI failed to accept the socket."); - } else { - // Try again - lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); - continue; - } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP, fed->enclave.id, NULL); } + write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, + "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); + + LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); + } + + void* lf_connect_to_transient_federates_thread(void* nothing) { + // This loop will continue to accept connections of transient federates, as + // soon as there is room, or enable hot swap + + while (!rti_remote->all_persistent_federates_exited) { + // Continue waiting for an incoming connection requests from transients + // to join, or for hot swap. + // Wait for an incoming connection request. + struct sockaddr client_fd; + uint32_t client_length = sizeof(client_fd); + // The following blocks until a federate connects. + int socket_id = -1; + while (1) { + if (!rti_remote->all_persistent_federates_exited) { + return NULL; + } + socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); + if (socket_id >= 0) { + // Got a socket + break; + } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { + lf_print_error_system_failure("RTI failed to accept the socket."); + } else { + // Try again + lf_print_warning("RTI failed to accept the socket. %s. 
Trying again.", strerror(errno)); + continue; + } + } // Wait for the first message from the federate when RTI -a option is on. #ifdef __RTI_AUTH__ - if (rti_remote->authentication_enabled) { - if (!authenticate_federate(&socket_id)) { - lf_print_warning("RTI failed to authenticate the incoming federate."); - // Close the socket. - shutdown(socket_id, SHUT_RDWR); - close(socket_id); - socket_id = -1; - // Ignore the federate that failed authentication. - i--; - continue; + if (rti_remote->authentication_enabled) { + if (!authenticate_federate(&socket_id)) { + lf_print_warning("RTI failed to authenticate the incoming federate."); + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); + socket_id = -1; + // Ignore the federate that failed authentication. + i--; + continue; + } } - } #endif - // The first message from the federate should contain its ID and the federation ID. - // The function also detects if a hot swap request is initiated. - int32_t fed_id = receive_and_check_fed_id_message(&socket_id, (struct sockaddr_in*)&client_fd); + // The first message from the federate should contain its ID and the federation ID. + // The function also detects if a hot swap request is initiated. 
+ int32_t fed_id = receive_and_check_fed_id_message(&socket_id, (struct sockaddr_in*)&client_fd); - if (fed_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && - receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { - LF_MUTEX_LOCK(&rti_mutex); - if (hot_swap_in_progress) { - lf_print("RTI: Hot swap confirmed for federate %d.", fed_id); + if (fed_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && + receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { + LF_MUTEX_LOCK(&rti_mutex); + if (hot_swap_in_progress) { + lf_print("RTI: Hot swap confirmed for federate %d.", fed_id); - // Then send STOP - federate_info_t* fed_old = GET_FED_INFO(fed_id); - hot_swap_federate->enclave.completed = fed_old->enclave.completed; + // Then send STOP + federate_info_t* fed_old = GET_FED_INFO(fed_id); + hot_swap_federate->enclave.completed = fed_old->enclave.completed; - LF_PRINT_LOG("RTI: Send MSG_TYPE_STOP to old federate %d.", fed_id); - send_stop(fed_old); - LF_MUTEX_UNLOCK(&rti_mutex); + LF_PRINT_LOG("RTI: Send MSG_TYPE_STOP to old federate %d.", fed_id); + send_stop(fed_old); + LF_MUTEX_UNLOCK(&rti_mutex); - // Wait for the old federate to send MSG_TYPE_RESIGN - LF_PRINT_LOG("RTI: Waiting for old federate %d to send resign.", fed_id); - // FIXME: This is a busy wait! Need instead a lf_cond_wait on a condition variable. - while (!hot_swap_old_resigned) { - } + // Wait for the old federate to send MSG_TYPE_RESIGN + LF_PRINT_LOG("RTI: Waiting for old federate %d to send resign.", fed_id); + // FIXME: This is a busy wait! Need instead a lf_cond_wait on a condition variable. + while (!hot_swap_old_resigned) { + } - // The latest LTC is the tag at which the old federate resigned. This is useful - // for computing the effective_start_time of the new joining federate. - hot_swap_federate->enclave.completed = fed_old->enclave.completed; + // The latest LTC is the tag at which the old federate resigned. 
This is useful + // for computing the effective_start_time of the new joining federate. + hot_swap_federate->enclave.completed = fed_old->enclave.completed; - // Create a thread to communicate with the federate. - // This has to be done after clock synchronization is finished - // or that thread may end up attempting to handle incoming clock - // synchronization messages. - lf_thread_create(&(hot_swap_federate->thread_id), federate_info_thread_TCP, hot_swap_federate); + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. + lf_thread_create(&(hot_swap_federate->thread_id), federate_info_thread_TCP, hot_swap_federate); - // Redirect the federate in rti_remote - rti_remote->base.scheduling_nodes[fed_id] = (scheduling_node_t*)hot_swap_federate; + // Redirect the federate in rti_remote + rti_remote->base.scheduling_nodes[fed_id] = (scheduling_node_t*)hot_swap_federate; - // Free the old federate memory and reset the Hot wap indicators - // FIXME: Is this enough to free the memory allocated to the federate? - free(fed_old); - lf_mutex_lock(&rti_mutex); - hot_swap_in_progress = false; - lf_mutex_unlock(&rti_mutex); + // Free the old federate memory and reset the Hot wap indicators + // FIXME: Is this enough to free the memory allocated to the federate? + free(fed_old); + lf_mutex_lock(&rti_mutex); + hot_swap_in_progress = false; + lf_mutex_unlock(&rti_mutex); - lf_print("RTI: Hot swap succeeded for federate %d.", fed_id); + lf_print("RTI: Hot swap succeeded for federate %d.", fed_id); + } else { + lf_mutex_unlock(&rti_mutex); + + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. 
+ federate_info_t* fed = GET_FED_INFO(fed_id); + lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + lf_print("RTI: Transient federate %d joined.", fed_id); + } + rti_remote->number_of_connected_transient_federates++; } else { - lf_mutex_unlock(&rti_mutex); - - // Create a thread to communicate with the federate. - // This has to be done after clock synchronization is finished - // or that thread may end up attempting to handle incoming clock - // synchronization messages. - federate_info_t* fed = GET_FED_INFO(fed_id); - lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); - lf_print("RTI: Transient federate %d joined.", fed_id); - } - rti_remote->number_of_connected_transient_federates++; - } else { - // If a hot swap was initialed, but the connection information or/and clock - // synchronization fail, then reset hot_swap_in_profress, and free the memory - // allocated for hot_swap_federate - if (hot_swap_in_progress) { - lf_print("RTI: Hot swap canceled for federate %d.", fed_id); - lf_mutex_lock(&rti_mutex); - hot_swap_in_progress = false; - lf_mutex_unlock(&rti_mutex); - - // FIXME: Is this enough to free the memory of a federate_info_t data structure? - free(hot_swap_federate); + // If a hot swap was initialed, but the connection information or/and clock + // synchronization fail, then reset hot_swap_in_profress, and free the memory + // allocated for hot_swap_federate + if (hot_swap_in_progress) { + lf_print("RTI: Hot swap canceled for federate %d.", fed_id); + lf_mutex_lock(&rti_mutex); + hot_swap_in_progress = false; + lf_mutex_unlock(&rti_mutex); + + // FIXME: Is this enough to free the memory of a federate_info_t data structure? + free(hot_swap_federate); + } } } - } - return NULL; - } - - /** - * @brief Thread that manages the delayed grants using a priprity queue. - * - * This thread is responsible for managing the priority queue of delayed grants to be issued. 
- * It waits until the current time matches the highest priority tag time in the queue. - * If reached, it notifies the grant immediately. If, however, the current time has not yet - * reached the highest priority tag and the queue has been updated (either by inserting or - * canceling an entry), the thread stops waiting and restarts the process again. - */ - static void* lf_delayed_grants_thread(void* nothing) { - initialize_lf_thread_id(); - // Hold the mutex when not waiting. - LF_MUTEX_LOCK(&rti_mutex); - while (!rti_remote->all_federates_exited) { - if (pqueue_delayed_grants_size(rti_remote->delayed_grants) > 0) { - // Do not pop, but rather peek. - pqueue_delayed_grant_element_t* next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); - instant_t next_time = next->base.tag.time; - // Wait for expiration, or a signal to stop or terminate. - int ret = lf_clock_cond_timedwait(&updated_delayed_grants, next_time); - if (ret == LF_TIMEOUT) { - // Time reached to send the grant. - // However, the grant may have been canceled while we were waiting. - pqueue_delayed_grant_element_t* new_next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); - if (next == new_next) { - pqueue_delayed_grants_pop(rti_remote->delayed_grants); - federate_info_t* fed = GET_FED_INFO(next->fed_id); - if (next->is_provisional) { - notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); - } else { - notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + return NULL; + } + + /** + * @brief Thread that manages the delayed grants using a priprity queue. + * + * This thread is responsible for managing the priority queue of delayed grants to be issued. + * It waits until the current time matches the highest priority tag time in the queue. + * If reached, it notifies the grant immediately. 
If, however, the current time has not yet + * reached the highest priority tag and the queue has been updated (either by inserting or + * canceling an entry), the thread stops waiting and restarts the process again. + */ + static void* lf_delayed_grants_thread(void* nothing) { + initialize_lf_thread_id(); + // Hold the mutex when not waiting. + LF_MUTEX_LOCK(&rti_mutex); + while (!rti_remote->all_federates_exited) { + if (pqueue_delayed_grants_size(rti_remote->delayed_grants) > 0) { + // Do not pop, but rather peek. + pqueue_delayed_grant_element_t* next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); + instant_t next_time = next->base.tag.time; + // Wait for expiration, or a signal to stop or terminate. + int ret = lf_clock_cond_timedwait(&updated_delayed_grants, next_time); + if (ret == LF_TIMEOUT) { + // Time reached to send the grant. + // However, the grant may have been canceled while we were waiting. + pqueue_delayed_grant_element_t* new_next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); + if (next == new_next) { + pqueue_delayed_grants_pop(rti_remote->delayed_grants); + federate_info_t* fed = GET_FED_INFO(next->fed_id); + if (next->is_provisional) { + notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } else { + notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } + free(next); } - free(next); + } else if (ret != 0) { + // An error occurred. + lf_print_error_and_exit("lf_delayed_grants_thread: lf_clock_cond_timedwait failed with code %d.", ret); } - } else if (ret != 0) { - // An error occurred. - lf_print_error_and_exit("lf_delayed_grants_thread: lf_clock_cond_timedwait failed with code %d.", ret); + } else if (pqueue_delayed_grants_size(rti_remote->delayed_grants) == 0) { + // Wait for something to appear on the queue. + lf_cond_wait(&updated_delayed_grants); } - } else if (pqueue_delayed_grants_size(rti_remote->delayed_grants) == 0) { - // Wait for something to appear on the queue. 
- lf_cond_wait(&updated_delayed_grants); } - } - // Free any delayed grants that are still on the queue. - pqueue_delayed_grants_free(rti_remote->delayed_grants); - LF_MUTEX_UNLOCK(&rti_mutex); - return NULL; - } - /** - * This thread is responsible for managing the priority queue of delayed grants to be issued. - * It waits until the current time matches the highest priority tag time in the queue. - * If reached, it notifies the grant immediately. If, however, the current time has not yet - * reached the highest priority tag and the queue has been updated (either by inserting or - * canceling an entry), the thread stops waiting and restarts the process again. - */ - void* lf_delayed_grants_thread(void* nothing) { - initialize_lf_thread_id(); - // Hold the mutex only when accessing rti_remote->delayed_grants pqueue - while (!rti_remote->all_federates_exited) { - if (pqueue_delayed_grants_size(rti_remote->delayed_grants) > 0) { - // Do not pop, but rather peek. - LF_MUTEX_LOCK(&rti_mutex); - pqueue_delayed_grant_element_t* next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); - instant_t next_time = next->base.tag.time; - LF_MUTEX_UNLOCK(&rti_mutex); - // Wait for expiration, or a signal to stop or terminate. - if (lf_clock_cond_timedwait(&updated_delayed_grants, next_time)) { - // Time reached to send the grant. - // However, the grant may have been canceled while we were waiting. + // Free any delayed grants that are still on the queue. + pqueue_delayed_grants_free(rti_remote->delayed_grants); + LF_MUTEX_UNLOCK(&rti_mutex); + return NULL; + } + /** + * This thread is responsible for managing the priority queue of delayed grants to be issued. + * It waits until the current time matches the highest priority tag time in the queue. + * If reached, it notifies the grant immediately. 
If, however, the current time has not yet + * reached the highest priority tag and the queue has been updated (either by inserting or + * canceling an entry), the thread stops waiting and restarts the process again. + */ + void* lf_delayed_grants_thread(void* nothing) { + initialize_lf_thread_id(); + // Hold the mutex only when accessing rti_remote->delayed_grants pqueue + while (!rti_remote->all_federates_exited) { + if (pqueue_delayed_grants_size(rti_remote->delayed_grants) > 0) { + // Do not pop, but rather peek. LF_MUTEX_LOCK(&rti_mutex); - pqueue_delayed_grant_element_t* new_next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); - if (next == new_next) { - pqueue_delayed_grants_pop(rti_remote->delayed_grants); - federate_info_t* fed = GET_FED_INFO(next->fed_id); - if (next->is_provisional) { - notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); - // FIXME: Send port absent notification to all federates downstream of absent federates. - } else { - notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + pqueue_delayed_grant_element_t* next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); + instant_t next_time = next->base.tag.time; + LF_MUTEX_UNLOCK(&rti_mutex); + // Wait for expiration, or a signal to stop or terminate. + if (lf_clock_cond_timedwait(&updated_delayed_grants, next_time)) { + // Time reached to send the grant. + // However, the grant may have been canceled while we were waiting. + LF_MUTEX_LOCK(&rti_mutex); + pqueue_delayed_grant_element_t* new_next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); + if (next == new_next) { + pqueue_delayed_grants_pop(rti_remote->delayed_grants); + federate_info_t* fed = GET_FED_INFO(next->fed_id); + if (next->is_provisional) { + notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + // FIXME: Send port absent notification to all federates downstream of absent federates. 
+ } else { + notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } + free(next); } - free(next); } + LF_MUTEX_UNLOCK(&rti_mutex); + } else if (pqueue_delayed_grants_size(rti_remote->delayed_grants) == 0) { + // Wait for something to appear on the queue. + lf_cond_wait(&updated_delayed_grants); } - LF_MUTEX_UNLOCK(&rti_mutex); - } else if (pqueue_delayed_grants_size(rti_remote->delayed_grants) == 0) { - // Wait for something to appear on the queue. - lf_cond_wait(&updated_delayed_grants); } + // Free any delayed grants that are still on the queue. + while (pqueue_delayed_grants_size(rti_remote->delayed_grants) > 0) { + pqueue_delayed_grant_element_t* next = pqueue_delayed_grants_pop(rti_remote->delayed_grants); + free(next); + } + return NULL; } - // Free any delayed grants that are still on the queue. - while (pqueue_delayed_grants_size(rti_remote->delayed_grants) > 0) { - pqueue_delayed_grant_element_t* next = pqueue_delayed_grants_pop(rti_remote->delayed_grants); - free(next); - } - return NULL; - } - void* respond_to_erroneous_connections(void* nothing) { - initialize_lf_thread_id(); - while (true) { - // Wait for an incoming connection request. - // The following will block until either a federate attempts to connect - // or close(rti->socket_descriptor_TCP) is called. - int socket_id = accept_socket(rti_remote->socket_descriptor_TCP, -1); - if (socket_id < 0) { - return NULL; - } - if (rti_remote->all_federates_exited) { - return NULL; - } + void* respond_to_erroneous_connections(void* nothing) { + initialize_lf_thread_id(); + while (true) { + // Wait for an incoming connection request. + // The following will block until either a federate attempts to connect + // or close(rti->socket_descriptor_TCP) is called. 
+ int socket_id = accept_socket(rti_remote->socket_descriptor_TCP, -1); + if (socket_id < 0) { + return NULL; + } + if (rti_remote->all_federates_exited) { + return NULL; + } - lf_print_error("RTI received an unexpected connection request. Federation is running."); - unsigned char response[2]; - response[0] = MSG_TYPE_REJECT; - response[1] = FEDERATION_ID_DOES_NOT_MATCH; - // Ignore errors on this response. - if (write_to_socket(socket_id, 2, response)) { - lf_print_warning("RTI failed to write FEDERATION_ID_DOES_NOT_MATCH to erroneous incoming connection."); + lf_print_error("RTI received an unexpected connection request. Federation is running."); + unsigned char response[2]; + response[0] = MSG_TYPE_REJECT; + response[1] = FEDERATION_ID_DOES_NOT_MATCH; + // Ignore errors on this response. + if (write_to_socket(socket_id, 2, response)) { + lf_print_warning("RTI failed to write FEDERATION_ID_DOES_NOT_MATCH to erroneous incoming connection."); + } + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); } - // Close the socket. - shutdown(socket_id, SHUT_RDWR); - close(socket_id); + return NULL; + } + + void initialize_federate(federate_info_t * fed, uint16_t id) { + initialize_scheduling_node(&(fed->enclave), id); + fed->requested_stop = false; + fed->socket = -1; // No socket. + fed->clock_synchronization_enabled = true; + fed->in_transit_message_tags = pqueue_tag_init(10); + strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); + fed->server_ip_addr.s_addr = 0; + fed->server_port = -1; + fed->has_upstream_transient_federates = false; + fed->is_transient = true; + fed->effective_start_tag = NEVER_TAG; + } + + void reset_transient_federate(federate_info_t * fed) { + fed->enclave.next_event = NEVER_TAG; + fed->enclave.state = NOT_CONNECTED; + // Reset of the federate-related attributes + fed->socket = -1; // No socket. 
+ fed->clock_synchronization_enabled = true; + fed->in_transit_message_tags = pqueue_tag_init(10); + strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); + fed->server_ip_addr.s_addr = 0; + fed->server_port = -1; + fed->requested_stop = false; + fed->is_transient = true; + fed->effective_start_tag = NEVER_TAG; + } + + int32_t start_rti_server(uint16_t port) { + _lf_initialize_clock(); + // Create the TCP socket server + if (create_server(port, &rti_remote->socket_descriptor_TCP, &rti_remote->final_port_TCP, TCP, true)) { + lf_print_error_system_failure("RTI failed to create TCP server: %s.", strerror(errno)); + }; + lf_print("RTI: Listening for federates."); + // Create the UDP socket server + // Try to get the rti_remote->final_port_TCP + 1 port + if (rti_remote->clock_sync_global_status >= clock_sync_on) { + if (create_server(rti_remote->final_port_TCP + 1, &rti_remote->socket_descriptor_UDP, + &rti_remote->final_port_UDP, UDP, true)) { + lf_print_error_system_failure("RTI failed to create UDP server: %s.", strerror(errno)); + } + } + return rti_remote->socket_descriptor_TCP; } - return NULL; - } - - void initialize_federate(federate_info_t * fed, uint16_t id) { - initialize_scheduling_node(&(fed->enclave), id); - fed->requested_stop = false; - fed->socket = -1; // No socket. - fed->clock_synchronization_enabled = true; - fed->in_transit_message_tags = pqueue_tag_init(10); - strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); - fed->server_ip_addr.s_addr = 0; - fed->server_port = -1; - fed->has_upstream_transient_federates = false; - fed->is_transient = true; - fed->effective_start_tag = NEVER_TAG; - } - - void reset_transient_federate(federate_info_t * fed) { - fed->enclave.next_event = NEVER_TAG; - fed->enclave.state = NOT_CONNECTED; - // Reset of the federate-related attributes - fed->socket = -1; // No socket. 
- fed->clock_synchronization_enabled = true; - fed->in_transit_message_tags = pqueue_tag_init(10); - strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); - fed->server_ip_addr.s_addr = 0; - fed->server_port = -1; - fed->requested_stop = false; - fed->is_transient = true; - fed->effective_start_tag = NEVER_TAG; - } - int32_t start_rti_server(uint16_t port) { - _lf_initialize_clock(); - // Create the TCP socket server - if (create_server(port, &rti_remote->socket_descriptor_TCP, &rti_remote->final_port_TCP, TCP, true)) { - lf_print_error_system_failure("RTI failed to create TCP server: %s.", strerror(errno)); - }; - lf_print("RTI: Listening for federates."); - // Create the UDP socket server - // Try to get the rti_remote->final_port_TCP + 1 port - if (rti_remote->clock_sync_global_status >= clock_sync_on) { - if (create_server(rti_remote->final_port_TCP + 1, &rti_remote->socket_descriptor_UDP, &rti_remote->final_port_UDP, - UDP, true)) { - lf_print_error_system_failure("RTI failed to create UDP server: %s.", strerror(errno)); - } - } - return rti_remote->socket_descriptor_TCP; - } + /** + * Iterate over the federates and sets 'has_upstream_transient_federates'. + * Once done, check that no transient federate has an upstream transient federate. + * and compute the number of persistent federates that do have upstream transients, + * which is the maximun number of delayed grants that can be pending at the same time. + * This is useful for initialyzing the queue of delayed grants. - /** - * Iterate over the federates and sets 'has_upstream_transient_federates'. - * Once done, check that no transient federate has an upstream transient federate. - * and compute the number of persistent federates that do have upstream transients, - * which is the maximun number of delayed grants that can be pending at the same time. - * This is useful for initialyzing the queue of delayed grants. 
- - * @return -1, if there is more than one level of transiency, else, the number of - * persistents that have an upstream transient - */ - static int set_has_upstream_transient_federates_parameter_and_check() { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - for (int j = 0; j < fed->enclave.num_upstream; j++) { - federate_info_t* upstream_fed = GET_FED_INFO(fed->enclave.upstream[j]); - if (upstream_fed->is_transient) { - fed->has_upstream_transient_federates = true; - break; + * @return -1, if there is more than one level of transiency, else, the number of + * persistents that have an upstream transient + */ + static int set_has_upstream_transient_federates_parameter_and_check() { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + for (int j = 0; j < fed->enclave.num_upstream; j++) { + federate_info_t* upstream_fed = GET_FED_INFO(fed->enclave.upstream[j]); + if (upstream_fed->is_transient) { + fed->has_upstream_transient_federates = true; + break; + } } } - } - // Now check that no transient has an upstream transient - // FIXME: Do we really need this? Or should it be the job of the validator? - int max_number_of_delayed_grants = 0; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (fed->is_transient && fed->has_upstream_transient_federates) { - return -1; - } - if (!fed->is_transient && fed->has_upstream_transient_federates) { - max_number_of_delayed_grants++; - } - } - // Now check that no transient has an upstream transient - // FIXME: Do we really need this? Or should it be the job of the validator? 
- uint16_t max_number_of_delayed_grants = 0; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (fed->is_transient && fed->has_upstream_transient_federates) { - return -1; + // Now check that no transient has an upstream transient + // FIXME: Do we really need this? Or should it be the job of the validator? + int max_number_of_delayed_grants = 0; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->is_transient && fed->has_upstream_transient_federates) { + return -1; + } + if (!fed->is_transient && fed->has_upstream_transient_federates) { + max_number_of_delayed_grants++; + } } - if (!fed->is_transient && fed->has_upstream_transient_federates) { - max_number_of_delayed_grants++; + // Now check that no transient has an upstream transient + // FIXME: Do we really need this? Or should it be the job of the validator? + uint16_t max_number_of_delayed_grants = 0; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->is_transient && fed->has_upstream_transient_federates) { + return -1; + } + if (!fed->is_transient && fed->has_upstream_transient_federates) { + max_number_of_delayed_grants++; + } } - } + return max_number_of_delayed_grants; + } return max_number_of_delayed_grants; } - return max_number_of_delayed_grants; -} -void wait_for_federates(int socket_descriptor) { - // Wait for connections from persistent federates and create a thread for each. - lf_connect_to_persistent_federates(socket_descriptor); + void wait_for_federates(int socket_descriptor) { + // Wait for connections from persistent federates and create a thread for each. 
+ lf_connect_to_persistent_federates(socket_descriptor); - // Set has_upstream_transient_federates parameter in all federates and check - // that there is no more than one level of transiency - if (rti_remote->number_of_transient_federates > 0) { - int max_number_of_pending_grants = set_has_upstream_transient_federates_parameter_and_check(); - if (max_number_of_pending_grants == -1) { - lf_print_error_and_exit("RTI: Transient federates cannot have transient upstreams!"); + // Set the start_time in the RTI trace + if (rti_remote->base.tracing_enabled) { + lf_tracing_set_start_time(start_time); } - rti_remote->delayed_grants = pqueue_delayed_grants_init(max_number_of_pending_grants); - } - - // All persistent federates have connected. - lf_print("RTI: All expected persistent federates have connected. Starting execution."); - if (rti_remote->number_of_transient_federates > 0) { - lf_print("RTI: Transient Federates can join and leave the federation at anytime."); - } - // The socket server will only continue to accept connections from transient - // federates. - // In case some other federation's federates are trying to join the wrong - // federation, need to respond. Start a separate thread to do that. - lf_thread_t responder_thread; - lf_thread_t transient_thread; - lf_thread_t delayed_grants_thread; - - // If the federation does not include transient federates, then respond to - // erronous connections. Otherwise, continue to accept transients joining and - // respond to duplicate joing requests. 
- if (rti_remote->number_of_transient_federates == 0) { - lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); - } else if (rti_remote->number_of_transient_federates > 0) { - lf_thread_create(&transient_thread, lf_connect_to_transient_federates_thread, NULL); - lf_thread_create(&delayed_grants_thread, lf_delayed_grants_thread, NULL); - } + // Set has_upstream_transient_federates parameter in all federates and check + // that there is no more than one level of transiency + if (rti_remote->number_of_transient_federates > 0) { + int max_number_of_pending_grants = set_has_upstream_transient_federates_parameter_and_check(); + if (max_number_of_pending_grants == -1) { + lf_print_error_and_exit("RTI: Transient federates cannot have transient upstreams!"); + } + rti_remote->delayed_grants = pqueue_delayed_grants_init(max_number_of_pending_grants); + } - // Wait for persistent federate threads to exit. - void* thread_exit_status; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (!fed->is_transient) { - lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); - lf_thread_join(fed->thread_id, &thread_exit_status); - pqueue_tag_free(fed->in_transit_message_tags); - lf_print("RTI: Persistent federate %d thread exited.", fed->enclave.id); + // All persistent federates have connected. + lf_print("RTI: All expected persistent federates have connected. Starting execution."); + if (rti_remote->number_of_transient_federates > 0) { + lf_print("RTI: Transient Federates can join and leave the federation at anytime."); } - } - rti_remote->all_persistent_federates_exited = true; - rti_remote->phase = shutdown_phase; - lf_print("RTI: All persistent threads exited."); + // The socket server will only continue to accept connections from transient + // federates. + // In case some other federation's federates are trying to join the wrong + // federation, need to respond. 
Start a separate thread to do that. + lf_thread_t responder_thread; + lf_thread_t transient_thread; + lf_thread_t delayed_grants_thread; - // Wait for transient federate threads to exit, if any. - if (rti_remote->number_of_transient_federates > 0) { + // If the federation does not include transient federates, then respond to + // erronous connections. Otherwise, continue to accept transients joining and + // respond to duplicate joing requests. + if (rti_remote->number_of_transient_federates == 0) { + lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); + } else if (rti_remote->number_of_transient_federates > 0) { + lf_thread_create(&transient_thread, lf_connect_to_transient_federates_thread, NULL); + lf_thread_create(&delayed_grants_thread, lf_delayed_grants_thread, NULL); + } + + // Wait for persistent federate threads to exit. + void* thread_exit_status; for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { federate_info_t* fed = GET_FED_INFO(i); - if (fed->is_transient) { + if (!fed->is_transient) { lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); lf_thread_join(fed->thread_id, &thread_exit_status); pqueue_tag_free(fed->in_transit_message_tags); - lf_print("RTI: Transient federate %d thread exited.", fed->enclave.id); + lf_print("RTI: Persistent federate %d thread exited.", fed->enclave.id); } } - } - rti_remote->all_federates_exited = true; + rti_remote->all_persistent_federates_exited = true; + rti_remote->phase = shutdown_phase; + lf_print("RTI: All persistent threads exited."); - // Shutdown and close the socket that is listening for incoming connections - // so that the accept() call in respond_to_erroneous_connections returns. - // That thread should then check rti->all_federates_exited and it should exit. 
- if (shutdown(socket_descriptor, SHUT_RDWR)) { - LF_PRINT_LOG("On shut down TCP socket, received reply: %s", strerror(errno)); - } - // NOTE: In all common TCP/IP stacks, there is a time period, - // typically between 30 and 120 seconds, called the TIME_WAIT period, - // before the port is released after this close. This is because - // the OS is preventing another program from accidentally receiving - // duplicated packets intended for this program. - close(socket_descriptor); - - if (rti_remote->socket_descriptor_UDP > 0) { - if (shutdown(rti_remote->socket_descriptor_UDP, SHUT_RDWR)) { - LF_PRINT_LOG("On shut down UDP socket, received reply: %s", strerror(errno)); - } - close(rti_remote->socket_descriptor_UDP); - } -} + // Wait for transient federate threads to exit, if any. + if (rti_remote->number_of_transient_federates > 0) { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->is_transient) { + lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); + lf_thread_join(fed->thread_id, &thread_exit_status); + pqueue_tag_free(fed->in_transit_message_tags); + lf_print("RTI: Transient federate %d thread exited.", fed->enclave.id); + } + } + } -void initialize_RTI(rti_remote_t* rti) { - rti_remote = rti; - - // Initialize thread synchronization primitives - LF_MUTEX_INIT(&rti_mutex); - LF_COND_INIT(&received_start_times, &rti_mutex); - LF_COND_INIT(&sent_start_time, &rti_mutex); - LF_COND_INIT(&updated_delayed_grants, &rti_mutex); - - initialize_rti_common(&rti_remote->base); - rti_remote->base.mutex = &rti_mutex; - - // federation_rti related initializations - rti_remote->max_start_time = 0LL; - rti_remote->num_feds_proposed_start = 0; - rti_remote->all_federates_exited = false; - rti_remote->federation_id = "Unidentified Federation"; - rti_remote->user_specified_port = 0; - rti_remote->final_port_TCP = 0; - rti_remote->socket_descriptor_TCP = -1; - 
rti_remote->final_port_UDP = UINT16_MAX; - rti_remote->socket_descriptor_UDP = -1; - rti_remote->clock_sync_global_status = clock_sync_init; - rti_remote->clock_sync_period_ns = MSEC(10); - rti_remote->clock_sync_exchanges_per_interval = 10; - rti_remote->authentication_enabled = false; - rti_remote->base.tracing_enabled = false; - rti_remote->stop_in_progress = false; - rti_remote->number_of_transient_federates = 0; - rti_remote->phase = startup_phase; -} + rti_remote->all_federates_exited = true; -void free_scheduling_nodes(scheduling_node_t** scheduling_nodes, uint16_t number_of_scheduling_nodes) { - for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) { - // FIXME: Gives error freeing memory not allocated!!!! - scheduling_node_t* node = scheduling_nodes[i]; - if (node->upstream != NULL) - free(node->upstream); - if (node->downstream != NULL) - free(node->downstream); + // Shutdown and close the socket that is listening for incoming connections + // so that the accept() call in respond_to_erroneous_connections returns. + // That thread should then check rti->all_federates_exited and it should exit. + if (shutdown(socket_descriptor, SHUT_RDWR)) { + LF_PRINT_LOG("On shut down TCP socket, received reply: %s", strerror(errno)); + } + // NOTE: In all common TCP/IP stacks, there is a time period, + // typically between 30 and 120 seconds, called the TIME_WAIT period, + // before the port is released after this close. This is because + // the OS is preventing another program from accidentally receiving + // duplicated packets intended for this program. 
+ close(socket_descriptor); + + if (rti_remote->socket_descriptor_UDP > 0) { + if (shutdown(rti_remote->socket_descriptor_UDP, SHUT_RDWR)) { + LF_PRINT_LOG("On shut down UDP socket, received reply: %s", strerror(errno)); + } + close(rti_remote->socket_descriptor_UDP); + } + } + + void initialize_RTI(rti_remote_t * rti) { + rti_remote = rti; + + // Initialize thread synchronization primitives + LF_MUTEX_INIT(&rti_mutex); + LF_COND_INIT(&received_start_times, &rti_mutex); + LF_COND_INIT(&sent_start_time, &rti_mutex); + LF_COND_INIT(&updated_delayed_grants, &rti_mutex); + + initialize_rti_common(&rti_remote->base); + rti_remote->base.mutex = &rti_mutex; + + // federation_rti related initializations + rti_remote->max_start_time = 0LL; + rti_remote->num_feds_proposed_start = 0; + rti_remote->all_federates_exited = false; + rti_remote->federation_id = "Unidentified Federation"; + rti_remote->user_specified_port = 0; + rti_remote->final_port_TCP = 0; + rti_remote->socket_descriptor_TCP = -1; + rti_remote->final_port_UDP = UINT16_MAX; + rti_remote->socket_descriptor_UDP = -1; + rti_remote->clock_sync_global_status = clock_sync_init; + rti_remote->clock_sync_period_ns = MSEC(10); + rti_remote->clock_sync_exchanges_per_interval = 10; + rti_remote->authentication_enabled = false; + rti_remote->base.tracing_enabled = false; + rti_remote->stop_in_progress = false; + rti_remote->number_of_transient_federates = 0; + rti_remote->phase = startup_phase; + } + + void free_scheduling_nodes(scheduling_node_t * *scheduling_nodes, uint16_t number_of_scheduling_nodes) { + for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) { + // FIXME: Gives error freeing memory not allocated!!!! 
+ scheduling_node_t* node = scheduling_nodes[i]; + if (node->upstream != NULL) + free(node->upstream); + if (node->downstream != NULL) + free(node->downstream); + } + free(scheduling_nodes); } - free(scheduling_nodes); -} #endif // STANDALONE_RTI From 38c571abf4301369ee23f6ab7f51aabdf58a830e Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 24 Jul 2024 15:53:28 +0100 Subject: [PATCH 109/148] Update lingua-franca-ref.txt --- lingua-franca-ref.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingua-franca-ref.txt b/lingua-franca-ref.txt index 52199a147..23343040e 100644 --- a/lingua-franca-ref.txt +++ b/lingua-franca-ref.txt @@ -1 +1 @@ -transient-fed +transient-fed-cycles From 755f8faa97fe701d21780d6b700287c2767b3fb0 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Sat, 27 Jul 2024 11:10:35 -0400 Subject: [PATCH 110/148] Revert to 0 microstep --- core/federated/RTI/rti_remote.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 9c1d02287..33e4bba7d 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1352,6 +1352,9 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque // For every downstream that has a pending grant that is higher than the // effective_start_time of the federate, cancel it. + // FIXME: Should this be higher-than or equal to? + // FIXME: Also, won't the grant simply be lost? + // If the joining federate doesn't send anything, the downstream federate won't issue another NET. 
for (int j = 0; j < my_fed->enclave.num_downstream; j++) { federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); From 7d9733113c0e7d914187af7f7227f6a108b8a46d Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 7 Aug 2024 19:20:15 +0100 Subject: [PATCH 111/148] Fix lf_get_federates_bin_directory() + Fix its scope, as well as lf_get_federation_id() --- core/federated/federate.c | 9 +-------- include/core/federated/federate.h | 18 ++++++++++++++++++ include/core/utils/util.h | 7 ------- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 62e835c78..40a73e9e3 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2716,14 +2716,7 @@ void lf_stop() { } char* lf_get_federates_bin_directory() { - bool bin_directory_defined = false; -#ifdef LF_FEDERATES_BIN_DIRECTORY - bin_directory_defined = true; -#endif - if (bin_directory_defined) { - return LF_FEDERATES_BIN_DIRECTORY; - } - return NULL; + return LF_SOURCE_GEN_DIRECTORY LF_FILE_SEPARATOR ".." LF_FILE_SEPARATOR ".." LF_FILE_SEPARATOR "bin"; } const char* lf_get_federation_id() { return federation_metadata.federation_id; } diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index 26550d60c..ef5c4a02d 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -530,4 +530,22 @@ void lf_synchronize_with_other_federates(); */ bool lf_update_max_level(tag_t tag, bool is_provisional); +/** + * @brief Return the directory containing the executables of the individual + * federates. + * + * This function is useful for testing purposes only. + * Note that it assumes that all federates are running on the same machine. + * In order for a program to use this function, it needs to include "federate.h" in the preamble. + */ +char* lf_get_federates_bin_directory(); + +/** + * @brief Returns the federation id. 
+ * + * This function is useful for testing purposes only. + * In order for a program to use this function, it needs to include "federate.h" in the preamble. + */ +const char* lf_get_federation_id(); + #endif // FEDERATE_H diff --git a/include/core/utils/util.h b/include/core/utils/util.h index 39f910b4d..34ce8a301 100644 --- a/include/core/utils/util.h +++ b/include/core/utils/util.h @@ -204,11 +204,4 @@ void lf_vprint_error_and_exit(const char* format, va_list args) ATTRIBUTE_FORMAT */ void lf_stop(); -/** - * @brief Returns the federation id. - * - * This function is useful for creating federates on runtime. - */ -char* lf_get_federation_id(); - #endif /* UTIL_H */ From f4e96237c4f9dbb4df5131d85432b74bd8f69dd3 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 7 Aug 2024 19:41:33 +0100 Subject: [PATCH 112/148] Remove no more needed LF_FEDERATED_BIN_DIRECTORY --- core/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 04443d5ab..6d938ae0c 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -181,7 +181,6 @@ define(SCHEDULER) define(LF_SOURCE_DIRECTORY) define(LF_SOURCE_GEN_DIRECTORY) define(LF_PACKAGE_DIRECTORY) -define(LF_FEDERATES_BIN_DIRECTORY) define(LF_FILE_SEPARATOR) define(WORKERS_NEEDED_FOR_FEDERATE) define(LF_ENCLAVES) From d6d1d0e542aa2974668b71735a0e845409d43af7 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 7 Aug 2024 19:41:53 +0100 Subject: [PATCH 113/148] Run Clang formatter --- include/core/federated/network/net_common.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index a58476a10..5c341a90e 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -663,14 +663,14 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * They are limited to one byte (uchar). 
*/ typedef enum { - FEDERATION_ID_DOES_NOT_MATCH = 1, - FEDERATE_ID_IN_USE = 2, - FEDERATE_ID_OUT_OF_RANGE = 3, - UNEXPECTED_MESSAGE = 4, - WRONG_SERVER = 5, - HMAC_DOES_NOT_MATCH = 6, - RTI_NOT_EXECUTED_WITH_AUTH = 7, - JOINING_TOO_LATE = 8 + FEDERATION_ID_DOES_NOT_MATCH = 1, + FEDERATE_ID_IN_USE = 2, + FEDERATE_ID_OUT_OF_RANGE = 3, + UNEXPECTED_MESSAGE = 4, + WRONG_SERVER = 5, + HMAC_DOES_NOT_MATCH = 6, + RTI_NOT_EXECUTED_WITH_AUTH = 7, + JOINING_TOO_LATE = 8 } rejection_code_t; #endif /* NET_COMMON_H */ From 09e1f870e3bc00858e14112c4cc415002ffd6a21 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 14 Aug 2024 07:49:03 +0100 Subject: [PATCH 114/148] Remove overlooked code when merging --- core/federated/federate.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 40a73e9e3..820d03465 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2721,4 +2721,29 @@ char* lf_get_federates_bin_directory() { const char* lf_get_federation_id() { return federation_metadata.federation_id; } -#endif +#ifdef FEDERATED_DECENTRALIZED +instant_t lf_wait_until_time(tag_t tag) { + instant_t result = tag.time; // Default. + + // Do not add the STA if the tag is the starting tag. + if (tag.time != start_time || tag.microstep != 0u) { + + // Apply the STA to the logical time, but only if at least one network input port is not known up to this tag. + // Subtract one microstep because it is sufficient to commit to a tag if the input ports are known + // up to one microstep earlier. 
+ if (tag.microstep > 0) { + tag.microstep--; + } else { + tag.microstep = UINT_MAX; + tag.time -= 1; + } + + if (!inputs_known_to(tag)) { + result = lf_time_add(result, lf_fed_STA_offset); + } + } + return result; +} +#endif // FEDERATED_DECENTRALIZED + +#endif // FEDERATED From 23b714c4f06750053e5ccc5a27ba2f9767297952 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 21 Aug 2024 11:26:01 +0100 Subject: [PATCH 115/148] Handle corner cases where connection messages about transients are received before the start time --- core/federated/federate.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 820d03465..94eac6f21 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -958,17 +958,34 @@ static instant_t get_start_time_from_rti(instant_t my_physical_time) { size_t buffer_length = MSG_TYPE_TIMESTAMP_START_LENGTH; unsigned char buffer[buffer_length]; - read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, buffer_length, buffer, NULL, - "Failed to read MSG_TYPE_TIMESTAMP_START message from RTI."); - LF_PRINT_DEBUG("Read 21 bytes."); - - // First byte received is the message ID. - if (buffer[0] != MSG_TYPE_TIMESTAMP_START) { - if (buffer[0] == MSG_TYPE_FAILED) { - lf_print_error_and_exit("RTI has failed."); + while (true) { + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, 1, buffer, NULL, + "Failed to read MSG_TYPE_TIMESTAMP_START message from RTI."); + // First byte received is the message ID. + if (buffer[0] != MSG_TYPE_TIMESTAMP_START) { + if (buffer[0] == MSG_TYPE_FAILED) { + lf_print_error_and_exit("RTI has failed."); + } else if (buffer[0] == MSG_TYPE_UPSTREAM_CONNECTED) { + // We need to swallow this message so that we continue waiting for MSG_TYPE_TIMESTAMP_START to arrive + // FIXME: Shouldn't we keep the ids, so that these messages are handled right after the startime is set? 
+ read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, MSG_TYPE_UPSTREAM_CONNECTED_LENGTH - 1, buffer + 1, NULL, + "Failed to complete reading MSG_TYPE_UPSTREAM_CONNECTED."); + continue; + } else if (buffer[0] == MSG_TYPE_UPSTREAM_DISCONNECTED) { + // We need to swallow this message so that we continue waiting for MSG_TYPE_TIMESTAMP_START to arrive + // FIXME: Shouldn't we keep the ids, so that these messages are handled right after the startime is set? + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, MSG_TYPE_UPSTREAM_DISCONNECTED_LENGTH - 1, buffer + 1, + NULL, "Failed to complete reading MSG_TYPE_UPSTREAM_DISCONNECTED."); + continue; + } else { + lf_print_error_and_exit("Expected a MSG_TYPE_TIMESTAMP_START message from the RTI. Got %u (see net_common.h).", + buffer[0]); + } + } else { + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, buffer_length - 1, buffer + 1, NULL, + "Failed to read MSG_TYPE_TIMESTAMP_START message from RTI."); + break; } - lf_print_error_and_exit("Expected a MSG_TYPE_TIMESTAMP_START message from the RTI. 
Got %u (see net_common.h).", - buffer[0]); } instant_t timestamp = extract_int64(&(buffer[1])); From d14f811a3491c6df4f90b3f8deea4f4fdec18cf3 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 21 Aug 2024 11:28:15 +0100 Subject: [PATCH 116/148] Reset timing info from previous runs --- core/federated/RTI/rti_remote.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 33e4bba7d..5f8a129fb 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -2597,18 +2597,22 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque } void reset_transient_federate(federate_info_t * fed) { + // Reset all the timing information from the previous run + fed->enclave.completed = NEVER_TAG; + fed->enclave.last_granted = NEVER_TAG; + fed->enclave.last_provisionally_granted = NEVER_TAG; fed->enclave.next_event = NEVER_TAG; - fed->enclave.state = NOT_CONNECTED; // Reset of the federate-related attributes fed->socket = -1; // No socket. fed->clock_synchronization_enabled = true; + // FIXME: The following two lines can be improved? 
+ pqueue_tag_free(fed->in_transit_message_tags); fed->in_transit_message_tags = pqueue_tag_init(10); strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); fed->server_ip_addr.s_addr = 0; fed->server_port = -1; fed->requested_stop = false; - fed->is_transient = true; - fed->effective_start_tag = NEVER_TAG; + invalidate_min_delays_upstream(&(fed->enclave)); } int32_t start_rti_server(uint16_t port) { From 35e7bc7211c5134bf03b76e82b595363389bd1ac Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 21 Aug 2024 11:56:09 +0100 Subject: [PATCH 117/148] Skip checking the satet in the first call of _update_min_delays_upsteam --- core/federated/RTI/rti_common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/federated/RTI/rti_common.c b/core/federated/RTI/rti_common.c index f1229493f..d0f063652 100644 --- a/core/federated/RTI/rti_common.c +++ b/core/federated/RTI/rti_common.c @@ -305,9 +305,10 @@ static void _update_min_delays_upstream(scheduling_node_t* end, scheduling_node_ // Not the first call, so intermediate is upstream of end. delay_from_intermediate_so_far = path_delays[intermediate->id]; } - if (intermediate->state == NOT_CONNECTED) { + if (intermediate->state == NOT_CONNECTED && end->id != intermediate->id) { // Enclave or federate is not connected. // No point in checking upstream scheduling_nodes. + // Skip the first call return; } // Check nodes upstream of intermediate (or end on first call). 
From 0dbe9f68405ad396db35b100b012fc898f2d5a8d Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 28 Aug 2024 14:23:54 +0100 Subject: [PATCH 118/148] Invalidate min delays of all federates once a tansient joins --- core/federated/RTI/rti_remote.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 5f8a129fb..06002b0c2 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1379,6 +1379,14 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque // from an upstream federate is forwarded before the start tag. send_start_tag(my_fed, start_time, my_fed->effective_start_tag); + // Whenver a transient joins, invalidate all federates, so that all min_delays_upstream + // get re-computed. + // FIXME: Needs to be optimized to only invalidate those affected by the transient + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + invalidate_min_delays_upstream(&(fed->enclave)); + } + LF_MUTEX_UNLOCK(&rti_mutex); } } @@ -2612,7 +2620,7 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque fed->server_ip_addr.s_addr = 0; fed->server_port = -1; fed->requested_stop = false; - invalidate_min_delays_upstream(&(fed->enclave)); + // invalidate_all_min_delays(); } int32_t start_rti_server(uint16_t port) { From 00de25cf077a86ae2a11cf2d80664ce94fb874e7 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 28 Aug 2024 14:27:03 +0100 Subject: [PATCH 119/148] Do not skip the node itself in _updat_min_delays --- core/federated/RTI/rti_common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/federated/RTI/rti_common.c b/core/federated/RTI/rti_common.c index d0f063652..f1229493f 100644 --- a/core/federated/RTI/rti_common.c +++ b/core/federated/RTI/rti_common.c @@ -305,10 +305,9 @@ static void 
_update_min_delays_upstream(scheduling_node_t* end, scheduling_node_ // Not the first call, so intermediate is upstream of end. delay_from_intermediate_so_far = path_delays[intermediate->id]; } - if (intermediate->state == NOT_CONNECTED && end->id != intermediate->id) { + if (intermediate->state == NOT_CONNECTED) { // Enclave or federate is not connected. // No point in checking upstream scheduling_nodes. - // Skip the first call return; } // Check nodes upstream of intermediate (or end on first call). From 58eecb11362b42579e81daf9218869c0dcd2817b Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 2 Dec 2024 09:05:33 +0100 Subject: [PATCH 120/148] Removed duplicated function --- core/federated/federate.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/core/federated/federate.c b/core/federated/federate.c index 94eac6f21..6b0ef6975 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -295,6 +295,34 @@ static void update_last_known_status_on_input_port(environment_t* env, tag_t tag } } +/** + * @brief Mark all the input ports connected to the given federate as known to be absent until FOREVER. + * + * This does nothing if the federate is not using decentralized coordination. + * This function acquires the mutex on the top-level environment. + * @param fed_id The ID of the federate. + */ +static void mark_inputs_known_absent(int fed_id) { +#ifdef FEDERATED_DECENTRALIZED + // Note that when transient federates are supported, this will need to be updated because the + // federate could rejoin. + environment_t* env; + _lf_get_environments(&env); + LF_MUTEX_LOCK(&env->mutex); + + for (size_t i = 0; i < _lf_action_table_size; i++) { + lf_action_base_t* action = _lf_action_table[i]; + if (action->source_id == fed_id) { + update_last_known_status_on_input_port(env, FOREVER_TAG, i); + } + } + LF_MUTEX_UNLOCK(&env->mutex); +#else + // Do nothing, except suppress unused parameter error. 
+ (void)fed_id; +#endif // FEDERATED_DECENTRALIZED +} + /** * @brief Update the last known status tag of a network input action. * From 4d6587b19a73292edf2d26c82f487a71f2e84e4f Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 2 Dec 2024 09:32:59 +0100 Subject: [PATCH 121/148] Formatting --- include/core/utils/pqueue_tag.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/core/utils/pqueue_tag.h b/include/core/utils/pqueue_tag.h index dec61a5cd..907428cac 100644 --- a/include/core/utils/pqueue_tag.h +++ b/include/core/utils/pqueue_tag.h @@ -218,7 +218,7 @@ void pqueue_tag_dump(pqueue_tag_t* q); /** * @brief Return the maximum tag in the queue or NEVER_TAG if the queue is empty. - * + * * @param q The queue. */ tag_t pqueue_tag_max_tag(pqueue_tag_t* q); From 6e9b2a654c6ba79a1a13ca12119691f9b633c692 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 2 Dec 2024 10:12:41 +0100 Subject: [PATCH 122/148] Attempt to pass tests by manually adding prototypes to lf code --- include/core/federated/federate.h | 34 +++++++++++++++---------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index ef5c4a02d..dd828b902 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -530,22 +530,22 @@ void lf_synchronize_with_other_federates(); */ bool lf_update_max_level(tag_t tag, bool is_provisional); -/** - * @brief Return the directory containing the executables of the individual - * federates. - * - * This function is useful for testing purposes only. - * Note that it assumes that all federates are running on the same machine. - * In order for a program to use this function, it needs to include "federate.h" in the preamble. - */ -char* lf_get_federates_bin_directory(); - -/** - * @brief Returns the federation id. - * - * This function is useful for testing purposes only. 
- * In order for a program to use this function, it needs to include "federate.h" in the preamble. - */ -const char* lf_get_federation_id(); +// /** +// * @brief Return the directory containing the executables of the individual +// * federates. +// * +// * This function is useful for testing purposes only. +// * Note that it assumes that all federates are running on the same machine. +// * In order for a program to use this function, it needs to include "federate.h" in the preamble. +// */ +// char* lf_get_federates_bin_directory(); + +// /** +// * @brief Returns the federation id. +// * +// * This function is useful for testing purposes only. +// * In order for a program to use this function, it needs to include "federate.h" in the preamble. +// */ +// const char* lf_get_federation_id(); #endif // FEDERATE_H From 96efa72c8c312a9f951c24d7581501f2b74148b3 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Mon, 2 Dec 2024 11:48:19 -0800 Subject: [PATCH 123/148] Removed lf_get_federates_bin_directory. Use LF_FED_PACKAGE_DIRECTORY --- core/federated/federate.c | 4 ---- include/core/federated/federate.h | 10 ---------- 2 files changed, 14 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 6b0ef6975..a308fb80e 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2760,10 +2760,6 @@ void lf_stop() { LF_PRINT_LOG("Federate is stopping."); } -char* lf_get_federates_bin_directory() { - return LF_SOURCE_GEN_DIRECTORY LF_FILE_SEPARATOR ".." LF_FILE_SEPARATOR ".." 
LF_FILE_SEPARATOR "bin"; -} - const char* lf_get_federation_id() { return federation_metadata.federation_id; } #ifdef FEDERATED_DECENTRALIZED diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index dd828b902..b9c6d5dae 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -530,16 +530,6 @@ void lf_synchronize_with_other_federates(); */ bool lf_update_max_level(tag_t tag, bool is_provisional); -// /** -// * @brief Return the directory containing the executables of the individual -// * federates. -// * -// * This function is useful for testing purposes only. -// * Note that it assumes that all federates are running on the same machine. -// * In order for a program to use this function, it needs to include "federate.h" in the preamble. -// */ -// char* lf_get_federates_bin_directory(); - // /** // * @brief Returns the federation id. // * From 98032c11a6f8690170553386d871b17ef938bad9 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 4 Dec 2024 14:11:56 +0100 Subject: [PATCH 124/148] Do not account for absent trnsients when calculating efimt --- core/federated/RTI/rti_common.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/federated/RTI/rti_common.c b/core/federated/RTI/rti_common.c index f1229493f..fd9b01ee3 100644 --- a/core/federated/RTI/rti_common.c +++ b/core/federated/RTI/rti_common.c @@ -94,6 +94,8 @@ tag_t earliest_future_incoming_message_tag(scheduling_node_t* e) { for (int i = 0; i < e->num_min_delays; i++) { // Node e->min_delays[i].id is upstream of e with min delay e->min_delays[i].min_delay. scheduling_node_t* upstream = rti_common->scheduling_nodes[e->min_delays[i].id]; + if (upstream->state == NOT_CONNECTED) + continue; // If we haven't heard from the upstream node, then assume it can send an event at the start time. 
if (lf_tag_compare(upstream->next_event, NEVER_TAG) == 0) { tag_t start_tag = {.time = start_time, .microstep = 0}; From 666bc1f7ff42ef673aba8d8cd86c84466ff2ed6a Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Sun, 8 Dec 2024 15:23:29 -0800 Subject: [PATCH 125/148] Restored declaration of lf_get_federation_id --- include/core/federated/federate.h | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index b9c6d5dae..e7f99eaa9 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -477,6 +477,11 @@ int lf_send_tagged_message(environment_t* env, interval_t additional_delay, int */ void lf_set_federation_id(const char* fid); +/** + * @brief Return the federation id. + */ +const char* lf_get_federation_id(); + #ifdef FEDERATED_DECENTRALIZED /** * @brief Spawn a thread to iterate through STAA structs. @@ -530,12 +535,19 @@ void lf_synchronize_with_other_federates(); */ bool lf_update_max_level(tag_t tag, bool is_provisional); -// /** -// * @brief Returns the federation id. -// * -// * This function is useful for testing purposes only. -// * In order for a program to use this function, it needs to include "federate.h" in the preamble. -// */ -// const char* lf_get_federation_id(); +#ifdef FEDERATED_DECENTRALIZED +/** + * @brief Return the physical time that we should wait until before advancing to the specified tag. + * + * This function adds the STA offset (STP_offset parameter) to the time of the specified tag unless + * the tag is the starting tag (it is always safe to advance to the starting tag). It also avoids + * adding the STA offset if all network input ports are known at least up to one microstep earlier + * than the specified tag. + * + * This function assumes that the caller holds the environment mutex. + * @param time The specified time. 
+ */ +instant_t lf_wait_until_time(tag_t tag); +#endif // FEDERATED_DECENTRALIZED #endif // FEDERATE_H From 832c831ca96a1b7ac38294a0fabf73dce2ab7ea9 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 9 Dec 2024 13:07:42 +0100 Subject: [PATCH 126/148] Attempt to make sure a persistent knows that a transient joined --- core/federated/RTI/rti_remote.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 06002b0c2..20dbc0b98 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1226,11 +1226,18 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque LF_PRINT_LOG("RTI sent start time " PRINTF_TIME " to federate %d.", start_time, my_fed->enclave.id); // Notify downstream federates of this now connected transient. - for (int i = 0; i < my_fed->enclave.num_upstream; i++) { - send_upstream_connected_locked(GET_FED_INFO(my_fed->enclave.upstream[i]), my_fed); + for (int i = 0; i < my_fed->enclave.num_downstream; i++) { + send_upstream_connected_locked(GET_FED_INFO(my_fed->enclave.downstream[i]), my_fed); } - LF_MUTEX_UNLOCK(&rti_mutex); + // A corner case was identified where a transient joins at tag (0, 0) and one of its persistent downstreams misses + // the notification. The following is an attempt to make sure it is notified. 
+ for (int i = 0; i < my_fed->enclave.num_upstream; i++) { + federate_info_t* fed = GET_FED_INFO(my_fed->enclave.upstream[i]); + if (fed->is_transient && fed->enclave.state == GRANTED) { + send_upstream_connected_locked(my_fed, fed); + } + } } void handle_timestamp(federate_info_t * my_fed) { From b85bcac5a42393c45424f5a85a4c132aaa61fb25 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Tue, 10 Dec 2024 15:13:30 +0100 Subject: [PATCH 127/148] Fix send_start_tag() function name, comments, and actions ordering --- core/federated/RTI/rti_remote.c | 80 ++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 20dbc0b98..47c87abbe 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -425,23 +425,23 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque } /** - * @brief Send MSG_TYPE_UPSTREAM_CONNECTED to the specified federate. - * + * @brief Send MSG_TYPE_UPSTREAM_CONNECTED to the specified federate, only if it is connected. * * This function assumes that the mutex lock is already held. * @param destination The destination federate. * @param disconnected The connected federate. */ static void send_upstream_connected_locked(federate_info_t * destination, federate_info_t * connected) { - if (!connected->is_transient) { - // No need to send connected message for persistent federates. 
+ if (destination->enclave.state == NOT_CONNECTED) { + LF_PRINT_LOG("RTI did not send upstream connected message to federate %d, because it is not connected.", + destination->enclave.id); return; } unsigned char buffer[MSG_TYPE_UPSTREAM_CONNECTED_LENGTH]; buffer[0] = MSG_TYPE_UPSTREAM_CONNECTED; encode_uint16(connected->enclave.id, &buffer[1]); if (write_to_socket_close_on_error(&destination->socket, MSG_TYPE_UPSTREAM_CONNECTED_LENGTH, buffer)) { - lf_print_warning("RTI: Failed to send upstream connected message to federate %d.", connected->enclave.id); + lf_print_warning("RTI: Failed to send upstream connected message to federate %d.", destination->enclave.id); } } @@ -1183,32 +1183,53 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque } /** - * Send to the start time to the federate my_fed. - * This function assumes the caller does not hold the mutex. + * Send the start time and tag to the federate my_fed. * - * If it is the startup phase, the start_time will be the maximum received timestamps - * plus an offset. The federate will then receive identical federation_start_time - * and federate_start_tag.time (the federate_start_tag.microstep will be 0). - * If, however, the startup phase is passed, the federate will receive different - * values than stated above. + * During the startup phase, the start_time is calculated as the maximum received timestamp, plus an offset. + * The federate will then receive identical values for federation_start_time` and `federate_start_tag.time` (with + * `federate_start_tag.microstep` set to 0). After the startup phase, the federate will receive different values for + * these parameters. * - * This will also notify federates downstream of my_fed that this federate is now - * connected. This is important when there are zero-delay cycles. 
+ * Before sending the start time and tag, this function performs the following actions: + * - If my_fed is transient, notify federates downstream of its connection, ensuring proper handling of zero-delay + * cycles. + * - Notify my_fed of all upstream transient federates that are connected. * - * This function assumes the caller holds the mutex. + * This function assumes that the mutex lock is already held. * * @param my_fed the federate to send the start time to. * @param federation_start_time the federation start_time * @param federate_start_tag the federate effective start tag */ - static void send_start_tag(federate_info_t * my_fed, instant_t federation_start_time, tag_t federate_start_tag) { + static void send_start_tag_locked(federate_info_t * my_fed, instant_t federation_start_time, + tag_t federate_start_tag) { + // If this is a transient federate, notify its downstream federates that it is now connected. + if (my_fed->is_transient) { + for (int i = 0; i < my_fed->enclave.num_downstream; i++) { + send_upstream_connected_locked(GET_FED_INFO(my_fed->enclave.downstream[i]), my_fed); + } + } + // A corner case occurs when an upstream transient joins at tag (0, 0), but `my_fed` + // either misses the notification or receives it late. The following ensures that + // `my_fed` is informed of all currently connected upstream transients. + // This also prevents `my_fed` from receiving the start time and starting execution + // before the upstream connection message is received. + // This also deals with an even less likely corner case where two transients are joining simultaneously and one is + // upstream of the other. + for (int i = 0; i < my_fed->enclave.num_upstream; i++) { + federate_info_t* fed = GET_FED_INFO(my_fed->enclave.upstream[i]); + if (fed->is_transient && fed->enclave.state == GRANTED) { + send_upstream_connected_locked(my_fed, fed); + } + } + // Send back to the federate the maximum time plus an offset on a TIMESTAMP_START // message. 
// In the startup phase, federates will receive identical start_time and // effective_start_tag unsigned char start_time_buffer[MSG_TYPE_TIMESTAMP_START_LENGTH]; start_time_buffer[0] = MSG_TYPE_TIMESTAMP_START; - encode_int64(swap_bytes_if_big_endian_int64(start_time), &start_time_buffer[1]); + encode_int64(swap_bytes_if_big_endian_int64(federation_start_time), &start_time_buffer[1]); encode_tag(&(start_time_buffer[1 + sizeof(instant_t)]), federate_start_tag); if (rti_remote->base.tracing_enabled) { @@ -1218,26 +1239,12 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); } - // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP - // message has been sent. That MSG_TYPE_TIMESTAMP message grants time advance to - // the federate to the start time. + // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP_START + // message has been sent. That MSG_TYPE_TIMESTAMP_START message grants time advance to + // the federate to the federate_start_tag.time. my_fed->enclave.state = GRANTED; lf_cond_broadcast(&sent_start_time); LF_PRINT_LOG("RTI sent start time " PRINTF_TIME " to federate %d.", start_time, my_fed->enclave.id); - - // Notify downstream federates of this now connected transient. - for (int i = 0; i < my_fed->enclave.num_downstream; i++) { - send_upstream_connected_locked(GET_FED_INFO(my_fed->enclave.downstream[i]), my_fed); - } - - // A corner case was identified where a transient joins at tag (0, 0) and one of its persistent downstreams misses - // the notification. The following is an attempt to make sure it is notified. 
- for (int i = 0; i < my_fed->enclave.num_upstream; i++) { - federate_info_t* fed = GET_FED_INFO(my_fed->enclave.upstream[i]); - if (fed->is_transient && fed->enclave.state == GRANTED) { - send_upstream_connected_locked(my_fed, fed); - } - } } void handle_timestamp(federate_info_t * my_fed) { @@ -1286,7 +1293,10 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque LF_MUTEX_UNLOCK(&rti_mutex); // Notify the federate of its start tag. + // This has to be done while still holding the mutex. send_start_tag(my_fed, start_time, my_fed->effective_start_tag); + + LF_MUTEX_UNLOCK(&rti_mutex); } else if (rti_remote->phase == shutdown_phase || !my_fed->is_transient) { LF_MUTEX_UNLOCK(&rti_mutex); @@ -1384,7 +1394,7 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque // Have to send the start tag while still holding the mutex to ensure that no message // from an upstream federate is forwarded before the start tag. - send_start_tag(my_fed, start_time, my_fed->effective_start_tag); + send_start_tag_locked(my_fed, start_time, my_fed->effective_start_tag); // Whenver a transient joins, invalidate all federates, so that all min_delays_upstream // get re-computed. From de5a81bcfc8a3e7c520912cacb6f60b44d15f37d Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 11 Dec 2024 12:18:52 +0100 Subject: [PATCH 128/148] Fix message ordering in send_start_tag_locked() + apply suggestions to improve comments --- core/federated/RTI/rti_remote.c | 58 ++++++++++++++++----------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 47c87abbe..f5bc13603 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -425,7 +425,8 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque } /** - * @brief Send MSG_TYPE_UPSTREAM_CONNECTED to the specified federate, only if it is connected. 
+ * @brief Send MSG_TYPE_UPSTREAM_CONNECTED to the specified `destination` if it is connected to the RTI, + * telling it that the specified `upstream` federate is also now connected. * * This function assumes that the mutex lock is already held. * @param destination The destination federate. @@ -1183,17 +1184,17 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque } /** - * Send the start time and tag to the federate my_fed. + * @brief Send the global federation start time and the federate-specific starting tag to the specified federate. * - * During the startup phase, the start_time is calculated as the maximum received timestamp, plus an offset. - * The federate will then receive identical values for federation_start_time` and `federate_start_tag.time` (with - * `federate_start_tag.microstep` set to 0). After the startup phase, the federate will receive different values for - * these parameters. + * For persistent federates and transient federates that happen to join during federation startup, the + * `federation_start_time` will match the time in the `federate_start_tag`, and the microstep will be 0. + * For a transient federate that joins later, the time in the `federate_start_tag` will be greater than the + * federation_start_time`. * - * Before sending the start time and tag, this function performs the following actions: - * - If my_fed is transient, notify federates downstream of its connection, ensuring proper handling of zero-delay - * cycles. - * - Notify my_fed of all upstream transient federates that are connected. + * + * Before sending the start time and tag, this function notifies my_fed of all upstream transient federates that are + * connected. After sending the start time and tag, and if my_fed is transient, notify federates downstream of its + * connection, ensuring proper handling of zero-delay cycles. * * This function assumes that the mutex lock is already held. 
* @@ -1203,19 +1204,9 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque */ static void send_start_tag_locked(federate_info_t * my_fed, instant_t federation_start_time, tag_t federate_start_tag) { - // If this is a transient federate, notify its downstream federates that it is now connected. - if (my_fed->is_transient) { - for (int i = 0; i < my_fed->enclave.num_downstream; i++) { - send_upstream_connected_locked(GET_FED_INFO(my_fed->enclave.downstream[i]), my_fed); - } - } - // A corner case occurs when an upstream transient joins at tag (0, 0), but `my_fed` - // either misses the notification or receives it late. The following ensures that - // `my_fed` is informed of all currently connected upstream transients. - // This also prevents `my_fed` from receiving the start time and starting execution - // before the upstream connection message is received. - // This also deals with an even less likely corner case where two transients are joining simultaneously and one is - // upstream of the other. + // Notify my_fed of any upstream transient federates that are connected. + // This has to occur before sending the start tag so that my_fed does not begin executing thinking that these + // upstream federates are not connected. for (int i = 0; i < my_fed->enclave.num_upstream; i++) { federate_info_t* fed = GET_FED_INFO(my_fed->enclave.upstream[i]); if (fed->is_transient && fed->enclave.state == GRANTED) { @@ -1237,14 +1228,21 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque } if (write_to_socket(my_fed->socket, MSG_TYPE_TIMESTAMP_START_LENGTH, start_time_buffer)) { lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); + } else { + // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP_START + // message has been sent. That MSG_TYPE_TIMESTAMP_START message grants time advance to + // the federate to the federate_start_tag.time. 
+ my_fed->enclave.state = GRANTED; + lf_cond_broadcast(&sent_start_time); + LF_PRINT_LOG("RTI sent start time " PRINTF_TIME " to federate %d.", start_time, my_fed->enclave.id); + + // If this is a transient federate, notify its downstream federates that it is now connected. + if (my_fed->is_transient) { + for (int i = 0; i < my_fed->enclave.num_downstream; i++) { + send_upstream_connected_locked(GET_FED_INFO(my_fed->enclave.downstream[i]), my_fed); + } + } } - - // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP_START - // message has been sent. That MSG_TYPE_TIMESTAMP_START message grants time advance to - // the federate to the federate_start_tag.time. - my_fed->enclave.state = GRANTED; - lf_cond_broadcast(&sent_start_time); - LF_PRINT_LOG("RTI sent start time " PRINTF_TIME " to federate %d.", start_time, my_fed->enclave.id); } void handle_timestamp(federate_info_t * my_fed) { From 09deefffc07f028326ee3a36696cd91be0027b60 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 11 Dec 2024 14:33:15 +0100 Subject: [PATCH 129/148] Allow federate to handle that an upstream has connected or disconnected even before receiving the start time --- core/federated/federate.c | 94 +++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 49 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index a308fb80e..9e9fb9499 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -968,6 +968,44 @@ static int perform_hmac_authentication() { } #endif +/** + * @brief Handle message from the RTI that an upstream federate has connected. 
+ * + */ +static void handle_upstream_connected_message(void) { + size_t bytes_to_read = sizeof(uint16_t); + unsigned char buffer[bytes_to_read]; + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, + "Failed to read upstream connected message from RTI."); + uint16_t connected = extract_uint16(buffer); + LF_PRINT_DEBUG("Received notification that upstream federate %d has connected", connected); + // Mark the upstream as connected. + for (size_t i = 0; i < _lf_zero_delay_cycle_action_table_size; i++) { + if (_lf_zero_delay_cycle_upstream_ids[i] == connected) { + _lf_zero_delay_cycle_upstream_disconnected[i] = false; + } + } +} + +/** + * @brief Handle message from the RTI that an upstream federate has disconnected. + * + */ +static void handle_upstream_disconnected_message(void) { + size_t bytes_to_read = sizeof(uint16_t); + unsigned char buffer[bytes_to_read]; + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, + "Failed to read upstream disconnected message from RTI."); + uint16_t disconnected = extract_uint16(buffer); + LF_PRINT_DEBUG("Received notification that upstream federate %d has disconnected", disconnected); + // Mark the upstream as disconnected. + for (size_t i = 0; i < _lf_zero_delay_cycle_action_table_size; i++) { + if (_lf_zero_delay_cycle_upstream_ids[i] == disconnected) { + _lf_zero_delay_cycle_upstream_disconnected[i] = true; + } + } +} + /** * Send the specified timestamp to the RTI and wait for a response. 
* The specified timestamp should be current physical time of the @@ -994,16 +1032,12 @@ static instant_t get_start_time_from_rti(instant_t my_physical_time) { if (buffer[0] == MSG_TYPE_FAILED) { lf_print_error_and_exit("RTI has failed."); } else if (buffer[0] == MSG_TYPE_UPSTREAM_CONNECTED) { - // We need to swallow this message so that we continue waiting for MSG_TYPE_TIMESTAMP_START to arrive - // FIXME: Shouldn't we keep the ids, so that these messages are handled right after the startime is set? - read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, MSG_TYPE_UPSTREAM_CONNECTED_LENGTH - 1, buffer + 1, NULL, - "Failed to complete reading MSG_TYPE_UPSTREAM_CONNECTED."); + // We need to handle this message and continue waiting for MSG_TYPE_TIMESTAMP_START to arrive + handle_upstream_connected_message(); continue; } else if (buffer[0] == MSG_TYPE_UPSTREAM_DISCONNECTED) { - // We need to swallow this message so that we continue waiting for MSG_TYPE_TIMESTAMP_START to arrive - // FIXME: Shouldn't we keep the ids, so that these messages are handled right after the startime is set? - read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, MSG_TYPE_UPSTREAM_DISCONNECTED_LENGTH - 1, buffer + 1, - NULL, "Failed to complete reading MSG_TYPE_UPSTREAM_DISCONNECTED."); + // We need to handle this message and continue waiting for MSG_TYPE_TIMESTAMP_START to arrive + handle_upstream_disconnected_message(); continue; } else { lf_print_error_and_exit("Expected a MSG_TYPE_TIMESTAMP_START message from the RTI. Got %u (see net_common.h).", @@ -1560,44 +1594,6 @@ static void send_failed_signal() { */ static void handle_rti_failed_message(void) { exit(1); } -/** - * @brief Handle message from the RTI that an upstream federate has connected. 
- * - */ -static void handle_upstream_connected_message(void) { - size_t bytes_to_read = sizeof(uint16_t); - unsigned char buffer[bytes_to_read]; - read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, - "Failed to read upstream connected message from RTI."); - uint16_t connected = extract_uint16(buffer); - LF_PRINT_DEBUG("Received notification that upstream federate %d has connected", connected); - // Mark the upstream as connected. - for (size_t i = 0; i < _lf_zero_delay_cycle_action_table_size; i++) { - if (_lf_zero_delay_cycle_upstream_ids[i] == connected) { - _lf_zero_delay_cycle_upstream_disconnected[i] = false; - } - } -} - -/** - * @brief Handle message from the RTI that an upstream federate has disconnected. - * - */ -static void handle_upstream_disconnected_message(void) { - size_t bytes_to_read = sizeof(uint16_t); - unsigned char buffer[bytes_to_read]; - read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, - "Failed to read upstream disconnected message from RTI."); - uint16_t disconnected = extract_uint16(buffer); - LF_PRINT_DEBUG("Received notification that upstream federate %d has disconnected", disconnected); - // Mark the upstream as disconnected. - for (size_t i = 0; i < _lf_zero_delay_cycle_action_table_size; i++) { - if (_lf_zero_delay_cycle_upstream_ids[i] == disconnected) { - _lf_zero_delay_cycle_upstream_disconnected[i] = true; - } - } -} - /** * Thread that listens for TCP inputs from the RTI. * When messages arrive, this calls the appropriate handler. @@ -1963,9 +1959,9 @@ void lf_connect_to_rti(const char* hostname, int port) { instant_t start_connect = lf_time_physical(); while (!CHECK_TIMEOUT(start_connect, CONNECT_TIMEOUT) && !_lf_termination_executed) { - // Have connected to an RTI, but not sure it's the right RTI. - // Send a MSG_TYPE_FED_IDS message and wait for a reply. - // Notify the RTI of the ID of this federate and its federation. 
+ // Have connected to an RTI, but not sure it's the right RTI. + // Send a MSG_TYPE_FED_IDS message and wait for a reply. + // Notify the RTI of the ID of this federate and its federation. #ifdef FEDERATED_AUTHENTICATED LF_PRINT_LOG("Connected to an RTI. Performing HMAC-based authentication using federation ID."); From c465bc76ea7f6ab55fed26f75554033cd30f952a Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 11 Dec 2024 14:38:56 +0100 Subject: [PATCH 130/148] Format?! --- include/core/utils/impl/hashmap.h | 2 +- include/core/utils/impl/pointer_hashmap.h | 2 +- tag/api/tag.h | 9 ++++++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/core/utils/impl/hashmap.h b/include/core/utils/impl/hashmap.h index 94d5969a7..e64774887 100644 --- a/include/core/utils/impl/hashmap.h +++ b/include/core/utils/impl/hashmap.h @@ -19,7 +19,7 @@ #define V void* #endif #ifndef HASH_OF -#define HASH_OF(key) (size_t)key +#define HASH_OF(key) (size_t) key #endif #ifndef HASHMAP #define HASHMAP(token) hashmap##_##token diff --git a/include/core/utils/impl/pointer_hashmap.h b/include/core/utils/impl/pointer_hashmap.h index c2a60aef1..2184518b3 100644 --- a/include/core/utils/impl/pointer_hashmap.h +++ b/include/core/utils/impl/pointer_hashmap.h @@ -30,7 +30,7 @@ #define HASHMAP(token) hashmap_object2int##_##token #define K void* #define V int -#define HASH_OF(key) (size_t)key +#define HASH_OF(key) (size_t) key #include "hashmap.h" #undef HASHMAP #undef K diff --git a/tag/api/tag.h b/tag/api/tag.h index 2784e1c84..9b1a3f9ad 100644 --- a/tag/api/tag.h +++ b/tag/api/tag.h @@ -37,12 +37,15 @@ #define NEVER_TAG \ (tag_t) { .time = NEVER, .microstep = NEVER_MICROSTEP } // Need a separate initializer expression to comply with some C compilers -#define NEVER_TAG_INITIALIZER {NEVER, NEVER_MICROSTEP} +#define NEVER_TAG_INITIALIZER \ + { NEVER, NEVER_MICROSTEP } #define FOREVER_TAG \ (tag_t) { .time = FOREVER, .microstep = FOREVER_MICROSTEP } // Need a separate initializer 
expression to comply with some C compilers -#define FOREVER_TAG_INITIALIZER {FOREVER, FOREVER_MICROSTEP} -#define ZERO_TAG (tag_t){.time = 0LL, .microstep = 0u} +#define FOREVER_TAG_INITIALIZER \ + { FOREVER, FOREVER_MICROSTEP } +#define ZERO_TAG \ + (tag_t) { .time = 0LL, .microstep = 0u } // Returns true if timeout has elapsed. #define CHECK_TIMEOUT(start, duration) (lf_time_physical() > ((start) + (duration))) From fd5fe0623fa43a75643a5687563d958012a56324 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 11 Dec 2024 15:34:12 +0100 Subject: [PATCH 131/148] Formatting using clang-format-19 --- core/federated/federate.c | 6 +++--- include/core/utils/impl/hashmap.h | 2 +- include/core/utils/impl/pointer_hashmap.h | 2 +- tag/api/tag.h | 9 +++------ 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 9e9fb9499..aefebf5fe 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1959,9 +1959,9 @@ void lf_connect_to_rti(const char* hostname, int port) { instant_t start_connect = lf_time_physical(); while (!CHECK_TIMEOUT(start_connect, CONNECT_TIMEOUT) && !_lf_termination_executed) { - // Have connected to an RTI, but not sure it's the right RTI. - // Send a MSG_TYPE_FED_IDS message and wait for a reply. - // Notify the RTI of the ID of this federate and its federation. + // Have connected to an RTI, but not sure it's the right RTI. + // Send a MSG_TYPE_FED_IDS message and wait for a reply. + // Notify the RTI of the ID of this federate and its federation. #ifdef FEDERATED_AUTHENTICATED LF_PRINT_LOG("Connected to an RTI. 
Performing HMAC-based authentication using federation ID."); diff --git a/include/core/utils/impl/hashmap.h b/include/core/utils/impl/hashmap.h index e64774887..94d5969a7 100644 --- a/include/core/utils/impl/hashmap.h +++ b/include/core/utils/impl/hashmap.h @@ -19,7 +19,7 @@ #define V void* #endif #ifndef HASH_OF -#define HASH_OF(key) (size_t) key +#define HASH_OF(key) (size_t)key #endif #ifndef HASHMAP #define HASHMAP(token) hashmap##_##token diff --git a/include/core/utils/impl/pointer_hashmap.h b/include/core/utils/impl/pointer_hashmap.h index 2184518b3..c2a60aef1 100644 --- a/include/core/utils/impl/pointer_hashmap.h +++ b/include/core/utils/impl/pointer_hashmap.h @@ -30,7 +30,7 @@ #define HASHMAP(token) hashmap_object2int##_##token #define K void* #define V int -#define HASH_OF(key) (size_t) key +#define HASH_OF(key) (size_t)key #include "hashmap.h" #undef HASHMAP #undef K diff --git a/tag/api/tag.h b/tag/api/tag.h index 9b1a3f9ad..2784e1c84 100644 --- a/tag/api/tag.h +++ b/tag/api/tag.h @@ -37,15 +37,12 @@ #define NEVER_TAG \ (tag_t) { .time = NEVER, .microstep = NEVER_MICROSTEP } // Need a separate initializer expression to comply with some C compilers -#define NEVER_TAG_INITIALIZER \ - { NEVER, NEVER_MICROSTEP } +#define NEVER_TAG_INITIALIZER {NEVER, NEVER_MICROSTEP} #define FOREVER_TAG \ (tag_t) { .time = FOREVER, .microstep = FOREVER_MICROSTEP } // Need a separate initializer expression to comply with some C compilers -#define FOREVER_TAG_INITIALIZER \ - { FOREVER, FOREVER_MICROSTEP } -#define ZERO_TAG \ - (tag_t) { .time = 0LL, .microstep = 0u } +#define FOREVER_TAG_INITIALIZER {FOREVER, FOREVER_MICROSTEP} +#define ZERO_TAG (tag_t){.time = 0LL, .microstep = 0u} // Returns true if timeout has elapsed. 
#define CHECK_TIMEOUT(start, duration) (lf_time_physical() > ((start) + (duration))) From c4c2b359bd736188ffe5601387a488755bf9ee6b Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Wed, 11 Dec 2024 22:47:44 +0100 Subject: [PATCH 132/148] Fix dropping the message when a transient's effective start tag is not yet reached. --- core/federated/RTI/rti_remote.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index f5bc13603..c9dc53f8f 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -807,10 +807,26 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque LF_MUTEX_UNLOCK(&rti_mutex); return; } else { + // Do not forward the message if the federate is connected, but its + // start_time is not reached yet if (lf_tag_compare(intended_tag, fed->effective_start_tag) < 0) { - // Do not forward the message if the federate is connected, but its - // start_time is not reached yet - lf_mutex_unlock(&rti_mutex); + LF_PRINT_LOG("RTI: Effective start tag of the destination federate %d (" PRINTF_TAG "), " + "is not reached yet, while the received message tag is ()" PRINTF_TAG "). " + "Dropping message.", + federate_id, fed->effective_start_tag.time - start_time, fed->effective_start_tag.microstep, + intended_tag.time - start_time, intended_tag.microstep); + // Similarly, if the message was larger than the buffer, we must empty out the remainder also. 
+ size_t total_bytes_read = bytes_read; + while (total_bytes_read < total_bytes_to_read) { + bytes_to_read = total_bytes_to_read - total_bytes_read; + if (bytes_to_read > FED_COM_BUFFER_SIZE) { + bytes_to_read = FED_COM_BUFFER_SIZE; + } + read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, + "RTI failed to clear message chunks."); + total_bytes_read += bytes_to_read; + } + LF_MUTEX_UNLOCK(&rti_mutex); return; } } @@ -2635,6 +2651,7 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque fed->server_ip_addr.s_addr = 0; fed->server_port = -1; fed->requested_stop = false; + fed->effective_start_tag = NEVER_TAG; // invalidate_all_min_delays(); } From b942c889baacbe5b33aea83b9bf9430f4e78f4a0 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Fri, 13 Dec 2024 06:57:56 +0100 Subject: [PATCH 133/148] Simplify the code of message dropping in the remote RTI. --- core/federated/RTI/rti_remote.c | 43 ++++++--------------------------- 1 file changed, 7 insertions(+), 36 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index c9dc53f8f..c5e53547a 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -779,20 +779,14 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque // issue a TAG before this message has been forwarded. LF_MUTEX_LOCK(&rti_mutex); - // If the destination federate is no longer connected, issue a warning, - // remove the message from the socket and return. + // If the destination federate is no longer connected, or it is a transient that has not started executing yet + // (the intended tag is less than the effective start tag of the destination), issue a warning, remove the message + // from the socket, and return. federate_info_t* fed = GET_FED_INFO(federate_id); - if (fed->enclave.state == NOT_CONNECTED) { - lf_print_warning("RTI: Destination federate %d is no longer connected. 
Dropping message.", federate_id); - LF_PRINT_LOG("Fed status: next_event " PRINTF_TAG ", " - "completed " PRINTF_TAG ", " - "last_granted " PRINTF_TAG ", " - "last_provisionally_granted " PRINTF_TAG ".", - fed->enclave.next_event.time - start_time, fed->enclave.next_event.microstep, - fed->enclave.completed.time - start_time, fed->enclave.completed.microstep, - fed->enclave.last_granted.time - start_time, fed->enclave.last_granted.microstep, - fed->enclave.last_provisionally_granted.time - start_time, - fed->enclave.last_provisionally_granted.microstep); + if (fed->enclave.state == NOT_CONNECTED || lf_tag_compare(intended_tag, fed->effective_start_tag) < 0) { + lf_print_warning("RTI: Destination federate %d is not connected at logical time (" PRINTF_TAG + "). Dropping message.", + federate_id, intended_tag.time - start_time, intended_tag.microstep); // If the message was larger than the buffer, we must empty out the remainder also. size_t total_bytes_read = bytes_read; while (total_bytes_read < total_bytes_to_read) { @@ -806,29 +800,6 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque } LF_MUTEX_UNLOCK(&rti_mutex); return; - } else { - // Do not forward the message if the federate is connected, but its - // start_time is not reached yet - if (lf_tag_compare(intended_tag, fed->effective_start_tag) < 0) { - LF_PRINT_LOG("RTI: Effective start tag of the destination federate %d (" PRINTF_TAG "), " - "is not reached yet, while the received message tag is ()" PRINTF_TAG "). " - "Dropping message.", - federate_id, fed->effective_start_tag.time - start_time, fed->effective_start_tag.microstep, - intended_tag.time - start_time, intended_tag.microstep); - // Similarly, if the message was larger than the buffer, we must empty out the remainder also. 
- size_t total_bytes_read = bytes_read; - while (total_bytes_read < total_bytes_to_read) { - bytes_to_read = total_bytes_to_read - total_bytes_read; - if (bytes_to_read > FED_COM_BUFFER_SIZE) { - bytes_to_read = FED_COM_BUFFER_SIZE; - } - read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, - "RTI failed to clear message chunks."); - total_bytes_read += bytes_to_read; - } - LF_MUTEX_UNLOCK(&rti_mutex); - return; - } } LF_PRINT_DEBUG("RTI forwarding message to port %d of federate %hu of length %zu.", reactor_port_id, federate_id, From c6712c6ccead754c1f333dd87031b8a8e7dd5248 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 16 Dec 2024 11:52:36 +0100 Subject: [PATCH 134/148] Account for delay when dropping a message to be forwarded to a transient federate --- core/federated/RTI/rti_remote.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index c5e53547a..eca312503 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -780,10 +780,18 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque LF_MUTEX_LOCK(&rti_mutex); // If the destination federate is no longer connected, or it is a transient that has not started executing yet - // (the intended tag is less than the effective start tag of the destination), issue a warning, remove the message - // from the socket, and return. + // (the delayed intended tag is less than the effective start tag of the destination), issue a warning, remove the + // message from the socket, and return. 
federate_info_t* fed = GET_FED_INFO(federate_id); - if (fed->enclave.state == NOT_CONNECTED || lf_tag_compare(intended_tag, fed->effective_start_tag) < 0) { + interval_t delay = NEVER; + for (int i = 0; i < fed->enclave.num_upstream; i++) { + if (fed->enclave.upstream[i] == sending_federate->enclave.id) { + delay = fed->enclave.upstream_delay[i]; + break; + } + } + if (fed->enclave.state == NOT_CONNECTED || + lf_tag_compare(lf_delay_tag(intended_tag, delay), fed->effective_start_tag) < 0) { lf_print_warning("RTI: Destination federate %d is not connected at logical time (" PRINTF_TAG "). Dropping message.", federate_id, intended_tag.time - start_time, intended_tag.microstep); From 4c1e9c857793e6c9e35ce75c77a5d5074b532e1d Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Fri, 3 Jan 2025 12:13:09 +0100 Subject: [PATCH 135/148] Cleanup after rebase --- core/federated/RTI/rti_remote.c | 4472 ++++++++++++++----------------- core/federated/federate.c | 6 +- trace/api/types/trace_types.h | 4 + 3 files changed, 2077 insertions(+), 2405 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index eca312503..13c0cae0c 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -30,7 +30,6 @@ #include "net_util.h" #include #include "clock.h" // For lf_clock_cond_timedwait() -#include "clock.h" // For lf_clock_cond_timedwait() // Global variables defined in tag.c: extern instant_t start_time; @@ -153,2694 +152,2363 @@ pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_with_tag(pqueue_delay */ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_delayed_grants_t* q, uint16_t fed_id) { - static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_delayed_grants_t * q, - uint16_t fed_id) { - pqueue_delayed_grant_element_t* dge; - pqueue_t* _q = (pqueue_t*)q; - if (!q || q->size == 1) - return NULL; - for (int i = 1; i <= q->size; i++) { - dge = 
(pqueue_delayed_grant_element_t*)q->d[i]; - if (dge->fed_id == fed_id) { - return dge; - } - } - return NULL; - } - - // Utility functions to simplify the call of pqueue_tag routines. - // These functions mainly do the casting. - // FIXME: Should we remove the queue parameter from the functions? - - /** - * @brief Creates a priority queue of delayed grants that is sorted by tags. - * - * @param nbr_delayed_grants The size. - * @return The dynamically allocated queue or NULL. - */ - pqueue_delayed_grants_t* pqueue_delayed_grants_init(uint16_t nbr_delayed_grants) { - return (pqueue_delayed_grants_t*)pqueue_tag_init((size_t)nbr_delayed_grants); - } - - /** - * @brief Return the size of the queue. - * - * @param q The queue. - * @return The size. - */ - size_t pqueue_delayed_grants_size(pqueue_delayed_grants_t * q) { return pqueue_tag_size((pqueue_tag_t*)q); } - - /** - * @brief Insert an\ delayed grant element into the queue. - * - * @param q The queue. - * @param e The delayed grant element to insert. - * @return 0 on success - */ - int pqueue_delayed_grants_insert(pqueue_delayed_grants_t * q, pqueue_delayed_grant_element_t * d) { - return pqueue_tag_insert((pqueue_tag_t*)q, (void*)d); - } - - /** - * @brief Pop the least-tag element from the queue. - * - * @param q The queue. - * @return NULL on error, otherwise the entry - */ - pqueue_delayed_grant_element_t* pqueue_delayed_grants_pop(pqueue_delayed_grants_t * q) { - return (pqueue_delayed_grant_element_t*)pqueue_tag_pop((pqueue_tag_t*)q); - } - - /** - * @brief Return highest-ranking element without removing it. - * - * @param q The queue. - * @return NULL on if the queue is empty, otherwise the delayed grant element. - */ - pqueue_delayed_grant_element_t* pqueue_delayed_grants_peek(pqueue_delayed_grants_t * q) { - return (pqueue_delayed_grant_element_t*)pqueue_tag_peek((pqueue_tag_t*)q); - } - - /** - * @brief Free all memory used by the queue including elements that are marked dynamic. 
- * - * @param q The queue. - */ - void pqueue_delayed_grants_free(pqueue_delayed_grants_t * q) { pqueue_tag_free((pqueue_tag_t*)q); } - - /** - * @brief Remove an item from the delayed grants queue. - * - * @param q The queue. - * @param e The entry to remove. - */ - void pqueue_delayed_grants_remove(pqueue_delayed_grants_t * q, pqueue_delayed_grant_element_t * e) { - pqueue_tag_remove((pqueue_tag_t*)q, (void*)e); - } - - /** - * @brief Return the first item with the specified tag or NULL if there is none. - * @param q The queue. - * @param t The tag. - * @return An entry with the specified tag or NULL if there isn't one. - */ - pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_with_tag(pqueue_delayed_grants_t * q, tag_t t) { - return (pqueue_delayed_grant_element_t*)pqueue_tag_find_with_tag((pqueue_tag_t*)q, t); - } - - // Function that does not in pqueue_tag.c - /** - * @brief Return the first item with the specified federate id or NULL if there is none. - * @param q The queue. - * @param fed_id The federate id. - * @return An entry with the specified federate if or NULL if there isn't one. - */ - - pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_delayed_grants_t * q, uint16_t fed_id) { - pqueue_delayed_grant_element_t* dge; - pqueue_t* _q = (pqueue_t*)q; - if (!q || q->size == 1) - return NULL; - for (int i = 1; i <= q->size; i++) { - dge = (pqueue_delayed_grant_element_t*)q->d[i]; - if (dge->fed_id == fed_id) { - return dge; - } - } + pqueue_delayed_grant_element_t* dge; + pqueue_t* _q = (pqueue_t*)q; + if (!q || q->size == 1) return NULL; - } - - // Utility functions to simplify the call of pqueue_tag routines. - // These functions mainly do the casting. - // FIXME: Should we remove the queue parameter from the functions? - - /** - * @brief Creates a priority queue of delayed grants that is sorted by tags. - * - * @param nbr_delayed_grants The size. - * @return The dynamically allocated queue or NULL. 
- */ - pqueue_delayed_grants_t* pqueue_delayed_grants_init(uint16_t nbr_delayed_grants) { - return (pqueue_delayed_grants_t*)pqueue_tag_init((size_t)nbr_delayed_grants); - } - - /** - * @brief Return the size of the queue. - * - * @param q The queue. - * @return The size. - */ - size_t pqueue_delayed_grants_size(pqueue_delayed_grants_t * q) { return pqueue_tag_size((pqueue_tag_t*)q); } - - /** - * @brief Insert an\ delayed grant element into the queue. - * - * @param q The queue. - * @param e The delayed grant element to insert. - * @return 0 on success - */ - int pqueue_delayed_grants_insert(pqueue_delayed_grants_t * q, pqueue_delayed_grant_element_t * d) { - return pqueue_tag_insert((pqueue_tag_t*)q, (void*)d); - } - - /** - * @brief Pop the least-tag element from the queue. - * - * @param q The queue. - * @return NULL on error, otherwise the entry - */ - pqueue_delayed_grant_element_t* pqueue_delayed_grants_pop(pqueue_delayed_grants_t * q) { - return (pqueue_delayed_grant_element_t*)pqueue_tag_pop((pqueue_tag_t*)q); - } - - /** - * @brief Return highest-ranking element without removing it. - * - * @param q The queue. - * @return NULL on if the queue is empty, otherwise the delayed grant element. - */ - pqueue_delayed_grant_element_t* pqueue_delayed_grants_peek(pqueue_delayed_grants_t * q) { - return (pqueue_delayed_grant_element_t*)pqueue_tag_peek((pqueue_tag_t*)q); - } - - /** - * @brief Free all memory used by the queue including elements that are marked dynamic. - * - * @param q The queue. - */ - void pqueue_delayed_grants_free(pqueue_delayed_grants_t * q) { pqueue_tag_free((pqueue_tag_t*)q); } - - /** - * @brief Remove an item from the delayed grants queue. - * - * @param q The queue. - * @param e The entry to remove. 
- */ - void pqueue_delayed_grants_remove(pqueue_delayed_grants_t * q, pqueue_delayed_grant_element_t * e) { - pqueue_tag_remove((pqueue_tag_t*)q, (void*)e); - } - - // Function that does not exist in pqueue_tag.c - /** - * @brief Return the first item with the specified federate id or NULL if there is none. - * @param q The queue. - * @param fed_id The federate id. - * @return An entry with the specified federate if or NULL if there isn't one. - */ - pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_delayed_grants_t * q, uint16_t fed_id) { - pqueue_delayed_grant_element_t* dge; - if (!q || q->size == 1) - return NULL; - for (int i = 1; i < q->size; i++) { - dge = (pqueue_delayed_grant_element_t*)q->d[i]; - if (dge) { - if (dge->fed_id == fed_id) { - return dge; - } - } + for (int i = 1; i <= q->size; i++) { + dge = (pqueue_delayed_grant_element_t*)q->d[i]; + if (dge->fed_id == fed_id) { + return dge; } - return NULL; } + return NULL; +} - /** - * @brief Insert the delayed grant into the delayed_grants queue and notify. - * - * - * This function assumes the caller holds the rti_mutex. - * @param fed The federate. - * @param tag The tag to grant. - * @param is_provisional State whther the grant is provisional. - */ - static void notify_grant_delayed(federate_info_t * fed, tag_t tag, bool is_provisional) { - // Check wether there is already a pending grant. +/** + * @brief Insert the delayed grant into the delayed_grants queue and notify. + * + * + * This function assumes the caller holds the rti_mutex. + * @param fed The federate. + * @param tag The tag to grant. + * @param is_provisional State whther the grant is provisional. + */ +static void notify_grant_delayed(federate_info_t* fed, tag_t tag, bool is_provisional) { + // Check wether there is already a pending grant. 
+ pqueue_delayed_grant_element_t* dge = + pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, fed->enclave.id); + if (dge == NULL) { pqueue_delayed_grant_element_t* dge = - pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, fed->enclave.id); - if (dge == NULL) { - pqueue_delayed_grant_element_t* dge = - (pqueue_delayed_grant_element_t*)malloc(sizeof(pqueue_delayed_grant_element_t)); - dge->base.is_dynamic = 1; + (pqueue_delayed_grant_element_t*)malloc(sizeof(pqueue_delayed_grant_element_t)); + dge->base.is_dynamic = 1; + dge->base.tag = tag; + dge->fed_id = fed->enclave.id; + dge->is_provisional = is_provisional; + pqueue_delayed_grants_insert(rti_remote->delayed_grants, dge); + LF_PRINT_LOG("RTI: Inserting a delayed grant of " PRINTF_TAG " for federate %d.", dge->base.tag.time - start_time, + dge->base.tag.microstep, dge->fed_id); + lf_cond_signal(&updated_delayed_grants); + } else { + // Note that there should never be more than one pending grant for a federate. + int compare = lf_tag_compare(dge->base.tag, tag); + if (compare > 0) { + // Update the pre-existing grant. dge->base.tag = tag; - dge->fed_id = fed->enclave.id; dge->is_provisional = is_provisional; - pqueue_delayed_grants_insert(rti_remote->delayed_grants, dge); - LF_PRINT_LOG("RTI: Inserting a delayed grant of " PRINTF_TAG " for federate %d.", dge->base.tag.time - start_time, - dge->base.tag.microstep, dge->fed_id); + LF_PRINT_LOG("RTI: Updating a delayed grant of " PRINTF_TAG " for federate %d.", tag.time - start_time, + tag.microstep, dge->fed_id); lf_cond_signal(&updated_delayed_grants); - } else { - // Note that there should never be more than one pending grant for a federate. - int compare = lf_tag_compare(dge->base.tag, tag); - if (compare > 0) { - // Update the pre-existing grant. - dge->base.tag = tag; + } else if (compare == 0) { + if (dge->is_provisional != is_provisional) { + // Update the grant to keep the most recent is_provisional status. 
dge->is_provisional = is_provisional; - LF_PRINT_LOG("RTI: Updating a delayed grant of " PRINTF_TAG " for federate %d.", tag.time - start_time, - tag.microstep, dge->fed_id); - lf_cond_signal(&updated_delayed_grants); - } else if (compare == 0) { - if (dge->is_provisional != is_provisional) { - // Update the grant to keep the most recent is_provisional status. - dge->is_provisional = is_provisional; - LF_PRINT_LOG("RTI: Changing status of a delayed grant of " PRINTF_TAG " for federate %d to provisional: %d.", - dge->base.tag.time - start_time, dge->base.tag.microstep, dge->fed_id, is_provisional); - } + LF_PRINT_LOG("RTI: Changing status of a delayed grant of " PRINTF_TAG " for federate %d to provisional: %d.", + dge->base.tag.time - start_time, dge->base.tag.microstep, dge->fed_id, is_provisional); } } } +} - /** - * Find the number of non connected upstream transients - * @param fed The federate - * @return the number of non connected upstream transients - */ - static int get_num_absent_upstream_transients(federate_info_t * fed) { - int num_absent_upstream_transients = 0; - for (int j = 0; j < fed->enclave.num_upstream; j++) { - federate_info_t* upstream = GET_FED_INFO(fed->enclave.upstream[j]); - // Ignore this enclave if it no longer connected. - if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { - num_absent_upstream_transients++; - } - } - return num_absent_upstream_transients; - } - - /** - * @brief Send MSG_TYPE_UPSTREAM_CONNECTED to the specified `destination` if it is connected to the RTI, - * telling it that the specified `upstream` federate is also now connected. - * - * This function assumes that the mutex lock is already held. - * @param destination The destination federate. - * @param disconnected The connected federate. 
- */ - static void send_upstream_connected_locked(federate_info_t * destination, federate_info_t * connected) { - if (destination->enclave.state == NOT_CONNECTED) { - LF_PRINT_LOG("RTI did not send upstream connected message to federate %d, because it is not connected.", - destination->enclave.id); - return; - } - unsigned char buffer[MSG_TYPE_UPSTREAM_CONNECTED_LENGTH]; - buffer[0] = MSG_TYPE_UPSTREAM_CONNECTED; - encode_uint16(connected->enclave.id, &buffer[1]); - if (write_to_socket_close_on_error(&destination->socket, MSG_TYPE_UPSTREAM_CONNECTED_LENGTH, buffer)) { - lf_print_warning("RTI: Failed to send upstream connected message to federate %d.", destination->enclave.id); +/** + * Find the number of non connected upstream transients + * @param fed The federate + * @return the number of non connected upstream transients + */ +static int get_num_absent_upstream_transients(federate_info_t* fed) { + int num_absent_upstream_transients = 0; + for (int j = 0; j < fed->enclave.num_upstream; j++) { + federate_info_t* upstream = GET_FED_INFO(fed->enclave.upstream[j]); + // Ignore this enclave if it no longer connected. + if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { + num_absent_upstream_transients++; } } + return num_absent_upstream_transients; +} - /** - * @brief Send MSG_TYPE_UPSTREAM_DISCONNECTED to the specified federate. - * - * This function assumes that the mutex lock is already held. - * @param destination The destination federate. - * @param disconnected The disconnected federate. 
- */ - static void send_upstream_disconnected_locked(federate_info_t * destination, federate_info_t * disconnected) { - unsigned char buffer[MSG_TYPE_UPSTREAM_DISCONNECTED_LENGTH]; - buffer[0] = MSG_TYPE_UPSTREAM_DISCONNECTED; - encode_uint16(disconnected->enclave.id, &buffer[1]); - if (write_to_socket_close_on_error(&destination->socket, MSG_TYPE_UPSTREAM_DISCONNECTED_LENGTH, buffer)) { - lf_print_warning("RTI: Failed to send upstream disconnected message to federate %d.", disconnected->enclave.id); - } +/** + * @brief Send MSG_TYPE_UPSTREAM_CONNECTED to the specified `destination` if it is connected to the RTI, + * telling it that the specified `upstream` federate is also now connected. + * + * This function assumes that the mutex lock is already held. + * @param destination The destination federate. + * @param disconnected The connected federate. + */ +static void send_upstream_connected_locked(federate_info_t* destination, federate_info_t* connected) { + if (destination->enclave.state == NOT_CONNECTED) { + LF_PRINT_LOG("RTI did not send upstream connected message to federate %d, because it is not connected.", + destination->enclave.id); + return; } - - /** - * @brief Mark a federate as disconnected and inform downstream federates. - * @param e The enclave corresponding to the disconnected federate. - */ - static void notify_federate_disconnected(scheduling_node_t * e) { - e->state = NOT_CONNECTED; - // Notify downstream federates. Need to hold the mutex lock to do this. - LF_MUTEX_LOCK(&rti_mutex); - for (int j = 0; j < e->num_downstream; j++) { - federate_info_t* downstream = GET_FED_INFO(e->downstream[j]); - // Ignore this enclave if it no longer connected. - if (downstream->enclave.state != NOT_CONNECTED) { - // Notify the downstream enclave. 
- send_upstream_disconnected_locked(downstream, GET_FED_INFO(e->id)); - } - } - LF_MUTEX_UNLOCK(&rti_mutex); + unsigned char buffer[MSG_TYPE_UPSTREAM_CONNECTED_LENGTH]; + buffer[0] = MSG_TYPE_UPSTREAM_CONNECTED; + encode_uint16(connected->enclave.id, &buffer[1]); + if (write_to_socket_close_on_error(&destination->socket, MSG_TYPE_UPSTREAM_CONNECTED_LENGTH, buffer)) { + lf_print_warning("RTI: Failed to send upstream connected message to federate %d.", destination->enclave.id); } +} - /** - * Notify a tag advance grant (TAG) message to the specified federate immediately. - * - * This function will keep a record of this TAG in the enclave's last_granted - * field. - * - * @param e The enclave. - * @param tag The tag to grant. - */ - static void notify_tag_advance_grant_immediate(scheduling_node_t * e, tag_t tag) { - size_t message_length = 1 + sizeof(int64_t) + sizeof(uint32_t); - unsigned char buffer[message_length]; - buffer[0] = MSG_TYPE_TAG_ADVANCE_GRANT; - encode_int64(tag.time, &(buffer[1])); - encode_int32((int32_t)tag.microstep, &(buffer[1 + sizeof(int64_t)])); - - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_TAG, e->id, &tag); - } - // This function is called in notify_advance_grant_if_safe(), which is a long - // function. During this call, the socket might close, causing the following write_to_socket - // to fail. Consider a failure here a soft failure and update the federate's status. - if (write_to_socket(((federate_info_t*)e)->socket, message_length, buffer)) { - lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); - notify_federate_disconnected(e); - } else { - e->last_granted = tag; - LF_PRINT_LOG("RTI sent to federate %d the tag advance grant (TAG) " PRINTF_TAG ".", e->id, tag.time - start_time, - tag.microstep); - } +/** + * @brief Send MSG_TYPE_UPSTREAM_DISCONNECTED to the specified federate. + * + * This function assumes that the mutex lock is already held. 
+ * @param destination The destination federate. + * @param disconnected The disconnected federate. + */ +static void send_upstream_disconnected_locked(federate_info_t* destination, federate_info_t* disconnected) { + unsigned char buffer[MSG_TYPE_UPSTREAM_DISCONNECTED_LENGTH]; + buffer[0] = MSG_TYPE_UPSTREAM_DISCONNECTED; + encode_uint16(disconnected->enclave.id, &buffer[1]); + if (write_to_socket_close_on_error(&destination->socket, MSG_TYPE_UPSTREAM_DISCONNECTED_LENGTH, buffer)) { + lf_print_warning("RTI: Failed to send upstream disconnected message to federate %d.", disconnected->enclave.id); } +} - /** - * Notify a tag advance grant (TAG) message to the specified federate after - * the physical time reaches the tag. A thread is created to this end. - * - * If a provisionl tag advance grant is pending, cancel it. If there is another - * pending tag advance grant, do not proceed with the thread creation. - * - * @param fed The federate. - * @param tag The tag to grant. - */ - static void notify_tag_advance_grant_delayed(federate_info_t * fed, tag_t tag) { - // Check wether there is already a pending grant - // And check the pending provisional grant as well - lf_mutex_lock(&rti_mutex); - if (lf_tag_compare(fed->pending_grant, NEVER_TAG) == 0) { - // If a tag is issued, then stop any possible provisional tag grant - fed->pending_grant = tag; - fed->pending_provisional_grant = NEVER_TAG; - lf_thread_create(&(fed->pending_grant_thread_id), pending_grant_thread, fed); - } else { - // If there is already a pending tag grant, then let it be sent first - // FIXME: Is this correct? +/** + * @brief Mark a federate as disconnected and inform downstream federates. + * @param e The enclave corresponding to the disconnected federate. + */ +static void notify_federate_disconnected(scheduling_node_t* e) { + e->state = NOT_CONNECTED; + // Notify downstream federates. Need to hold the mutex lock to do this. 
+ LF_MUTEX_LOCK(&rti_mutex); + for (int j = 0; j < e->num_downstream; j++) { + federate_info_t* downstream = GET_FED_INFO(e->downstream[j]); + // Ignore this enclave if it no longer connected. + if (downstream->enclave.state != NOT_CONNECTED) { + // Notify the downstream enclave. + send_upstream_disconnected_locked(downstream, GET_FED_INFO(e->id)); } - lf_mutex_unlock(&rti_mutex); } + LF_MUTEX_UNLOCK(&rti_mutex); +} - void notify_tag_advance_grant(scheduling_node_t * e, tag_t tag) { - if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || - lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { - return; - } - // Need to make sure that the destination federate's thread has already - // sent the starting MSG_TYPE_TIMESTAMP message. - while (e->state == PENDING) { - // Need to wait here. - lf_cond_wait(&sent_start_time); - } +/** + * Notify a tag advance grant (TAG) message to the specified federate immediately. + * + * This function will keep a record of this TAG in the enclave's last_granted + * field. + * + * @param e The enclave. + * @param tag The tag to grant. + */ +static void notify_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) { + size_t message_length = 1 + sizeof(int64_t) + sizeof(uint32_t); + unsigned char buffer[message_length]; + buffer[0] = MSG_TYPE_TAG_ADVANCE_GRANT; + encode_int64(tag.time, &(buffer[1])); + encode_int32((int32_t)tag.microstep, &(buffer[1 + sizeof(int64_t)])); + + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_TAG, e->id, &tag); + } + // This function is called in notify_advance_grant_if_safe(), which is a long + // function. During this call, the socket might close, causing the following write_to_socket + // to fail. Consider a failure here a soft failure and update the federate's status. 
+ if (write_to_socket(((federate_info_t*)e)->socket, message_length, buffer)) { + lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); + notify_federate_disconnected(e); + } else { + e->last_granted = tag; + LF_PRINT_LOG("RTI sent to federate %d the tag advance grant (TAG) " PRINTF_TAG ".", e->id, tag.time - start_time, + tag.microstep); + } +} - // Check if sending the tag advance grant needs to be delayed or not. - // Delay is needed when a federate has at least one absent upstream transient. +void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { + if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || + lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { + return; + } + // Need to make sure that the destination federate's thread has already + // sent the starting MSG_TYPE_TIMESTAMP message. + while (e->state == PENDING) { + // Need to wait here. + lf_cond_wait(&sent_start_time); + } - // Check if sending the tag advance grant needs to be delayed or not - // Delay is needed when a federate has, at least one, absent upstream transient - federate_info_t* fed = GET_FED_INFO(e->id); - if (!fed->has_upstream_transient_federates) { - notify_tag_advance_grant_immediate(e, tag); + // Check if sending the tag advance grant needs to be delayed or not + // Delay is needed when a federate has, at least one, absent upstream transient + federate_info_t* fed = GET_FED_INFO(e->id); + if (!fed->has_upstream_transient_federates) { + notify_tag_advance_grant_immediate(e, tag); + } else { + if (get_num_absent_upstream_transients(fed) > 0) { + notify_grant_delayed(fed, tag, false); } else { - if (get_num_absent_upstream_transients(fed) > 0) { - notify_grant_delayed(fed, tag, false); - } else { - notify_tag_advance_grant_immediate(e, tag); - } + notify_tag_advance_grant_immediate(e, tag); } } +} - /** - * Notify a provisional tag advance grant (PTAG) message to the specified federate - * immediately. 
- * - * This function will keep a record of this TAG in the enclave's last_provisionally_granted - * field. - * - * @param e The scheduling node. - * @param tag The tag to grant. - */ - void notify_provisional_tag_advance_grant_immediate(scheduling_node_t * e, tag_t tag) { - size_t message_length = 1 + sizeof(int64_t) + sizeof(uint32_t); - unsigned char buffer[message_length]; - buffer[0] = MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT; - encode_int64(tag.time, &(buffer[1])); - encode_int32((int32_t)tag.microstep, &(buffer[1 + sizeof(int64_t)])); - - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_PTAG, e->id, &tag); - } - // This function is called in notify_advance_grant_if_safe(), which is a long - // function. During this call, the socket might close, causing the following write_to_socket - // to fail. Consider a failure here a soft failure and update the federate's status. - if (write_to_socket(((federate_info_t*)e)->socket, message_length, buffer)) { - lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); - notify_federate_disconnected(e); - } else { - e->last_provisionally_granted = tag; - LF_PRINT_LOG("RTI sent to federate %d the Provisional Tag Advance Grant (PTAG) " PRINTF_TAG ".", e->id, - tag.time - start_time, tag.microstep); - - // Send PTAG to all upstream federates, if they have not had - // a later or equal PTAG or TAG sent previously and if their transitive - // NET is greater than or equal to the tag. - // This is needed to stimulate absent messages from upstream and break deadlocks. - // The scenario this deals with is illustrated in `test/C/src/federated/FeedbackDelay2.lf` - // and `test/C/src/federated/FeedbackDelay4.lf`. - // Note that this is transitive. - // NOTE: This is not needed for enclaves because zero-delay loops are prohibited. - // It's only needed for federates, which is why this is implemented here. 
- for (int j = 0; j < e->num_upstream; j++) { - scheduling_node_t* upstream = rti_remote->base.scheduling_nodes[e->upstream[j]]; - - // Ignore this federate if it has resigned. - if (upstream->state == NOT_CONNECTED) - continue; - - tag_t earliest = earliest_future_incoming_message_tag(upstream); - tag_t strict_earliest = eimt_strict(upstream); // Non-ZDC version. - - // If these tags are equal, then a TAG or PTAG should have already been granted, - // in which case, another will not be sent. But it may not have been already granted. - if (lf_tag_compare(earliest, tag) > 0) { - notify_tag_advance_grant(upstream, tag); - } else if (lf_tag_compare(earliest, tag) == 0 && lf_tag_compare(strict_earliest, tag) > 0) { - notify_provisional_tag_advance_grant(upstream, tag); - } +/** + * Notify a provisional tag advance grant (PTAG) message to the specified federate + * immediately. + * + * This function will keep a record of this TAG in the enclave's last_provisionally_granted + * field. + * + * @param e The scheduling node. + * @param tag The tag to grant. + */ +void notify_provisional_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) { + size_t message_length = 1 + sizeof(int64_t) + sizeof(uint32_t); + unsigned char buffer[message_length]; + buffer[0] = MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT; + encode_int64(tag.time, &(buffer[1])); + encode_int32((int32_t)tag.microstep, &(buffer[1 + sizeof(int64_t)])); + + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_PTAG, e->id, &tag); + } + // This function is called in notify_advance_grant_if_safe(), which is a long + // function. During this call, the socket might close, causing the following write_to_socket + // to fail. Consider a failure here a soft failure and update the federate's status. 
+ if (write_to_socket(((federate_info_t*)e)->socket, message_length, buffer)) { + lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); + notify_federate_disconnected(e); + } else { + e->last_provisionally_granted = tag; + LF_PRINT_LOG("RTI sent to federate %d the Provisional Tag Advance Grant (PTAG) " PRINTF_TAG ".", e->id, + tag.time - start_time, tag.microstep); + + // Send PTAG to all upstream federates, if they have not had + // a later or equal PTAG or TAG sent previously and if their transitive + // NET is greater than or equal to the tag. + // This is needed to stimulate absent messages from upstream and break deadlocks. + // The scenario this deals with is illustrated in `test/C/src/federated/FeedbackDelay2.lf` + // and `test/C/src/federated/FeedbackDelay4.lf`. + // Note that this is transitive. + // NOTE: This is not needed for enclaves because zero-delay loops are prohibited. + // It's only needed for federates, which is why this is implemented here. + for (int j = 0; j < e->num_upstream; j++) { + scheduling_node_t* upstream = rti_remote->base.scheduling_nodes[e->upstream[j]]; + + // Ignore this federate if it has resigned. + if (upstream->state == NOT_CONNECTED) + continue; + + tag_t earliest = earliest_future_incoming_message_tag(upstream); + tag_t strict_earliest = eimt_strict(upstream); // Non-ZDC version. + + // If these tags are equal, then a TAG or PTAG should have already been granted, + // in which case, another will not be sent. But it may not have been already granted. 
+ if (lf_tag_compare(earliest, tag) > 0) { + notify_tag_advance_grant(upstream, tag); + } else if (lf_tag_compare(earliest, tag) == 0 && lf_tag_compare(strict_earliest, tag) > 0) { + notify_provisional_tag_advance_grant(upstream, tag); } } } +} - void notify_provisional_tag_advance_grant(scheduling_node_t * e, tag_t tag) { - if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || - lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { - return; - } - // Need to make sure that the destination federate's thread has already - // sent the starting MSG_TYPE_TIMESTAMP message. - while (e->state == PENDING) { - // Need to wait here. - lf_cond_wait(&sent_start_time); - } +void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { + if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || + lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { + return; + } + // Need to make sure that the destination federate's thread has already + // sent the starting MSG_TYPE_TIMESTAMP message. + while (e->state == PENDING) { + // Need to wait here. 
+ lf_cond_wait(&sent_start_time); + } - // Check if sending the tag advance grant needs to be delayed or not - // Delay is needed when a federate has, at least one, absent upstream transient - federate_info_t* fed = GET_FED_INFO(e->id); - if (!fed->has_upstream_transient_federates) { - notify_provisional_tag_advance_grant_immediate(e, tag); + // Check if sending the tag advance grant needs to be delayed or not + // Delay is needed when a federate has, at least one, absent upstream transient + federate_info_t* fed = GET_FED_INFO(e->id); + if (!fed->has_upstream_transient_federates) { + notify_provisional_tag_advance_grant_immediate(e, tag); + } else { + if (get_num_absent_upstream_transients(fed) > 0) { + notify_grant_delayed(fed, tag, true); } else { - if (get_num_absent_upstream_transients(fed) > 0) { - notify_provisional_tag_advance_grant_delayed(fed, tag); - } else { - notify_provisional_tag_advance_grant_immediate(e, tag); - } - federate_info_t* fed = GET_FED_INFO(e->id); - if (!fed->has_upstream_transient_federates) { - notify_provisional_tag_advance_grant_immediate(e, tag); - } else { - if (get_num_absent_upstream_transients(fed) > 0) { - notify_grant_delayed(fed, tag, true); - } else { - notify_provisional_tag_advance_grant_immediate(e, tag); - } - } + notify_provisional_tag_advance_grant_immediate(e, tag); } + } +} - void update_federate_next_event_tag_locked(uint16_t federate_id, tag_t next_event_tag) { - federate_info_t* fed = GET_FED_INFO(federate_id); - tag_t min_in_transit_tag = pqueue_tag_peek_tag(fed->in_transit_message_tags); - if (lf_tag_compare(min_in_transit_tag, next_event_tag) < 0) { - next_event_tag = min_in_transit_tag; - } - update_scheduling_node_next_event_tag_locked(&(fed->enclave), next_event_tag); - } +void update_federate_next_event_tag_locked(uint16_t federate_id, tag_t next_event_tag) { + federate_info_t* fed = GET_FED_INFO(federate_id); + tag_t min_in_transit_tag = pqueue_tag_peek_tag(fed->in_transit_message_tags); + if 
(lf_tag_compare(min_in_transit_tag, next_event_tag) < 0) { + next_event_tag = min_in_transit_tag; + } + update_scheduling_node_next_event_tag_locked(&(fed->enclave), next_event_tag); +} - void handle_port_absent_message(federate_info_t * sending_federate, unsigned char* buffer) { - size_t message_size = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int64_t) + sizeof(uint32_t); +void handle_port_absent_message(federate_info_t* sending_federate, unsigned char* buffer) { + size_t message_size = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int64_t) + sizeof(uint32_t); - read_from_socket_fail_on_error(&sending_federate->socket, message_size, &(buffer[1]), NULL, - " RTI failed to read port absent message from federate %u.", - sending_federate->enclave.id); + read_from_socket_fail_on_error(&sending_federate->socket, message_size, &(buffer[1]), NULL, + " RTI failed to read port absent message from federate %u.", + sending_federate->enclave.id); - uint16_t reactor_port_id = extract_uint16(&(buffer[1])); - uint16_t federate_id = extract_uint16(&(buffer[1 + sizeof(uint16_t)])); - tag_t tag = extract_tag(&(buffer[1 + 2 * sizeof(uint16_t)])); + uint16_t reactor_port_id = extract_uint16(&(buffer[1])); + uint16_t federate_id = extract_uint16(&(buffer[1 + sizeof(uint16_t)])); + tag_t tag = extract_tag(&(buffer[1 + 2 * sizeof(uint16_t)])); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_PORT_ABS, sending_federate->enclave.id, &tag); - } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_PORT_ABS, sending_federate->enclave.id, &tag); + } - // Need to acquire the mutex lock to ensure that the thread handling - // messages coming from the socket connected to the destination does not - // issue a TAG before this message has been forwarded. 
- LF_MUTEX_LOCK(&rti_mutex); + // Need to acquire the mutex lock to ensure that the thread handling + // messages coming from the socket connected to the destination does not + // issue a TAG before this message has been forwarded. + LF_MUTEX_LOCK(&rti_mutex); - // If the destination federate is no longer connected, issue a warning - // and return. - federate_info_t* fed = GET_FED_INFO(federate_id); - if (fed->enclave.state == NOT_CONNECTED) { - LF_MUTEX_UNLOCK(&rti_mutex); - lf_print_warning("RTI: Destination federate %d is no longer connected. Dropping message.", federate_id); - LF_PRINT_LOG("Fed status: next_event " PRINTF_TAG ", " - "completed " PRINTF_TAG ", " - "last_granted " PRINTF_TAG ", " - "last_provisionally_granted " PRINTF_TAG ".", - fed->enclave.next_event.time - start_time, fed->enclave.next_event.microstep, - fed->enclave.completed.time - start_time, fed->enclave.completed.microstep, - fed->enclave.last_granted.time - start_time, fed->enclave.last_granted.microstep, - fed->enclave.last_provisionally_granted.time - start_time, - fed->enclave.last_provisionally_granted.microstep); - return; - } + // If the destination federate is no longer connected, issue a warning + // and return. + federate_info_t* fed = GET_FED_INFO(federate_id); + if (fed->enclave.state == NOT_CONNECTED) { + LF_MUTEX_UNLOCK(&rti_mutex); + lf_print_warning("RTI: Destination federate %d is no longer connected. 
Dropping message.", federate_id); + LF_PRINT_LOG("Fed status: next_event " PRINTF_TAG ", " + "completed " PRINTF_TAG ", " + "last_granted " PRINTF_TAG ", " + "last_provisionally_granted " PRINTF_TAG ".", + fed->enclave.next_event.time - start_time, fed->enclave.next_event.microstep, + fed->enclave.completed.time - start_time, fed->enclave.completed.microstep, + fed->enclave.last_granted.time - start_time, fed->enclave.last_granted.microstep, + fed->enclave.last_provisionally_granted.time - start_time, + fed->enclave.last_provisionally_granted.microstep); + return; + } - LF_PRINT_LOG("RTI forwarding port absent message for port %u to federate %u.", reactor_port_id, federate_id); + LF_PRINT_LOG("RTI forwarding port absent message for port %u to federate %u.", reactor_port_id, federate_id); - // Need to make sure that the destination federate's thread has already - // sent the starting MSG_TYPE_TIMESTAMP message. - while (fed->enclave.state == PENDING) { - // Need to wait here. - lf_cond_wait(&sent_start_time); - } + // Need to make sure that the destination federate's thread has already + // sent the starting MSG_TYPE_TIMESTAMP message. + while (fed->enclave.state == PENDING) { + // Need to wait here. + lf_cond_wait(&sent_start_time); + } - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_PORT_ABS, federate_id, &tag); - } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_PORT_ABS, federate_id, &tag); + } - // Forward the message. - write_to_socket_fail_on_error(&fed->socket, message_size + 1, buffer, &rti_mutex, - "RTI failed to forward message to federate %d.", federate_id); + // Forward the message. 
+ write_to_socket_fail_on_error(&fed->socket, message_size + 1, buffer, &rti_mutex, + "RTI failed to forward message to federate %d.", federate_id); - LF_MUTEX_UNLOCK(&rti_mutex); - } + LF_MUTEX_UNLOCK(&rti_mutex); +} - void handle_timed_message(federate_info_t * sending_federate, unsigned char* buffer) { - size_t header_size = - 1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint32_t) + sizeof(int64_t) + sizeof(uint32_t); - // Read the header, minus the first byte which has already been read. - read_from_socket_fail_on_error(&sending_federate->socket, header_size - 1, &(buffer[1]), NULL, - "RTI failed to read the timed message header from remote federate."); - // Extract the header information. of the sender - uint16_t reactor_port_id; - uint16_t federate_id; - size_t length; - tag_t intended_tag; - // Extract information from the header. - extract_timed_header(&(buffer[1]), &reactor_port_id, &federate_id, &length, &intended_tag); - - size_t total_bytes_to_read = length + header_size; - size_t bytes_to_read = length; - - if (FED_COM_BUFFER_SIZE < header_size + 1) { - lf_print_error_and_exit("Buffer size (%d) is not large enough to " - "read the header plus one byte.", - FED_COM_BUFFER_SIZE); - } +void handle_timed_message(federate_info_t* sending_federate, unsigned char* buffer) { + size_t header_size = 1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint32_t) + sizeof(int64_t) + sizeof(uint32_t); + // Read the header, minus the first byte which has already been read. + read_from_socket_fail_on_error(&sending_federate->socket, header_size - 1, &(buffer[1]), NULL, + "RTI failed to read the timed message header from remote federate."); + // Extract the header information. of the sender + uint16_t reactor_port_id; + uint16_t federate_id; + size_t length; + tag_t intended_tag; + // Extract information from the header. 
+ extract_timed_header(&(buffer[1]), &reactor_port_id, &federate_id, &length, &intended_tag); + + size_t total_bytes_to_read = length + header_size; + size_t bytes_to_read = length; + + if (FED_COM_BUFFER_SIZE < header_size + 1) { + lf_print_error_and_exit("Buffer size (%d) is not large enough to " + "read the header plus one byte.", + FED_COM_BUFFER_SIZE); + } - // Cut up the payload in chunks. - if (bytes_to_read > FED_COM_BUFFER_SIZE - header_size) { - bytes_to_read = FED_COM_BUFFER_SIZE - header_size; - } + // Cut up the payload in chunks. + if (bytes_to_read > FED_COM_BUFFER_SIZE - header_size) { + bytes_to_read = FED_COM_BUFFER_SIZE - header_size; + } - LF_PRINT_LOG("RTI received message from federate %d for federate %u port %u with intended tag " PRINTF_TAG - ". Forwarding.", - sending_federate->enclave.id, federate_id, reactor_port_id, intended_tag.time - lf_time_start(), - intended_tag.microstep); + LF_PRINT_LOG("RTI received message from federate %d for federate %u port %u with intended tag " PRINTF_TAG + ". Forwarding.", + sending_federate->enclave.id, federate_id, reactor_port_id, intended_tag.time - lf_time_start(), + intended_tag.microstep); - read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, &(buffer[header_size]), NULL, - "RTI failed to read timed message from federate %d.", federate_id); - size_t bytes_read = bytes_to_read + header_size; - // Following only works for string messages. - // LF_PRINT_DEBUG("Message received by RTI: %s.", buffer + header_size); + read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, &(buffer[header_size]), NULL, + "RTI failed to read timed message from federate %d.", federate_id); + size_t bytes_read = bytes_to_read + header_size; + // Following only works for string messages. 
+ // LF_PRINT_DEBUG("Message received by RTI: %s.", buffer + header_size); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_TAGGED_MSG, sending_federate->enclave.id, &intended_tag); - } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_TAGGED_MSG, sending_federate->enclave.id, &intended_tag); + } - // Need to acquire the mutex lock to ensure that the thread handling - // messages coming from the socket connected to the destination does not - // issue a TAG before this message has been forwarded. - LF_MUTEX_LOCK(&rti_mutex); + // Need to acquire the mutex lock to ensure that the thread handling + // messages coming from the socket connected to the destination does not + // issue a TAG before this message has been forwarded. + LF_MUTEX_LOCK(&rti_mutex); + + // If the destination federate is no longer connected, or it is a transient that has not started executing yet + // (the delayed intended tag is less than the effective start tag of the destination), issue a warning, remove the + // message from the socket, and return. + federate_info_t* fed = GET_FED_INFO(federate_id); + interval_t delay = NEVER; + for (int i = 0; i < fed->enclave.num_upstream; i++) { + if (fed->enclave.upstream[i] == sending_federate->enclave.id) { + delay = fed->enclave.upstream_delay[i]; + break; + } + } + if (fed->enclave.state == NOT_CONNECTED || + lf_tag_compare(lf_delay_tag(intended_tag, delay), fed->effective_start_tag) < 0) { + lf_print_warning("RTI: Destination federate %d is not connected at logical time (" PRINTF_TAG + "). Dropping message.", + federate_id, intended_tag.time - start_time, intended_tag.microstep); + // If the message was larger than the buffer, we must empty out the remainder also. 
+ size_t total_bytes_read = bytes_read; + while (total_bytes_read < total_bytes_to_read) { + bytes_to_read = total_bytes_to_read - total_bytes_read; + if (bytes_to_read > FED_COM_BUFFER_SIZE) { + bytes_to_read = FED_COM_BUFFER_SIZE; + } + read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, + "RTI failed to clear message chunks."); + total_bytes_read += bytes_to_read; + } + LF_MUTEX_UNLOCK(&rti_mutex); + return; + } - // If the destination federate is no longer connected, or it is a transient that has not started executing yet - // (the delayed intended tag is less than the effective start tag of the destination), issue a warning, remove the - // message from the socket, and return. - federate_info_t* fed = GET_FED_INFO(federate_id); - interval_t delay = NEVER; - for (int i = 0; i < fed->enclave.num_upstream; i++) { - if (fed->enclave.upstream[i] == sending_federate->enclave.id) { - delay = fed->enclave.upstream_delay[i]; - break; - } - } - if (fed->enclave.state == NOT_CONNECTED || - lf_tag_compare(lf_delay_tag(intended_tag, delay), fed->effective_start_tag) < 0) { - lf_print_warning("RTI: Destination federate %d is not connected at logical time (" PRINTF_TAG - "). Dropping message.", - federate_id, intended_tag.time - start_time, intended_tag.microstep); - // If the message was larger than the buffer, we must empty out the remainder also. 
- size_t total_bytes_read = bytes_read; - while (total_bytes_read < total_bytes_to_read) { - bytes_to_read = total_bytes_to_read - total_bytes_read; - if (bytes_to_read > FED_COM_BUFFER_SIZE) { - bytes_to_read = FED_COM_BUFFER_SIZE; - } - read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, - "RTI failed to clear message chunks."); - total_bytes_read += bytes_to_read; - } - LF_MUTEX_UNLOCK(&rti_mutex); - return; - } + LF_PRINT_DEBUG("RTI forwarding message to port %d of federate %hu of length %zu.", reactor_port_id, federate_id, + length); - LF_PRINT_DEBUG("RTI forwarding message to port %d of federate %hu of length %zu.", reactor_port_id, federate_id, - length); + // Need to make sure that the destination federate's thread has already + // sent the starting MSG_TYPE_TIMESTAMP message. + while (fed->enclave.state == PENDING) { + // Need to wait here. + lf_cond_wait(&sent_start_time); + } - // Need to make sure that the destination federate's thread has already - // sent the starting MSG_TYPE_TIMESTAMP message. - while (fed->enclave.state == PENDING) { - // Need to wait here. - lf_cond_wait(&sent_start_time); - } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_TAGGED_MSG, federate_id, &intended_tag); + } - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_TAGGED_MSG, federate_id, &intended_tag); - } + write_to_socket_fail_on_error(&fed->socket, bytes_read, buffer, &rti_mutex, + "RTI failed to forward message to federate %d.", federate_id); + + // The message length may be longer than the buffer, + // in which case we have to handle it in chunks. 
+ size_t total_bytes_read = bytes_read; + while (total_bytes_read < total_bytes_to_read) { + LF_PRINT_DEBUG("Forwarding message in chunks."); + bytes_to_read = total_bytes_to_read - total_bytes_read; + if (bytes_to_read > FED_COM_BUFFER_SIZE) { + bytes_to_read = FED_COM_BUFFER_SIZE; + } + read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, + "RTI failed to read message chunks."); + total_bytes_read += bytes_to_read; + + // FIXME: a mutex needs to be held for this so that other threads + // do not write to destination_socket and cause interleaving. However, + // holding the rti_mutex might be very expensive. Instead, each outgoing + // socket should probably have its own mutex. + write_to_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, &rti_mutex, + "RTI failed to send message chunks."); + } - write_to_socket_fail_on_error(&fed->socket, bytes_read, buffer, &rti_mutex, - "RTI failed to forward message to federate %d.", federate_id); - - // The message length may be longer than the buffer, - // in which case we have to handle it in chunks. - size_t total_bytes_read = bytes_read; - while (total_bytes_read < total_bytes_to_read) { - LF_PRINT_DEBUG("Forwarding message in chunks."); - bytes_to_read = total_bytes_to_read - total_bytes_read; - if (bytes_to_read > FED_COM_BUFFER_SIZE) { - bytes_to_read = FED_COM_BUFFER_SIZE; - } - read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, - "RTI failed to read message chunks."); - total_bytes_read += bytes_to_read; - - // FIXME: a mutex needs to be held for this so that other threads - // do not write to destination_socket and cause interleaving. However, - // holding the rti_mutex might be very expensive. Instead, each outgoing - // socket should probably have its own mutex. 
- write_to_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, &rti_mutex, - "RTI failed to send message chunks."); - } + // Record this in-transit message in federate's in-transit message queue. + if (lf_tag_compare(fed->enclave.completed, intended_tag) < 0) { + // Add a record of this message to the list of in-transit messages to this federate. + pqueue_tag_insert_if_no_match(fed->in_transit_message_tags, intended_tag); + LF_PRINT_DEBUG("RTI: Adding a message with tag " PRINTF_TAG " to the list of in-transit messages for federate %d.", + intended_tag.time - lf_time_start(), intended_tag.microstep, federate_id); + } else { + lf_print_error("RTI: Federate %d has already completed tag " PRINTF_TAG + ", but there is an in-transit message with tag " PRINTF_TAG " from federate %hu. " + "This is going to cause an STP violation under centralized coordination.", + federate_id, fed->enclave.completed.time - lf_time_start(), fed->enclave.completed.microstep, + intended_tag.time - lf_time_start(), intended_tag.microstep, sending_federate->enclave.id); + // FIXME: Drop the federate? + } - // Record this in-transit message in federate's in-transit message queue. - if (lf_tag_compare(fed->enclave.completed, intended_tag) < 0) { - // Add a record of this message to the list of in-transit messages to this federate. - pqueue_tag_insert_if_no_match(fed->in_transit_message_tags, intended_tag); - LF_PRINT_DEBUG("RTI: Adding a message with tag " PRINTF_TAG - " to the list of in-transit messages for federate %d.", - intended_tag.time - lf_time_start(), intended_tag.microstep, federate_id); - } else { - lf_print_error("RTI: Federate %d has already completed tag " PRINTF_TAG - ", but there is an in-transit message with tag " PRINTF_TAG " from federate %hu. 
" - "This is going to cause an STP violation under centralized coordination.", - federate_id, fed->enclave.completed.time - lf_time_start(), fed->enclave.completed.microstep, - intended_tag.time - lf_time_start(), intended_tag.microstep, sending_federate->enclave.id); - // FIXME: Drop the federate? - } + // If the message tag is less than the most recently received NET from the federate, + // then update the federate's next event tag to match the message tag. + if (lf_tag_compare(intended_tag, fed->enclave.next_event) < 0) { + update_federate_next_event_tag_locked(federate_id, intended_tag); + } - // If the message tag is less than the most recently received NET from the federate, - // then update the federate's next event tag to match the message tag. - if (lf_tag_compare(intended_tag, fed->enclave.next_event) < 0) { - update_federate_next_event_tag_locked(federate_id, intended_tag); - } + LF_MUTEX_UNLOCK(&rti_mutex); +} - LF_MUTEX_UNLOCK(&rti_mutex); - } +void handle_latest_tag_confirmed(federate_info_t* fed) { + unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; + read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, + "RTI failed to read the content of the logical tag complete from federate %d.", + fed->enclave.id); + tag_t completed = extract_tag(buffer); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_LTC, fed->enclave.id, &completed); + } + _logical_tag_complete(&(fed->enclave), completed); - void handle_latest_tag_confirmed(federate_info_t * fed) { - unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; - read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, - "RTI failed to read the content of the logical tag complete from federate %d.", - fed->enclave.id); - tag_t completed = extract_tag(buffer); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_LTC, fed->enclave.id, &completed); - } - 
_logical_tag_complete(&(fed->enclave), completed); + // FIXME: Should this function be in the enclave version? + LF_MUTEX_LOCK(&rti_mutex); + // See if we can remove any of the recorded in-transit messages for this. + pqueue_tag_remove_up_to(fed->in_transit_message_tags, completed); + LF_MUTEX_UNLOCK(&rti_mutex); +} - // FIXME: Should this function be in the enclave version? - LF_MUTEX_LOCK(&rti_mutex); - // See if we can remove any of the recorded in-transit messages for this. - pqueue_tag_remove_up_to(fed->in_transit_message_tags, completed); - LF_MUTEX_UNLOCK(&rti_mutex); - } +void handle_next_event_tag(federate_info_t* fed) { + unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; + read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, + "RTI failed to read the content of the next event tag from federate %d.", + fed->enclave.id); - void handle_next_event_tag(federate_info_t * fed) { - unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; - read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, - "RTI failed to read the content of the next event tag from federate %d.", - fed->enclave.id); + // Acquire a mutex lock to ensure that this state does not change while a + // message is in transport or being used to determine a TAG. + LF_MUTEX_LOCK(&rti_mutex); // FIXME: Instead of using a mutex, it might be more efficient to use a + // select() mechanism to read and process federates' buffers in an orderly fashion. - // Acquire a mutex lock to ensure that this state does not change while a - // message is in transport or being used to determine a TAG. - LF_MUTEX_LOCK(&rti_mutex); // FIXME: Instead of using a mutex, it might be more efficient to use a - // select() mechanism to read and process federates' buffers in an orderly fashion. 
+ tag_t intended_tag = extract_tag(buffer); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_NET, fed->enclave.id, &intended_tag); + } + LF_PRINT_LOG("RTI received from federate %d the Next Event Tag (NET) " PRINTF_TAG, fed->enclave.id, + intended_tag.time - start_time, intended_tag.microstep); + update_federate_next_event_tag_locked(fed->enclave.id, intended_tag); + LF_MUTEX_UNLOCK(&rti_mutex); +} - tag_t intended_tag = extract_tag(buffer); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_NET, fed->enclave.id, &intended_tag); - } - LF_PRINT_LOG("RTI received from federate %d the Next Event Tag (NET) " PRINTF_TAG, fed->enclave.id, - intended_tag.time - start_time, intended_tag.microstep); - update_federate_next_event_tag_locked(fed->enclave.id, intended_tag); - LF_MUTEX_UNLOCK(&rti_mutex); - } +/////////////////// STOP functions //////////////////// - /////////////////// STOP functions //////////////////// - - /** - * Boolean used to prevent the RTI from sending the - * MSG_TYPE_STOP_GRANTED message multiple times. - */ - bool stop_granted_already_sent_to_federates = false; - - /** - * Once the RTI has seen proposed tags from all connected federates, - * it will broadcast a MSG_TYPE_STOP_GRANTED carrying the _RTI.max_stop_tag. - * This function also checks the most recently received NET from - * each federate and resets that be no greater than the _RTI.max_stop_tag. - * - * This function assumes the caller holds the rti_mutex lock. - */ - static void broadcast_stop_time_to_federates_locked() { - if (stop_granted_already_sent_to_federates == true) { - return; - } - stop_granted_already_sent_to_federates = true; +/** + * Boolean used to prevent the RTI from sending the + * MSG_TYPE_STOP_GRANTED message multiple times. 
+ */ +bool stop_granted_already_sent_to_federates = false; - // Reply with a stop granted to all federates - unsigned char outgoing_buffer[MSG_TYPE_STOP_GRANTED_LENGTH]; - ENCODE_STOP_GRANTED(outgoing_buffer, rti_remote->base.max_stop_tag.time, rti_remote->base.max_stop_tag.microstep); +/** + * Once the RTI has seen proposed tags from all connected federates, + * it will broadcast a MSG_TYPE_STOP_GRANTED carrying the _RTI.max_stop_tag. + * This function also checks the most recently received NET from + * each federate and resets that be no greater than the _RTI.max_stop_tag. + * + * This function assumes the caller holds the rti_mutex lock. + */ +static void broadcast_stop_time_to_federates_locked() { + if (stop_granted_already_sent_to_federates == true) { + return; + } + stop_granted_already_sent_to_federates = true; - // Iterate over federates and send each the message. - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (fed->enclave.state == NOT_CONNECTED) { - continue; - } - if (lf_tag_compare(fed->enclave.next_event, rti_remote->base.max_stop_tag) >= 0) { - // Need the next_event to be no greater than the stop tag. 
- fed->enclave.next_event = rti_remote->base.max_stop_tag; - } - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_STOP_GRN, fed->enclave.id, &rti_remote->base.max_stop_tag); - } - write_to_socket_fail_on_error(&fed->socket, MSG_TYPE_STOP_GRANTED_LENGTH, outgoing_buffer, &rti_mutex, - "RTI failed to send MSG_TYPE_STOP_GRANTED message to federate %d.", - fed->enclave.id); - } + // Reply with a stop granted to all federates + unsigned char outgoing_buffer[MSG_TYPE_STOP_GRANTED_LENGTH]; + ENCODE_STOP_GRANTED(outgoing_buffer, rti_remote->base.max_stop_tag.time, rti_remote->base.max_stop_tag.microstep); - LF_PRINT_LOG("RTI sent to federates MSG_TYPE_STOP_GRANTED with tag " PRINTF_TAG, - rti_remote->base.max_stop_tag.time - start_time, rti_remote->base.max_stop_tag.microstep); + // Iterate over federates and send each the message. + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->enclave.state == NOT_CONNECTED) { + continue; } - - /** - * Mark a federate requesting stop. If the number of federates handling stop reaches - * the number of persistent federates, broadcast MSG_TYPE_STOP_GRANTED to every federate. - * This function assumes the _RTI.mutex is already locked. - * @param fed The federate that has requested a stop. - * @return 1 if stop time has been sent to all federates and 0 otherwise. - */ - static int mark_federate_requesting_stop(federate_info_t * fed) { - if (!fed->requested_stop) { - // Increment the number of federates handling stop only if it is persistent - if (!fed->is_transient) - rti_remote->base.num_scheduling_nodes_handling_stop++; - fed->requested_stop = true; - } - if (rti_remote->base.num_scheduling_nodes_handling_stop == - (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { - // We now have information about the stop time of all - // federates. 
- broadcast_stop_time_to_federates_locked(); - return 1; - } - return 0; + if (lf_tag_compare(fed->enclave.next_event, rti_remote->base.max_stop_tag) >= 0) { + // Need the next_event to be no greater than the stop tag. + fed->enclave.next_event = rti_remote->base.max_stop_tag; } - - /** - * Thread to time out if federates do not reply to stop request. - */ - static void* wait_for_stop_request_reply(void* args) { - initialize_lf_thread_id(); - // Divide the time into small chunks and check periodically. - interval_t chunk = MAX_TIME_FOR_REPLY_TO_STOP_REQUEST / 30; - int count = 0; - while (count++ < 30) { - if (stop_granted_already_sent_to_federates) - return NULL; - lf_sleep(chunk); - } - // If we reach here, then error out. - lf_print_error_and_exit("Received only %d stop request replies within timeout " PRINTF_TIME "ns. RTI is exiting.", - rti_remote->base.num_scheduling_nodes_handling_stop, MAX_TIME_FOR_REPLY_TO_STOP_REQUEST); - return NULL; + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_STOP_GRN, fed->enclave.id, &rti_remote->base.max_stop_tag); } + write_to_socket_fail_on_error(&fed->socket, MSG_TYPE_STOP_GRANTED_LENGTH, outgoing_buffer, &rti_mutex, + "RTI failed to send MSG_TYPE_STOP_GRANTED message to federate %d.", fed->enclave.id); + } - void handle_stop_request_message(federate_info_t * fed) { - LF_PRINT_DEBUG("RTI handling stop_request from federate %d.", fed->enclave.id); - - size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_LENGTH - 1; - unsigned char buffer[bytes_to_read]; - read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, NULL, - "RTI failed to read the MSG_TYPE_STOP_REQUEST payload from federate %d.", - fed->enclave.id); + LF_PRINT_LOG("RTI sent to federates MSG_TYPE_STOP_GRANTED with tag " PRINTF_TAG, + rti_remote->base.max_stop_tag.time - start_time, rti_remote->base.max_stop_tag.microstep); +} - // Extract the proposed stop tag for the federate - tag_t proposed_stop_tag = extract_tag(buffer); +/** + * 
Mark a federate requesting stop. If the number of federates handling stop reaches + * the number of persistent federates, broadcast MSG_TYPE_STOP_GRANTED to every federate. + * This function assumes the _RTI.mutex is already locked. + * @param fed The federate that has requested a stop. + * @return 1 if stop time has been sent to all federates and 0 otherwise. + */ +static int mark_federate_requesting_stop(federate_info_t* fed) { + if (!fed->requested_stop) { + // Increment the number of federates handling stop only if it is persistent + if (!fed->is_transient) + rti_remote->base.num_scheduling_nodes_handling_stop++; + fed->requested_stop = true; + } + if (rti_remote->base.num_scheduling_nodes_handling_stop == + (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { + // We now have information about the stop time of all + // federates. + broadcast_stop_time_to_federates_locked(); + return 1; + } + return 0; +} - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_STOP_REQ, fed->enclave.id, &proposed_stop_tag); - } +/** + * Thread to time out if federates do not reply to stop request. + */ +static void* wait_for_stop_request_reply(void* args) { + initialize_lf_thread_id(); + // Divide the time into small chunks and check periodically. + interval_t chunk = MAX_TIME_FOR_REPLY_TO_STOP_REQUEST / 30; + int count = 0; + while (count++ < 30) { + if (stop_granted_already_sent_to_federates) + return NULL; + lf_sleep(chunk); + } + // If we reach here, then error out. + lf_print_error_and_exit("Received only %d stop request replies within timeout " PRINTF_TIME "ns. 
RTI is exiting.", + rti_remote->base.num_scheduling_nodes_handling_stop, MAX_TIME_FOR_REPLY_TO_STOP_REQUEST); + return NULL; +} - LF_PRINT_LOG("RTI received from federate %d a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", - fed->enclave.id, proposed_stop_tag.time - start_time, proposed_stop_tag.microstep); +void handle_stop_request_message(federate_info_t* fed) { + LF_PRINT_DEBUG("RTI handling stop_request from federate %d.", fed->enclave.id); - // Acquire a mutex lock to ensure that this state does change while a - // message is in transport or being used to determine a TAG. - LF_MUTEX_LOCK(&rti_mutex); + size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_LENGTH - 1; + unsigned char buffer[bytes_to_read]; + read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, NULL, + "RTI failed to read the MSG_TYPE_STOP_REQUEST payload from federate %d.", + fed->enclave.id); - // Check whether we have already received a stop_tag - // from this federate - if (fed->requested_stop) { - // If stop request messages have already been broadcast, treat this as if it were a reply. - if (rti_remote->stop_in_progress) { - mark_federate_requesting_stop(fed); - } - LF_MUTEX_UNLOCK(&rti_mutex); - return; - } + // Extract the proposed stop tag for the federate + tag_t proposed_stop_tag = extract_tag(buffer); - // Update the maximum stop tag received from federates - if (lf_tag_compare(proposed_stop_tag, rti_remote->base.max_stop_tag) > 0) { - rti_remote->base.max_stop_tag = proposed_stop_tag; - } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_STOP_REQ, fed->enclave.id, &proposed_stop_tag); + } - // If all federates have replied, send stop request granted. - if (mark_federate_requesting_stop(fed)) { - // Have send stop request granted to all federates. Nothing more to do. 
- LF_MUTEX_UNLOCK(&rti_mutex); - return; - } + LF_PRINT_LOG("RTI received from federate %d a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", + fed->enclave.id, proposed_stop_tag.time - start_time, proposed_stop_tag.microstep); - // Forward the stop request to all other federates that have not - // also issued a stop request. - unsigned char stop_request_buffer[MSG_TYPE_STOP_REQUEST_LENGTH]; - ENCODE_STOP_REQUEST(stop_request_buffer, rti_remote->base.max_stop_tag.time, - rti_remote->base.max_stop_tag.microstep); + // Acquire a mutex lock to ensure that this state does change while a + // message is in transport or being used to determine a TAG. + LF_MUTEX_LOCK(&rti_mutex); - // Iterate over federates and send each the MSG_TYPE_STOP_REQUEST message - // if we do not have a stop_time already for them. Do not do this more than once. - if (rti_remote->stop_in_progress) { - LF_MUTEX_UNLOCK(&rti_mutex); - return; - } - rti_remote->stop_in_progress = true; - // Need a timeout here in case a federate never replies. 
- lf_thread_t timeout_thread; - lf_thread_create(&timeout_thread, wait_for_stop_request_reply, NULL); - - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* f = GET_FED_INFO(i); - if (f->enclave.id != fed->enclave.id && f->requested_stop == false) { - if (f->enclave.state == NOT_CONNECTED) { - mark_federate_requesting_stop(f); - continue; - } - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_STOP_REQ, f->enclave.id, &rti_remote->base.max_stop_tag); - } - write_to_socket_fail_on_error(&f->socket, MSG_TYPE_STOP_REQUEST_LENGTH, stop_request_buffer, &rti_mutex, - "RTI failed to forward MSG_TYPE_STOP_REQUEST message to federate %d.", - f->enclave.id); - } - } - LF_PRINT_LOG("RTI forwarded to federates MSG_TYPE_STOP_REQUEST with tag (" PRINTF_TIME ", %u).", - rti_remote->base.max_stop_tag.time - start_time, rti_remote->base.max_stop_tag.microstep); - LF_MUTEX_UNLOCK(&rti_mutex); + // Check whether we have already received a stop_tag + // from this federate + if (fed->requested_stop) { + // If stop request messages have already been broadcast, treat this as if it were a reply. + if (rti_remote->stop_in_progress) { + mark_federate_requesting_stop(fed); } + LF_MUTEX_UNLOCK(&rti_mutex); + return; + } + + // Update the maximum stop tag received from federates + if (lf_tag_compare(proposed_stop_tag, rti_remote->base.max_stop_tag) > 0) { + rti_remote->base.max_stop_tag = proposed_stop_tag; + } - void handle_stop_request_reply(federate_info_t * fed) { - size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_REPLY_LENGTH - 1; - unsigned char buffer_stop_time[bytes_to_read]; - read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer_stop_time, NULL, - "RTI failed to read the reply to MSG_TYPE_STOP_REQUEST message from federate %d.", - fed->enclave.id); + // If all federates have replied, send stop request granted. + if (mark_federate_requesting_stop(fed)) { + // Have send stop request granted to all federates. 
Nothing more to do. + LF_MUTEX_UNLOCK(&rti_mutex); + return; + } - tag_t federate_stop_tag = extract_tag(buffer_stop_time); + // Forward the stop request to all other federates that have not + // also issued a stop request. + unsigned char stop_request_buffer[MSG_TYPE_STOP_REQUEST_LENGTH]; + ENCODE_STOP_REQUEST(stop_request_buffer, rti_remote->base.max_stop_tag.time, rti_remote->base.max_stop_tag.microstep); + // Iterate over federates and send each the MSG_TYPE_STOP_REQUEST message + // if we do not have a stop_time already for them. Do not do this more than once. + if (rti_remote->stop_in_progress) { + LF_MUTEX_UNLOCK(&rti_mutex); + return; + } + rti_remote->stop_in_progress = true; + // Need a timeout here in case a federate never replies. + lf_thread_t timeout_thread; + lf_thread_create(&timeout_thread, wait_for_stop_request_reply, NULL); + + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* f = GET_FED_INFO(i); + if (f->enclave.id != fed->enclave.id && f->requested_stop == false) { + if (f->enclave.state == NOT_CONNECTED) { + mark_federate_requesting_stop(f); + continue; + } if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_STOP_REQ_REP, fed->enclave.id, &federate_stop_tag); + tracepoint_rti_to_federate(send_STOP_REQ, f->enclave.id, &rti_remote->base.max_stop_tag); } + write_to_socket_fail_on_error(&f->socket, MSG_TYPE_STOP_REQUEST_LENGTH, stop_request_buffer, &rti_mutex, + "RTI failed to forward MSG_TYPE_STOP_REQUEST message to federate %d.", + f->enclave.id); + } + } + LF_PRINT_LOG("RTI forwarded to federates MSG_TYPE_STOP_REQUEST with tag (" PRINTF_TIME ", %u).", + rti_remote->base.max_stop_tag.time - start_time, rti_remote->base.max_stop_tag.microstep); + LF_MUTEX_UNLOCK(&rti_mutex); +} - LF_PRINT_LOG("RTI received from federate %d STOP reply tag " PRINTF_TAG ".", fed->enclave.id, - federate_stop_tag.time - start_time, federate_stop_tag.microstep); +void 
handle_stop_request_reply(federate_info_t* fed) { + size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_REPLY_LENGTH - 1; + unsigned char buffer_stop_time[bytes_to_read]; + read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer_stop_time, NULL, + "RTI failed to read the reply to MSG_TYPE_STOP_REQUEST message from federate %d.", + fed->enclave.id); - // Acquire the mutex lock so that we can change the state of the RTI - LF_MUTEX_LOCK(&rti_mutex); - // If the federate has not requested stop before, count the reply - if (lf_tag_compare(federate_stop_tag, rti_remote->base.max_stop_tag) > 0) { - rti_remote->base.max_stop_tag = federate_stop_tag; - } - mark_federate_requesting_stop(fed); - LF_MUTEX_UNLOCK(&rti_mutex); - } + tag_t federate_stop_tag = extract_tag(buffer_stop_time); - ////////////////////////////////////////////////// + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_STOP_REQ_REP, fed->enclave.id, &federate_stop_tag); + } - void handle_address_query(uint16_t fed_id) { - federate_info_t* fed = GET_FED_INFO(fed_id); - // Use buffer both for reading and constructing the reply. - // The length is what is needed for the reply. 
- unsigned char buffer[1 + sizeof(int32_t)]; - read_from_socket_fail_on_error(&fed->socket, sizeof(uint16_t), (unsigned char*)buffer, NULL, - "Failed to read address query."); - uint16_t remote_fed_id = extract_uint16(buffer); + LF_PRINT_LOG("RTI received from federate %d STOP reply tag " PRINTF_TAG ".", fed->enclave.id, + federate_stop_tag.time - start_time, federate_stop_tag.microstep); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_ADR_QR, fed_id, NULL); - } + // Acquire the mutex lock so that we can change the state of the RTI + LF_MUTEX_LOCK(&rti_mutex); + // If the federate has not requested stop before, count the reply + if (lf_tag_compare(federate_stop_tag, rti_remote->base.max_stop_tag) > 0) { + rti_remote->base.max_stop_tag = federate_stop_tag; + } + mark_federate_requesting_stop(fed); + LF_MUTEX_UNLOCK(&rti_mutex); +} - LF_PRINT_DEBUG("RTI received address query from %d for %d.", fed_id, remote_fed_id); +////////////////////////////////////////////////// - // NOTE: server_port initializes to -1, which means the RTI does not know - // the port number because it has not yet received an MSG_TYPE_ADDRESS_ADVERTISEMENT message - // from this federate. In that case, it will respond by sending -1. +void handle_address_query(uint16_t fed_id) { + federate_info_t* fed = GET_FED_INFO(fed_id); + // Use buffer both for reading and constructing the reply. + // The length is what is needed for the reply. + unsigned char buffer[1 + sizeof(int32_t)]; + read_from_socket_fail_on_error(&fed->socket, sizeof(uint16_t), (unsigned char*)buffer, NULL, + "Failed to read address query."); + uint16_t remote_fed_id = extract_uint16(buffer); - // Response message is MSG_TYPE_ADDRESS_QUERY_REPLY. - buffer[0] = MSG_TYPE_ADDRESS_QUERY_REPLY; + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_ADR_QR, fed_id, NULL); + } - // Encode the port number. 
- federate_info_t* remote_fed = GET_FED_INFO(remote_fed_id); + LF_PRINT_DEBUG("RTI received address query from %d for %d.", fed_id, remote_fed_id); - // Send the port number (which could be -1). - LF_MUTEX_LOCK(&rti_mutex); - encode_int32(remote_fed->server_port, (unsigned char*)&buffer[1]); - write_to_socket_fail_on_error(&fed->socket, sizeof(int32_t) + 1, (unsigned char*)buffer, &rti_mutex, - "Failed to write port number to socket of federate %d.", fed_id); - - // Send the server IP address to federate. - write_to_socket_fail_on_error(&fed->socket, sizeof(remote_fed->server_ip_addr), - (unsigned char*)&remote_fed->server_ip_addr, &rti_mutex, - "Failed to write ip address to socket of federate %d.", fed_id); - LF_MUTEX_UNLOCK(&rti_mutex); - - LF_PRINT_DEBUG("Replied to address query from federate %d with address %s:%d.", fed_id, - remote_fed->server_hostname, remote_fed->server_port); - } + // NOTE: server_port initializes to -1, which means the RTI does not know + // the port number because it has not yet received an MSG_TYPE_ADDRESS_ADVERTISEMENT message + // from this federate. In that case, it will respond by sending -1. - void handle_address_ad(uint16_t federate_id) { - federate_info_t* fed = GET_FED_INFO(federate_id); - // Read the port number of the federate that can be used for physical - // connections to other federates - int32_t server_port = -1; - unsigned char buffer[sizeof(int32_t)]; - read_from_socket_fail_on_error(&fed->socket, sizeof(int32_t), (unsigned char*)buffer, NULL, - "Error reading port data from federate %d.", federate_id); + // Response message is MSG_TYPE_ADDRESS_QUERY_REPLY. + buffer[0] = MSG_TYPE_ADDRESS_QUERY_REPLY; - server_port = extract_int32(buffer); + // Encode the port number. + federate_info_t* remote_fed = GET_FED_INFO(remote_fed_id); - assert(server_port < 65536); + // Send the port number (which could be -1). 
+ LF_MUTEX_LOCK(&rti_mutex); + encode_int32(remote_fed->server_port, (unsigned char*)&buffer[1]); + write_to_socket_fail_on_error(&fed->socket, sizeof(int32_t) + 1, (unsigned char*)buffer, &rti_mutex, + "Failed to write port number to socket of federate %d.", fed_id); - LF_MUTEX_LOCK(&rti_mutex); - fed->server_port = server_port; - LF_MUTEX_UNLOCK(&rti_mutex); + // Send the server IP address to federate. + write_to_socket_fail_on_error(&fed->socket, sizeof(remote_fed->server_ip_addr), + (unsigned char*)&remote_fed->server_ip_addr, &rti_mutex, + "Failed to write ip address to socket of federate %d.", fed_id); + LF_MUTEX_UNLOCK(&rti_mutex); - LF_PRINT_LOG("Received address advertisement with port %d from federate %d.", server_port, federate_id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_ADR_AD, federate_id, NULL); - } - } + LF_PRINT_DEBUG("Replied to address query from federate %d with address %s:%d.", fed_id, remote_fed->server_hostname, + remote_fed->server_port); +} - /** - * @brief Send the global federation start time and the federate-specific starting tag to the specified federate. - * - * For persistent federates and transient federates that happen to join during federation startup, the - * `federation_start_time` will match the time in the `federate_start_tag`, and the microstep will be 0. - * For a transient federate that joins later, the time in the `federate_start_tag` will be greater than the - * federation_start_time`. - * - * - * Before sending the start time and tag, this function notifies my_fed of all upstream transient federates that are - * connected. After sending the start time and tag, and if my_fed is transient, notify federates downstream of its - * connection, ensuring proper handling of zero-delay cycles. - * - * This function assumes that the mutex lock is already held. - * - * @param my_fed the federate to send the start time to. 
- * @param federation_start_time the federation start_time - * @param federate_start_tag the federate effective start tag - */ - static void send_start_tag_locked(federate_info_t * my_fed, instant_t federation_start_time, - tag_t federate_start_tag) { - // Notify my_fed of any upstream transient federates that are connected. - // This has to occur before sending the start tag so that my_fed does not begin executing thinking that these - // upstream federates are not connected. - for (int i = 0; i < my_fed->enclave.num_upstream; i++) { - federate_info_t* fed = GET_FED_INFO(my_fed->enclave.upstream[i]); - if (fed->is_transient && fed->enclave.state == GRANTED) { - send_upstream_connected_locked(my_fed, fed); - } - } +void handle_address_ad(uint16_t federate_id) { + federate_info_t* fed = GET_FED_INFO(federate_id); + // Read the port number of the federate that can be used for physical + // connections to other federates + int32_t server_port = -1; + unsigned char buffer[sizeof(int32_t)]; + read_from_socket_fail_on_error(&fed->socket, sizeof(int32_t), (unsigned char*)buffer, NULL, + "Error reading port data from federate %d.", federate_id); - // Send back to the federate the maximum time plus an offset on a TIMESTAMP_START - // message. 
- // In the startup phase, federates will receive identical start_time and - // effective_start_tag - unsigned char start_time_buffer[MSG_TYPE_TIMESTAMP_START_LENGTH]; - start_time_buffer[0] = MSG_TYPE_TIMESTAMP_START; - encode_int64(swap_bytes_if_big_endian_int64(federation_start_time), &start_time_buffer[1]); - encode_tag(&(start_time_buffer[1 + sizeof(instant_t)]), federate_start_tag); + server_port = extract_int32(buffer); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_TIMESTAMP, my_fed->enclave.id, &federate_start_tag); - } - if (write_to_socket(my_fed->socket, MSG_TYPE_TIMESTAMP_START_LENGTH, start_time_buffer)) { - lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); - } else { - // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP_START - // message has been sent. That MSG_TYPE_TIMESTAMP_START message grants time advance to - // the federate to the federate_start_tag.time. - my_fed->enclave.state = GRANTED; - lf_cond_broadcast(&sent_start_time); - LF_PRINT_LOG("RTI sent start time " PRINTF_TIME " to federate %d.", start_time, my_fed->enclave.id); - - // If this is a transient federate, notify its downstream federates that it is now connected. - if (my_fed->is_transient) { - for (int i = 0; i < my_fed->enclave.num_downstream; i++) { - send_upstream_connected_locked(GET_FED_INFO(my_fed->enclave.downstream[i]), my_fed); - } - } - } - } + assert(server_port < 65536); - void handle_timestamp(federate_info_t * my_fed) { - unsigned char buffer[sizeof(int64_t)]; - // Read bytes from the socket. We need 8 bytes. 
- read_from_socket_fail_on_error(&my_fed->socket, sizeof(int64_t), (unsigned char*)&buffer, NULL, - "ERROR reading timestamp from federate %d.\n", my_fed->enclave.id); + LF_MUTEX_LOCK(&rti_mutex); + fed->server_port = server_port; + LF_MUTEX_UNLOCK(&rti_mutex); - int64_t timestamp = swap_bytes_if_big_endian_int64(*((int64_t*)(&buffer))); - if (rti_remote->base.tracing_enabled) { - tag_t tag = {.time = timestamp, .microstep = 0}; - tracepoint_rti_from_federate(receive_TIMESTAMP, my_fed->enclave.id, &tag); - } - LF_PRINT_DEBUG("RTI received timestamp message with time: " PRINTF_TIME ".", timestamp); - - LF_MUTEX_LOCK(&rti_mutex); - - // Processing the TIMESTAMP depends on whether it is the startup phase. - if (rti_remote->phase == startup_phase) { - // Not all persistent federates have proposed a start time. - if (timestamp > rti_remote->max_start_time) { - rti_remote->max_start_time = timestamp; - } - // Note that if a transient federate's thread gets here during the startup phase, - // then it will be assigned the same global tag as its effective start tag and its - // timestamp will affect that start tag. - if (!my_fed->is_transient) { - rti_remote->num_feds_proposed_start++; - } - if (rti_remote->num_feds_proposed_start == - (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { - // This federate is the last persistent federate to proposed a start time. - lf_cond_broadcast(&received_start_times); - rti_remote->phase = execution_phase; - } else { - // Wait until all persistent federates have proposed a start time. - while (rti_remote->num_feds_proposed_start < - (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { - lf_cond_wait(&received_start_times); - } - } - // Add an offset to the maximum tag to get everyone starting together. 
- start_time = rti_remote->max_start_time + DELAY_START; - my_fed->effective_start_tag = (tag_t){.time = start_time, .microstep = 0u}; + LF_PRINT_LOG("Received address advertisement with port %d from federate %d.", server_port, federate_id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_ADR_AD, federate_id, NULL); + } +} - LF_MUTEX_UNLOCK(&rti_mutex); +/** + * @brief Send the global federation start time and the federate-specific starting tag to the specified federate. + * + * For persistent federates and transient federates that happen to join during federation startup, the + * `federation_start_time` will match the time in the `federate_start_tag`, and the microstep will be 0. + * For a transient federate that joins later, the time in the `federate_start_tag` will be greater than the + * federation_start_time`. + * + * + * Before sending the start time and tag, this function notifies my_fed of all upstream transient federates that are + * connected. After sending the start time and tag, and if my_fed is transient, notify federates downstream of its + * connection, ensuring proper handling of zero-delay cycles. + * + * This function assumes that the mutex lock is already held. + * + * @param my_fed the federate to send the start time to. + * @param federation_start_time the federation start_time + * @param federate_start_tag the federate effective start tag + */ +static void send_start_tag_locked(federate_info_t* my_fed, instant_t federation_start_time, tag_t federate_start_tag) { + // Notify my_fed of any upstream transient federates that are connected. + // This has to occur before sending the start tag so that my_fed does not begin executing thinking that these + // upstream federates are not connected. 
+ for (int i = 0; i < my_fed->enclave.num_upstream; i++) { + federate_info_t* fed = GET_FED_INFO(my_fed->enclave.upstream[i]); + if (fed->is_transient && fed->enclave.state == GRANTED) { + send_upstream_connected_locked(my_fed, fed); + } + } - // Notify the federate of its start tag. - // This has to be done while still holding the mutex. - send_start_tag(my_fed, start_time, my_fed->effective_start_tag); + // Send back to the federate the maximum time plus an offset on a TIMESTAMP_START + // message. + // In the startup phase, federates will receive identical start_time and + // effective_start_tag + unsigned char start_time_buffer[MSG_TYPE_TIMESTAMP_START_LENGTH]; + start_time_buffer[0] = MSG_TYPE_TIMESTAMP_START; + encode_int64(swap_bytes_if_big_endian_int64(federation_start_time), &start_time_buffer[1]); + encode_tag(&(start_time_buffer[1 + sizeof(instant_t)]), federate_start_tag); + + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_TIMESTAMP, my_fed->enclave.id, &federate_start_tag); + } + if (write_to_socket(my_fed->socket, MSG_TYPE_TIMESTAMP_START_LENGTH, start_time_buffer)) { + lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); + } else { + // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP_START + // message has been sent. That MSG_TYPE_TIMESTAMP_START message grants time advance to + // the federate to the federate_start_tag.time. + my_fed->enclave.state = GRANTED; + lf_cond_broadcast(&sent_start_time); + LF_PRINT_LOG("RTI sent start time " PRINTF_TIME " to federate %d.", start_time, my_fed->enclave.id); + + // If this is a transient federate, notify its downstream federates that it is now connected. 
+ if (my_fed->is_transient) { + for (int i = 0; i < my_fed->enclave.num_downstream; i++) { + send_upstream_connected_locked(GET_FED_INFO(my_fed->enclave.downstream[i]), my_fed); + } + } + } +} - LF_MUTEX_UNLOCK(&rti_mutex); - } else if (rti_remote->phase == shutdown_phase || !my_fed->is_transient) { - LF_MUTEX_UNLOCK(&rti_mutex); +void handle_timestamp(federate_info_t* my_fed) { + unsigned char buffer[sizeof(int64_t)]; + // Read bytes from the socket. We need 8 bytes. + read_from_socket_fail_on_error(&my_fed->socket, sizeof(int64_t), (unsigned char*)&buffer, NULL, + "ERROR reading timestamp from federate %d.\n", my_fed->enclave.id); - // Send reject message if the federation is in shutdown phase or if - // it is in the execution phase but the federate is persistent. - send_reject(&my_fed->socket, JOINING_TOO_LATE); - return; - } else { - // The federate is transient and we are in the execution phase. - // At this point, we already hold the mutex. - - //// Algorithm for computing the effective_start_time of a joining transient - // The effective_start_time will be the max among all the following tags: - // 1. At tag: (joining time, 0 microstep) - // 2. (start_time, 0 microstep) - // 3. The latest completed logical tag + 1 microstep - // 4. The latest granted (P)TAG + 1 microstep, of every downstream federate - // 5. The maximun tag of messages from the upstream federates + 1 microstep - - // Condition 1. - my_fed->effective_start_tag = (tag_t){.time = timestamp, .microstep = 0u}; - - // Condition 2. 
- if (timestamp < start_time) { - my_fed->effective_start_tag = (tag_t){.time = start_time, .microstep = 0u}; - } + int64_t timestamp = swap_bytes_if_big_endian_int64(*((int64_t*)(&buffer))); + if (rti_remote->base.tracing_enabled) { + tag_t tag = {.time = timestamp, .microstep = 0}; + tracepoint_rti_from_federate(receive_TIMESTAMP, my_fed->enclave.id, &tag); + } + LF_PRINT_DEBUG("RTI received timestamp message with time: " PRINTF_TIME ".", timestamp); + + LF_MUTEX_LOCK(&rti_mutex); + + // Processing the TIMESTAMP depends on whether it is the startup phase. + if (rti_remote->phase == startup_phase) { + // Not all persistent federates have proposed a start time. + if (timestamp > rti_remote->max_start_time) { + rti_remote->max_start_time = timestamp; + } + // Note that if a transient federate's thread gets here during the startup phase, + // then it will be assigned the same global tag as its effective start tag and its + // timestamp will affect that start tag. + if (!my_fed->is_transient) { + rti_remote->num_feds_proposed_start++; + } + if (rti_remote->num_feds_proposed_start == + (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { + // This federate is the last persistent federate to proposed a start time. + lf_cond_broadcast(&received_start_times); + rti_remote->phase = execution_phase; + } else { + // Wait until all persistent federates have proposed a start time. + while (rti_remote->num_feds_proposed_start < + (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { + lf_cond_wait(&received_start_times); + } + } + // Add an offset to the maximum tag to get everyone starting together. + start_time = rti_remote->max_start_time + DELAY_START; + my_fed->effective_start_tag = (tag_t){.time = start_time, .microstep = 0u}; - // Condition 3. 
- if (lf_tag_compare(my_fed->enclave.completed, my_fed->effective_start_tag) >= 0) { - my_fed->effective_start_tag = my_fed->enclave.completed; - my_fed->effective_start_tag.microstep++; - } + // Notify the federate of its start tag. + // This has to be done while still holding the mutex. + send_start_tag_locked(my_fed, start_time, my_fed->effective_start_tag); - // Condition 4. Iterate over the downstream federates - for (int j = 0; j < my_fed->enclave.num_downstream; j++) { - federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); + LF_MUTEX_UNLOCK(&rti_mutex); + } else if (rti_remote->phase == shutdown_phase || !my_fed->is_transient) { + LF_MUTEX_UNLOCK(&rti_mutex); - // Get the max over the TAG of the downstreams - if (lf_tag_compare(downstream->enclave.last_granted, my_fed->effective_start_tag) >= 0) { - my_fed->effective_start_tag = downstream->enclave.last_granted; - my_fed->effective_start_tag.microstep++; - } + // Send reject message if the federation is in shutdown phase or if + // it is in the execution phase but the federate is persistent. + send_reject(&my_fed->socket, JOINING_TOO_LATE); + return; + } else { + // The federate is transient and we are in the execution phase. + // At this point, we already hold the mutex. - // Get the max over the PTAG of the downstreams - if (lf_tag_compare(downstream->enclave.last_provisionally_granted, my_fed->effective_start_tag) >= 0) { - my_fed->effective_start_tag = downstream->enclave.last_provisionally_granted; - my_fed->effective_start_tag.microstep++; - } - } + //// Algorithm for computing the effective_start_time of a joining transient + // The effective_start_time will be the max among all the following tags: + // 1. At tag: (joining time, 0 microstep) + // 2. (start_time, 0 microstep) + // 3. The latest completed logical tag + 1 microstep + // 4. The latest granted (P)TAG + 1 microstep, of every downstream federate + // 5. 
The maximun tag of messages from the upstream federates + 1 microstep - // Condition 5. - // This one is a bit subtle. Any messages from upstream federates that the RTI has - // not yet seen will be sent to this joining federate after the effective_start_tag - // because the effective_start_tag is sent while still holding the mutex. + // Condition 1. + my_fed->effective_start_tag = (tag_t){.time = timestamp, .microstep = 0u}; - // Iterate over the messages from the upstream federates - for (int j = 0; j < my_fed->enclave.num_upstream; j++) { - federate_info_t* upstream = GET_FED_INFO(my_fed->enclave.upstream[j]); + // Condition 2. + if (timestamp < start_time) { + my_fed->effective_start_tag = (tag_t){.time = start_time, .microstep = 0u}; + } - size_t queue_size = pqueue_tag_size(upstream->in_transit_message_tags); - if (queue_size != 0) { - tag_t max_tag = pqueue_tag_max_tag(upstream->in_transit_message_tags); + // Condition 3. + if (lf_tag_compare(my_fed->enclave.completed, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = my_fed->enclave.completed; + my_fed->effective_start_tag.microstep++; + } - if (lf_tag_compare(max_tag, my_fed->effective_start_tag) >= 0) { - my_fed->effective_start_tag = max_tag; - my_fed->effective_start_tag.microstep++; - } - } - } + // Condition 4. Iterate over the downstream federates + for (int j = 0; j < my_fed->enclave.num_downstream; j++) { + federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); - // For every downstream that has a pending grant that is higher than the - // effective_start_time of the federate, cancel it. - // FIXME: Should this be higher-than or equal to? - // FIXME: Also, won't the grant simply be lost? - // If the joining federate doesn't send anything, the downstream federate won't issue another NET. 
- for (int j = 0; j < my_fed->enclave.num_downstream; j++) { - federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); + // Get the max over the TAG of the downstreams + if (lf_tag_compare(downstream->enclave.last_granted, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = downstream->enclave.last_granted; + my_fed->effective_start_tag.microstep++; + } - // Ignore this federate if it has resigned. - if (downstream->enclave.state == NOT_CONNECTED) { - continue; - } + // Get the max over the PTAG of the downstreams + if (lf_tag_compare(downstream->enclave.last_provisionally_granted, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = downstream->enclave.last_provisionally_granted; + my_fed->effective_start_tag.microstep++; + } + } - // Check the pending grants, if any, and keep it only if it is - // sooner than the effective start tag. - pqueue_delayed_grant_element_t* dge = - pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, downstream->enclave.id); - if (dge != NULL && lf_tag_compare(dge->base.tag, my_fed->effective_start_tag) > 0) { - pqueue_delayed_grants_remove(rti_remote->delayed_grants, dge); - } - } + // Condition 5. + // This one is a bit subtle. Any messages from upstream federates that the RTI has + // not yet seen will be sent to this joining federate after the effective_start_tag + // because the effective_start_tag is sent while still holding the mutex. - // Once the effective start time set, sent it to the joining transient, - // together with the start time of the federation. + // Iterate over the messages from the upstream federates + for (int j = 0; j < my_fed->enclave.num_upstream; j++) { + federate_info_t* upstream = GET_FED_INFO(my_fed->enclave.upstream[j]); - // Have to send the start tag while still holding the mutex to ensure that no message - // from an upstream federate is forwarded before the start tag. 
- send_start_tag_locked(my_fed, start_time, my_fed->effective_start_tag); + size_t queue_size = pqueue_tag_size(upstream->in_transit_message_tags); + if (queue_size != 0) { + tag_t max_tag = pqueue_tag_max_tag(upstream->in_transit_message_tags); - // Whenver a transient joins, invalidate all federates, so that all min_delays_upstream - // get re-computed. - // FIXME: Needs to be optimized to only invalidate those affected by the transient - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - invalidate_min_delays_upstream(&(fed->enclave)); + if (lf_tag_compare(max_tag, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = max_tag; + my_fed->effective_start_tag.microstep++; } - - LF_MUTEX_UNLOCK(&rti_mutex); } } - void send_physical_clock(unsigned char message_type, federate_info_t* fed, socket_type_t socket_type) { - if (fed->enclave.state == NOT_CONNECTED) { - lf_print_warning("Clock sync: RTI failed to send physical time to federate %d. Socket not connected.\n", - fed->enclave.id); - return; + // For every downstream that has a pending grant that is higher than the + // effective_start_time of the federate, cancel it. + // FIXME: Should this be higher-than or equal to? + // FIXME: Also, won't the grant simply be lost? + // If the joining federate doesn't send anything, the downstream federate won't issue another NET. + for (int j = 0; j < my_fed->enclave.num_downstream; j++) { + federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); + + // Ignore this federate if it has resigned. + if (downstream->enclave.state == NOT_CONNECTED) { + continue; } - unsigned char buffer[sizeof(int64_t) + 1]; - buffer[0] = message_type; - int64_t current_physical_time = lf_time_physical(); - encode_int64(current_physical_time, &(buffer[1])); - - // Send the message - if (socket_type == UDP) { - // FIXME: UDP_addr is never initialized. 
- LF_PRINT_DEBUG("Clock sync: RTI sending UDP message type %u.", buffer[0]); - ssize_t bytes_written = sendto(rti_remote->socket_descriptor_UDP, buffer, 1 + sizeof(int64_t), 0, - (struct sockaddr*)&fed->UDP_addr, sizeof(fed->UDP_addr)); - if (bytes_written < (ssize_t)sizeof(int64_t) + 1) { - lf_print_warning("Clock sync: RTI failed to send physical time to federate %d: %s\n", fed->enclave.id, - strerror(errno)); - return; - } - } else if (socket_type == TCP) { - LF_PRINT_DEBUG("Clock sync: RTI sending TCP message type %u.", buffer[0]); - LF_MUTEX_LOCK(&rti_mutex); - write_to_socket_fail_on_error(&fed->socket, 1 + sizeof(int64_t), buffer, &rti_mutex, - "Clock sync: RTI failed to send physical time to federate %d.", fed->enclave.id); - LF_MUTEX_UNLOCK(&rti_mutex); + + // Check the pending grants, if any, and keep it only if it is + // sooner than the effective start tag. + pqueue_delayed_grant_element_t* dge = + pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, downstream->enclave.id); + if (dge != NULL && lf_tag_compare(dge->base.tag, my_fed->effective_start_tag) > 0) { + pqueue_delayed_grants_remove(rti_remote->delayed_grants, dge); } - LF_PRINT_DEBUG("Clock sync: RTI sent PHYSICAL_TIME_SYNC_MESSAGE with timestamp " PRINTF_TIME " to federate %d.", - current_physical_time, fed->enclave.id); } - void handle_physical_clock_sync_message(federate_info_t * my_fed, socket_type_t socket_type) { - // Lock the mutex to prevent interference between sending the two - // coded probe messages. - LF_MUTEX_LOCK(&rti_mutex); - // Reply with a T4 type message - send_physical_clock(MSG_TYPE_CLOCK_SYNC_T4, my_fed, socket_type); - // Send the corresponding coded probe immediately after, - // but only if this is a UDP channel. 
- if (socket_type == UDP) { - send_physical_clock(MSG_TYPE_CLOCK_SYNC_CODED_PROBE, my_fed, socket_type); - } - LF_MUTEX_UNLOCK(&rti_mutex); + // Once the effective start time set, sent it to the joining transient, + // together with the start time of the federation. + + // Have to send the start tag while still holding the mutex to ensure that no message + // from an upstream federate is forwarded before the start tag. + send_start_tag_locked(my_fed, start_time, my_fed->effective_start_tag); + + // Whenver a transient joins, invalidate all federates, so that all min_delays_upstream + // get re-computed. + // FIXME: Needs to be optimized to only invalidate those affected by the transient + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + invalidate_min_delays_upstream(&(fed->enclave)); } - void* clock_synchronization_thread(void* noargs) { - initialize_lf_thread_id(); - // Wait until all federates have been notified of the start time. - // FIXME: Use lf_ version of this when merged with master. - LF_MUTEX_LOCK(&rti_mutex); - while (rti_remote->num_feds_proposed_start < rti_remote->base.number_of_scheduling_nodes) { - lf_cond_wait(&received_start_times); - } - LF_MUTEX_UNLOCK(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); + } +} - // Wait until the start time before starting clock synchronization. - // The above wait ensures that start_time has been set. - interval_t ns_to_wait = start_time - lf_time_physical(); +void send_physical_clock(unsigned char message_type, federate_info_t* fed, socket_type_t socket_type) { + if (fed->enclave.state == NOT_CONNECTED) { + lf_print_warning("Clock sync: RTI failed to send physical time to federate %d. 
Socket not connected.\n", + fed->enclave.id); + return; + } + unsigned char buffer[sizeof(int64_t) + 1]; + buffer[0] = message_type; + int64_t current_physical_time = lf_time_physical(); + encode_int64(current_physical_time, &(buffer[1])); + + // Send the message + if (socket_type == UDP) { + // FIXME: UDP_addr is never initialized. + LF_PRINT_DEBUG("Clock sync: RTI sending UDP message type %u.", buffer[0]); + ssize_t bytes_written = sendto(rti_remote->socket_descriptor_UDP, buffer, 1 + sizeof(int64_t), 0, + (struct sockaddr*)&fed->UDP_addr, sizeof(fed->UDP_addr)); + if (bytes_written < (ssize_t)sizeof(int64_t) + 1) { + lf_print_warning("Clock sync: RTI failed to send physical time to federate %d: %s\n", fed->enclave.id, + strerror(errno)); + return; + } + } else if (socket_type == TCP) { + LF_PRINT_DEBUG("Clock sync: RTI sending TCP message type %u.", buffer[0]); + LF_MUTEX_LOCK(&rti_mutex); + write_to_socket_fail_on_error(&fed->socket, 1 + sizeof(int64_t), buffer, &rti_mutex, + "Clock sync: RTI failed to send physical time to federate %d.", fed->enclave.id); + LF_MUTEX_UNLOCK(&rti_mutex); + } + LF_PRINT_DEBUG("Clock sync: RTI sent PHYSICAL_TIME_SYNC_MESSAGE with timestamp " PRINTF_TIME " to federate %d.", + current_physical_time, fed->enclave.id); +} - if (ns_to_wait > 0LL) { - lf_sleep(ns_to_wait); - } +void handle_physical_clock_sync_message(federate_info_t* my_fed, socket_type_t socket_type) { + // Lock the mutex to prevent interference between sending the two + // coded probe messages. + LF_MUTEX_LOCK(&rti_mutex); + // Reply with a T4 type message + send_physical_clock(MSG_TYPE_CLOCK_SYNC_T4, my_fed, socket_type); + // Send the corresponding coded probe immediately after, + // but only if this is a UDP channel. 
+ if (socket_type == UDP) { + send_physical_clock(MSG_TYPE_CLOCK_SYNC_CODED_PROBE, my_fed, socket_type); + } + LF_MUTEX_UNLOCK(&rti_mutex); +} - // Initiate a clock synchronization every rti->clock_sync_period_ns - bool any_federates_connected = true; - while (any_federates_connected) { - // Sleep - lf_sleep(rti_remote->clock_sync_period_ns); // Can be interrupted - any_federates_connected = false; - for (int fed_id = 0; fed_id < rti_remote->base.number_of_scheduling_nodes; fed_id++) { - federate_info_t* fed = GET_FED_INFO(fed_id); - if (fed->enclave.state == NOT_CONNECTED) { - // FIXME: We need better error handling here, but clock sync failure - // should not stop execution. - lf_print_error("Clock sync failed with federate %d. Not connected.", fed_id); - continue; - } else if (!fed->clock_synchronization_enabled) { - continue; - } - // Send the RTI's current physical time to the federate - // Send on UDP. - LF_PRINT_DEBUG("RTI sending T1 message to initiate clock sync round."); - send_physical_clock(MSG_TYPE_CLOCK_SYNC_T1, fed, UDP); +void* clock_synchronization_thread(void* noargs) { + initialize_lf_thread_id(); + // Wait until all federates have been notified of the start time. + // FIXME: Use lf_ version of this when merged with master. + LF_MUTEX_LOCK(&rti_mutex); + while (rti_remote->num_feds_proposed_start < rti_remote->base.number_of_scheduling_nodes) { + lf_cond_wait(&received_start_times); + } + LF_MUTEX_UNLOCK(&rti_mutex); - // Listen for reply message, which should be T3. - size_t message_size = 1 + sizeof(uint16_t); - unsigned char buffer[message_size]; - // Maximum number of messages that we discard before giving up on this cycle. - // If the T3 message from this federate does not arrive and we keep receiving - // other messages, then give up on this federate and move to the next federate. 
- int remaining_attempts = 5; - while (remaining_attempts > 0) { - remaining_attempts--; - int read_failed = read_from_socket(rti_remote->socket_descriptor_UDP, message_size, buffer); - // If any errors occur, either discard the message or the clock sync round. - if (!read_failed) { - if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { - uint16_t fed_id_2 = extract_uint16(&(buffer[1])); - // Check that this message came from the correct federate. - if (fed_id_2 != fed->enclave.id) { - // Message is from the wrong federate. Discard the message. - lf_print_warning("Clock sync: Received T3 message from federate %d, " - "but expected one from %d. Discarding message.", - fed_id_2, fed->enclave.id); - continue; - } - LF_PRINT_DEBUG("Clock sync: RTI received T3 message from federate %d.", fed_id_2); - handle_physical_clock_sync_message(GET_FED_INFO(fed_id_2), UDP); - break; - } else { - // The message is not a T3 message. Discard the message and - // continue waiting for the T3 message. This is possibly a message - // from a previous cycle that was discarded. - lf_print_warning("Clock sync: Unexpected UDP message %u. Expected %u from federate %d. " - "Discarding message.", - buffer[0], MSG_TYPE_CLOCK_SYNC_T3, fed->enclave.id); - continue; - } - } else { - lf_print_warning("Clock sync: Read from UDP socket failed: %s. " - "Skipping clock sync round for federate %d.", - strerror(errno), fed->enclave.id); - remaining_attempts = -1; + // Wait until the start time before starting clock synchronization. + // The above wait ensures that start_time has been set. 
+ interval_t ns_to_wait = start_time - lf_time_physical(); + + if (ns_to_wait > 0LL) { + lf_sleep(ns_to_wait); + } + + // Initiate a clock synchronization every rti->clock_sync_period_ns + bool any_federates_connected = true; + while (any_federates_connected) { + // Sleep + lf_sleep(rti_remote->clock_sync_period_ns); // Can be interrupted + any_federates_connected = false; + for (int fed_id = 0; fed_id < rti_remote->base.number_of_scheduling_nodes; fed_id++) { + federate_info_t* fed = GET_FED_INFO(fed_id); + if (fed->enclave.state == NOT_CONNECTED) { + // FIXME: We need better error handling here, but clock sync failure + // should not stop execution. + lf_print_error("Clock sync failed with federate %d. Not connected.", fed_id); + continue; + } else if (!fed->clock_synchronization_enabled) { + continue; + } + // Send the RTI's current physical time to the federate + // Send on UDP. + LF_PRINT_DEBUG("RTI sending T1 message to initiate clock sync round."); + send_physical_clock(MSG_TYPE_CLOCK_SYNC_T1, fed, UDP); + + // Listen for reply message, which should be T3. + size_t message_size = 1 + sizeof(uint16_t); + unsigned char buffer[message_size]; + // Maximum number of messages that we discard before giving up on this cycle. + // If the T3 message from this federate does not arrive and we keep receiving + // other messages, then give up on this federate and move to the next federate. + int remaining_attempts = 5; + while (remaining_attempts > 0) { + remaining_attempts--; + int read_failed = read_from_socket(rti_remote->socket_descriptor_UDP, message_size, buffer); + // If any errors occur, either discard the message or the clock sync round. + if (!read_failed) { + if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { + uint16_t fed_id_2 = extract_uint16(&(buffer[1])); + // Check that this message came from the correct federate. + if (fed_id_2 != fed->enclave.id) { + // Message is from the wrong federate. Discard the message. 
+ lf_print_warning("Clock sync: Received T3 message from federate %d, " + "but expected one from %d. Discarding message.", + fed_id_2, fed->enclave.id); + continue; } + LF_PRINT_DEBUG("Clock sync: RTI received T3 message from federate %d.", fed_id_2); + handle_physical_clock_sync_message(GET_FED_INFO(fed_id_2), UDP); + break; + } else { + // The message is not a T3 message. Discard the message and + // continue waiting for the T3 message. This is possibly a message + // from a previous cycle that was discarded. + lf_print_warning("Clock sync: Unexpected UDP message %u. Expected %u from federate %d. " + "Discarding message.", + buffer[0], MSG_TYPE_CLOCK_SYNC_T3, fed->enclave.id); + continue; } - if (remaining_attempts > 0) { - any_federates_connected = true; - } + } else { + lf_print_warning("Clock sync: Read from UDP socket failed: %s. " + "Skipping clock sync round for federate %d.", + strerror(errno), fed->enclave.id); + remaining_attempts = -1; } } - return NULL; + if (remaining_attempts > 0) { + any_federates_connected = true; + } } + } + return NULL; +} - /** - * Handle MSG_TYPE_FAILED sent by a federate. This message is sent by a federate - * that is exiting in failure. In this case, the RTI will - * also terminate abnormally, returning a non-zero exit code when it exits. - * - * This function assumes the caller does not hold the mutex. - * - * @param my_fed The federate sending a MSG_TYPE_FAILED message. - */ - static void handle_federate_failed(federate_info_t * my_fed) { - // Nothing more to do. Close the socket and exit. - LF_MUTEX_LOCK(&rti_mutex); +/** + * Handle MSG_TYPE_FAILED sent by a federate. This message is sent by a federate + * that is exiting in failure. In this case, the RTI will + * also terminate abnormally, returning a non-zero exit code when it exits. + * + * This function assumes the caller does not hold the mutex. + * + * @param my_fed The federate sending a MSG_TYPE_FAILED message. 
+ */ +static void handle_federate_failed(federate_info_t* my_fed) { + // Nothing more to do. Close the socket and exit. + LF_MUTEX_LOCK(&rti_mutex); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_FAILED, my_fed->enclave.id, NULL); - } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_FAILED, my_fed->enclave.id, NULL); + } - // Set the flag telling the RTI to exit with an error code when it exits. - _lf_federate_reports_error = true; - lf_print_error("RTI: Federate %d reports an error and has exited.", my_fed->enclave.id); + // Set the flag telling the RTI to exit with an error code when it exits. + _lf_federate_reports_error = true; + lf_print_error("RTI: Federate %d reports an error and has exited.", my_fed->enclave.id); - notify_federate_disconnected(&my_fed->enclave); - my_fed->enclave.state = NOT_CONNECTED; + notify_federate_disconnected(&my_fed->enclave); + my_fed->enclave.state = NOT_CONNECTED; - // Indicate that there will no further events from this federate. - my_fed->enclave.next_event = FOREVER_TAG; + // Indicate that there will no further events from this federate. + my_fed->enclave.next_event = FOREVER_TAG; - // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, - // the close should happen when receiving a 0 length message from the other end. - // Here, we just signal the other side that no further writes to the socket are - // forthcoming, which should result in the other end getting a zero-length reception. - shutdown(my_fed->socket, SHUT_RDWR); + // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, + // the close should happen when receiving a 0 length message from the other end. + // Here, we just signal the other side that no further writes to the socket are + // forthcoming, which should result in the other end getting a zero-length reception. 
+ shutdown(my_fed->socket, SHUT_RDWR); - // We can now safely close the socket. - close(my_fed->socket); // from unistd.h + // We can now safely close the socket. + close(my_fed->socket); // from unistd.h - // Check downstream federates to see whether they should now be granted a TAG. - // To handle cycles, need to create a boolean array to keep - // track of which upstream federates have been visited. - bool* visited = (bool*)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. - notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); - free(visited); + // Check downstream federates to see whether they should now be granted a TAG. + // To handle cycles, need to create a boolean array to keep + // track of which upstream federates have been visited. + bool* visited = (bool*)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. + notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); + free(visited); - LF_MUTEX_UNLOCK(&rti_mutex); - } + LF_MUTEX_UNLOCK(&rti_mutex); +} - /** - * Handle MSG_TYPE_RESIGN sent by a federate. This message is sent at the time of termination - * after all shutdown events are processed on the federate. - * - * This function assumes the caller does not hold the mutex. - * - * @note At this point, the RTI might have outgoing messages to the federate. This - * function thus first performs a shutdown on the socket, which sends an EOF. It then - * waits for the remote socket to be closed before closing the socket itself. - * - * @param my_fed The federate sending a MSG_TYPE_RESIGN message. - */ - static void handle_federate_resign(federate_info_t * my_fed) { - // Nothing more to do. Close the socket and exit. - LF_MUTEX_LOCK(&rti_mutex); +/** + * Handle MSG_TYPE_RESIGN sent by a federate. This message is sent at the time of termination + * after all shutdown events are processed on the federate. 
+ * + * This function assumes the caller does not hold the mutex. + * + * @note At this point, the RTI might have outgoing messages to the federate. This + * function thus first performs a shutdown on the socket, which sends an EOF. It then + * waits for the remote socket to be closed before closing the socket itself. + * + * @param my_fed The federate sending a MSG_TYPE_RESIGN message. + */ +static void handle_federate_resign(federate_info_t* my_fed) { + // Nothing more to do. Close the socket and exit. + LF_MUTEX_LOCK(&rti_mutex); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_RESIGN, my_fed->enclave.id, NULL); - } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_RESIGN, my_fed->enclave.id, NULL); + } - lf_print("RTI: Federate %d has resigned.", my_fed->enclave.id); + lf_print("RTI: Federate %d has resigned.", my_fed->enclave.id); - my_fed->enclave.state = NOT_CONNECTED; + my_fed->enclave.state = NOT_CONNECTED; - // Indicate that there will no further events from this federate. - my_fed->enclave.next_event = FOREVER_TAG; - - // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, - // the close should happen when receiving a 0 length message from the other end. - // Here, we just signal the other side that no further writes to the socket are - // forthcoming, which should result in the other end getting a zero-length reception. - shutdown(my_fed->socket, SHUT_WR); - - // Wait for the federate to send an EOF or a socket error to occur. - // Discard any incoming bytes. Normally, this read should return 0 because - // the federate is resigning and should itself invoke shutdown. - unsigned char buffer[10]; - while (read(my_fed->socket, buffer, 10) > 0) - ; - - // We can now safely close the socket. - close(my_fed->socket); // from unistd.h - - // Check downstream federates to see whether they should now be granted a TAG. 
- // To handle cycles, need to create a boolean array to keep - // track of which upstream federates have been visited. - bool* visited = (bool*)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. - notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); - free(visited); - - LF_MUTEX_UNLOCK(&rti_mutex); - } + notify_federate_disconnected(&my_fed->enclave); - void* federate_info_thread_TCP(void* fed) { - initialize_lf_thread_id(); - federate_info_t* my_fed = (federate_info_t*)fed; - - // Buffer for incoming messages. - // This does not constrain the message size because messages - // are forwarded piece by piece. - unsigned char buffer[FED_COM_BUFFER_SIZE]; - - // Listen for messages from the federate. - while (my_fed->enclave.state != NOT_CONNECTED) { - // Read no more than one byte to get the message type. - int read_failed = read_from_socket(my_fed->socket, 1, buffer); - if (read_failed) { - // Socket is closed - lf_print_error("RTI: Socket to federate %d is closed. Exiting the thread.", my_fed->enclave.id); - my_fed->enclave.state = NOT_CONNECTED; - my_fed->socket = -1; - // FIXME: We need better error handling here, but do not stop execution here. - break; - } - LF_PRINT_DEBUG("RTI: Received message type %u from federate %d.", buffer[0], my_fed->enclave.id); - switch (buffer[0]) { - case MSG_TYPE_TIMESTAMP: - handle_timestamp(my_fed); - break; - case MSG_TYPE_ADDRESS_QUERY: - handle_address_query(my_fed->enclave.id); - break; - case MSG_TYPE_ADDRESS_ADVERTISEMENT: - handle_address_ad(my_fed->enclave.id); - break; - case MSG_TYPE_TAGGED_MESSAGE: - handle_timed_message(my_fed, buffer); - break; - case MSG_TYPE_RESIGN: - handle_federate_resign(my_fed); - break; - case MSG_TYPE_NEXT_EVENT_TAG: - handle_next_event_tag(my_fed); - break; - case MSG_TYPE_LATEST_TAG_COMPLETE: - handle_latest_tag_complete(my_fed); - break; - case MSG_TYPE_STOP_REQUEST: - handle_stop_request_message(my_fed); // FIXME: Reviewed until here. 
- // Need to also look at - // notify_advance_grant_if_safe() - // and notify_downstream_advance_grant_if_safe() - break; - case MSG_TYPE_STOP_REQUEST_REPLY: - handle_stop_request_reply(my_fed); - break; - case MSG_TYPE_PORT_ABSENT: - handle_port_absent_message(my_fed, buffer); - break; - case MSG_TYPE_FAILED: - handle_federate_failed(my_fed); - return NULL; - default: - lf_print_error("RTI received from federate %d an unrecognized TCP message type: %u.", my_fed->enclave.id, - buffer[0]); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(receive_UNIDENTIFIED, my_fed->enclave.id, NULL); - } - } - } + // Indicate that there will no further events from this federate. + my_fed->enclave.next_event = FOREVER_TAG; - // Nothing more to do. Close the socket and exit. - // Prevent multiple threads from closing the same socket at the same time. - LF_MUTEX_LOCK(&rti_mutex); - close(my_fed->socket); // from unistd.h - // Manual clean, in case of a transient federate - if (my_fed->is_transient) { - // FIXME: Aren't there transit messages anymore??? - // free_in_transit_message_q(my_fed->in_transit_message_tags); - lf_print("RTI: Transient Federate %d thread exited.", my_fed->enclave.id); - - // Update the number of connected transient federates - rti_remote->number_of_connected_transient_federates--; - - // Reset the status of the leaving federate - reset_transient_federate(my_fed); - } - // Signal the hot swap mechanism, if needed - if (hot_swap_in_progress && hot_swap_federate->enclave.id == my_fed->enclave.id) { - hot_swap_old_resigned = true; - } - LF_MUTEX_UNLOCK(&rti_mutex); - return NULL; - } + // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, + // the close should happen when receiving a 0 length message from the other end. + // Here, we just signal the other side that no further writes to the socket are + // forthcoming, which should result in the other end getting a zero-length reception. 
+ shutdown(my_fed->socket, SHUT_WR); - void send_reject(int* socket_id, rejection_code_t error_code) { - LF_PRINT_DEBUG("RTI sending MSG_TYPE_REJECT."); - unsigned char response[2]; - response[0] = MSG_TYPE_REJECT; - response[1] = (unsigned char)error_code; - LF_MUTEX_LOCK(&rti_mutex); - // NOTE: Ignore errors on this response. - if (write_to_socket(*socket_id, 2, response)) { - lf_print_warning("RTI failed to write MSG_TYPE_REJECT message on the socket."); + // // Wait for the federate to send an EOF or a socket error to occur. + // // Discard any incoming bytes. Normally, this read should return 0 because + // // the federate is resigning and should itself invoke shutdown. + unsigned char buffer[10]; + while (read(my_fed->socket, buffer, 10) > 0) + ; + + // // We can now safely close the socket. + close(my_fed->socket); // from unistd.h + + // notify_federate_disconnected(&my_fed->enclave); + // Check downstream federates to see whether they should now be granted a TAG. + // To handle cycles, need to create a boolean array to keep + // track of which upstream federates have been visited. + bool* visited = (bool*)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. + notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); + free(visited); + + LF_MUTEX_UNLOCK(&rti_mutex); +} + +void* federate_info_thread_TCP(void* fed) { + initialize_lf_thread_id(); + federate_info_t* my_fed = (federate_info_t*)fed; + + // Buffer for incoming messages. + // This does not constrain the message size because messages + // are forwarded piece by piece. + unsigned char buffer[FED_COM_BUFFER_SIZE]; + + // Listen for messages from the federate. + while (my_fed->enclave.state != NOT_CONNECTED) { + // Read no more than one byte to get the message type. + int read_failed = read_from_socket(my_fed->socket, 1, buffer); + if (read_failed) { + // Socket is closed + lf_print_error("RTI: Socket to federate %d is closed. 
Exiting the thread.", my_fed->enclave.id); + my_fed->enclave.state = NOT_CONNECTED; + notify_federate_disconnected(&my_fed->enclave); + my_fed->socket = -1; + // FIXME: We need better error handling here, but do not stop execution here. + break; + } + LF_PRINT_DEBUG("RTI: Received message type %u from federate %d.", buffer[0], my_fed->enclave.id); + switch (buffer[0]) { + case MSG_TYPE_TIMESTAMP: + handle_timestamp(my_fed); + break; + case MSG_TYPE_ADDRESS_QUERY: + handle_address_query(my_fed->enclave.id); + break; + case MSG_TYPE_ADDRESS_ADVERTISEMENT: + handle_address_ad(my_fed->enclave.id); + break; + case MSG_TYPE_TAGGED_MESSAGE: + handle_timed_message(my_fed, buffer); + break; + case MSG_TYPE_RESIGN: + handle_federate_resign(my_fed); + break; + case MSG_TYPE_NEXT_EVENT_TAG: + handle_next_event_tag(my_fed); + break; + case MSG_TYPE_LATEST_TAG_CONFIRMED: + handle_latest_tag_confirmed(my_fed); + break; + case MSG_TYPE_STOP_REQUEST: + handle_stop_request_message(my_fed); // FIXME: Reviewed until here. + // Need to also look at + // notify_advance_grant_if_safe() + // and notify_downstream_advance_grant_if_safe() + break; + case MSG_TYPE_STOP_REQUEST_REPLY: + handle_stop_request_reply(my_fed); + break; + case MSG_TYPE_PORT_ABSENT: + handle_port_absent_message(my_fed, buffer); + break; + case MSG_TYPE_FAILED: + handle_federate_failed(my_fed); + return NULL; + default: + lf_print_error("RTI received from federate %d an unrecognized TCP message type: %u.", my_fed->enclave.id, + buffer[0]); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_UNIDENTIFIED, my_fed->enclave.id, NULL); } - // Close the socket. - shutdown(*socket_id, SHUT_RDWR); - close(*socket_id); - *socket_id = -1; - LF_MUTEX_UNLOCK(&rti_mutex); } + } - /** - * Listen for a MSG_TYPE_FED_IDS message, which includes as a payload - * a federate ID and a federation ID. 
If the federation ID - * matches this federation, send an MSG_TYPE_ACK and otherwise send - * a MSG_TYPE_REJECT message. - * @param socket_id Pointer to the socket on which to listen. - * @param client_fd The socket address. - * @return The federate ID for success or -1 for failure. - */ - static int32_t receive_and_check_fed_id_message(int* socket_id, struct sockaddr_in* client_fd) { - // Buffer for message ID, federate ID, type (persistent or transient), and federation ID length. - size_t length = 1 + sizeof(uint16_t) + 1 + 1; // Message ID, federate ID, length of fedration ID, type. - unsigned char buffer[length]; - - // Read bytes from the socket. We need 4 bytes. - if (read_from_socket_close_on_error(socket_id, length, buffer)) { - lf_print_error("RTI failed to read from accepted socket."); - return -1; - } + // Nothing more to do. Close the socket and exit. + // Prevent multiple threads from closing the same socket at the same time. + LF_MUTEX_LOCK(&rti_mutex); + close(my_fed->socket); // from unistd.h + // Manual clean, in case of a transient federate + if (my_fed->is_transient) { + // FIXME: Aren't there transit messages anymore??? + // free_in_transit_message_q(my_fed->in_transit_message_tags); + lf_print("RTI: Transient Federate %d thread exited. and socket_id is: %d ", my_fed->enclave.id, my_fed->socket); + + // Update the number of connected transient federates + rti_remote->number_of_connected_transient_federates--; + + // Reset the status of the leaving federate + reset_transient_federate(my_fed); + } + // Signal the hot swap mechanism, if needed + if (hot_swap_in_progress && hot_swap_federate->enclave.id == my_fed->enclave.id) { + hot_swap_old_resigned = true; + } + LF_MUTEX_UNLOCK(&rti_mutex); + return NULL; +} - uint16_t fed_id = rti_remote->base.number_of_scheduling_nodes; // Initialize to an invalid value. 
- bool is_transient = false; +void send_reject(int* socket_id, rejection_code_t error_code) { + LF_PRINT_DEBUG("RTI sending MSG_TYPE_REJECT."); + unsigned char response[2]; + response[0] = MSG_TYPE_REJECT; + response[1] = (unsigned char)error_code; + LF_MUTEX_LOCK(&rti_mutex); + // NOTE: Ignore errors on this response. + if (write_to_socket(*socket_id, 2, response)) { + lf_print_warning("RTI failed to write MSG_TYPE_REJECT message on the socket."); + } + // Close the socket. + shutdown(*socket_id, SHUT_RDWR); + close(*socket_id); + *socket_id = -1; + LF_MUTEX_UNLOCK(&rti_mutex); +} - // First byte received is the message type. - if (buffer[0] != MSG_TYPE_FED_IDS) { - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); - } - if (buffer[0] == MSG_TYPE_P2P_SENDING_FED_ID || buffer[0] == MSG_TYPE_P2P_TAGGED_MESSAGE) { - // The federate is trying to connect to a peer, not to the RTI. - // It has connected to the RTI instead. - // FIXME: This should not happen, but apparently has been observed. - // It should not happen because the peers get the port and IP address - // of the peer they want to connect to from the RTI. - // If the connection is a peer-to-peer connection between two - // federates, reject the connection with the WRONG_SERVER error. - send_reject(socket_id, WRONG_SERVER); - } else if (buffer[0] == MSG_TYPE_FED_NONCE) { - send_reject(socket_id, RTI_NOT_EXECUTED_WITH_AUTH); - lf_print_error("RTI not executed with HMAC authentication option using -a or --auth."); - } else { - send_reject(socket_id, UNEXPECTED_MESSAGE); - } - lf_print_error("RTI expected a MSG_TYPE_FED_IDS message. Got %u (see net_common.h).", buffer[0]); - return -1; - } else { - // Received federate ID. - fed_id = extract_uint16(buffer + 1); - is_transient = (buffer[sizeof(uint16_t) + 1] == 1) ? 
true : false; - if (is_transient) { - LF_PRINT_LOG("RTI received federate ID: %d, which is transient.", fed_id); - } else { - LF_PRINT_LOG("RTI received federate ID: %d, which is persistent.", fed_id); - } +/** + * Listen for a MSG_TYPE_FED_IDS message, which includes as a payload + * a federate ID and a federation ID. If the federation ID + * matches this federation, send an MSG_TYPE_ACK and otherwise send + * a MSG_TYPE_REJECT message. + * @param socket_id Pointer to the socket on which to listen. + * @return The federate ID for success or -1 for failure. + */ +static int32_t receive_and_check_fed_id_message(int* socket_id) { + // Buffer for message ID, federate ID, type (persistent or transient), and federation ID length. + size_t length = 1 + sizeof(uint16_t) + 1 + 1; // Message ID, federate ID, length of fedration ID, type. + unsigned char buffer[length]; + + // Read bytes from the socket. We need 4 bytes. + if (read_from_socket_close_on_error(socket_id, length, buffer)) { + lf_print_error("RTI failed to read from accepted socket."); + return -1; + } - // Read the federation ID. First read the length, which is one byte. - size_t federation_id_length = (size_t)buffer[sizeof(uint16_t) + 2]; - char federation_id_received[federation_id_length + 1]; // One extra for null terminator. - // Next read the actual federation ID. - if (read_from_socket_close_on_error(socket_id, federation_id_length, (unsigned char*)federation_id_received)) { - lf_print_error("RTI failed to read federation id from federate %d.", fed_id); - return -1; - } + uint16_t fed_id = rti_remote->base.number_of_scheduling_nodes; // Initialize to an invalid value. + bool is_transient = false; - // Terminate the string with a null. - federation_id_received[federation_id_length] = 0; + // First byte received is the message type. 
+ if (buffer[0] != MSG_TYPE_FED_IDS) { + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + } + if (buffer[0] == MSG_TYPE_P2P_SENDING_FED_ID || buffer[0] == MSG_TYPE_P2P_TAGGED_MESSAGE) { + // The federate is trying to connect to a peer, not to the RTI. + // It has connected to the RTI instead. + // FIXME: This should not happen, but apparently has been observed. + // It should not happen because the peers get the port and IP address + // of the peer they want to connect to from the RTI. + // If the connection is a peer-to-peer connection between two + // federates, reject the connection with the WRONG_SERVER error. + send_reject(socket_id, WRONG_SERVER); + } else if (buffer[0] == MSG_TYPE_FED_NONCE) { + send_reject(socket_id, RTI_NOT_EXECUTED_WITH_AUTH); + lf_print_error("RTI not executed with HMAC authentication option using -a or --auth."); + } else { + send_reject(socket_id, UNEXPECTED_MESSAGE); + } + lf_print_error("RTI expected a MSG_TYPE_FED_IDS message. Got %u (see net_common.h).", buffer[0]); + return -1; + } else { + // Received federate ID. + fed_id = extract_uint16(buffer + 1); + is_transient = (buffer[sizeof(uint16_t) + 1] == 1) ? true : false; + if (is_transient) { + LF_PRINT_LOG("RTI received federate ID: %d, which is transient.", fed_id); + } else { + LF_PRINT_LOG("RTI received federate ID: %d, which is persistent.", fed_id); + } + + // Read the federation ID. First read the length, which is one byte. + size_t federation_id_length = (size_t)buffer[sizeof(uint16_t) + 2]; + char federation_id_received[federation_id_length + 1]; // One extra for null terminator. + // Next read the actual federation ID. + if (read_from_socket_close_on_error(socket_id, federation_id_length, (unsigned char*)federation_id_received)) { + lf_print_error("RTI failed to read federation id from federate %d.", fed_id); + return -1; + } + + // Terminate the string with a null. 
+ federation_id_received[federation_id_length] = 0; - LF_PRINT_DEBUG("RTI received federation ID: %s.", federation_id_received); + LF_PRINT_DEBUG("RTI received federation ID: %s.", federation_id_received); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(receive_FED_ID, fed_id, NULL); + } + // Compare the received federation ID to mine. + if (strncmp(rti_remote->federation_id, federation_id_received, federation_id_length) != 0) { + // Federation IDs do not match. Send back a MSG_TYPE_REJECT message. + lf_print_warning("Federate from another federation %s attempted to connect to RTI in federation %s.", + federation_id_received, rti_remote->federation_id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + } + send_reject(socket_id, FEDERATION_ID_DOES_NOT_MATCH); + return -1; + } else { + if (fed_id >= rti_remote->base.number_of_scheduling_nodes) { + // Federate ID is out of range. + lf_print_error("RTI received federate ID %d, which is out of range.", fed_id); if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(rti_remote->base.trace, receive_FED_ID, fed_id, NULL); + tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); } - // Compare the received federation ID to mine. - if (strncmp(rti_remote->federation_id, federation_id_received, federation_id_length) != 0) { - // Federation IDs do not match. Send back a MSG_TYPE_REJECT message. - lf_print_warning("Federate from another federation %s attempted to connect to RTI in federation %s.", - federation_id_received, rti_remote->federation_id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); - } - send_reject(socket_id, FEDERATION_ID_DOES_NOT_MATCH); - return -1; - } else { - if (fed_id >= rti_remote->base.number_of_scheduling_nodes) { - // Federate ID is out of range. 
- lf_print_error("RTI received federate ID %d, which is out of range.", fed_id); + send_reject(socket_id, FEDERATE_ID_OUT_OF_RANGE); + return -1; + } else { + // Find out if it is a new connection or a hot swap. + // Reject if: + // - duplicate of a connected persistent federate + // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that + // particular federate + // - or it is a hot swap, but it is not the execution phase yet + // Find out if it is a new connection or a hot swap. + // Reject if: + // - duplicate of a connected persistent federate + // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that + // particular federate + // - or it is a hot swap, but it is not the execution phase yet + if ((rti_remote->base.scheduling_nodes[fed_id])->state != NOT_CONNECTED) { + if (!is_transient) { + lf_print_error("RTI received duplicate federate ID: %d.", fed_id); if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); + tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); } - send_reject(socket_id, FEDERATE_ID_OUT_OF_RANGE); + send_reject(socket_id, FEDERATE_ID_IN_USE); return -1; - } else { - // Find out if it is a new connection or a hot swap. - // Reject if: - // - duplicate of a connected persistent federate - // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that - // particular federate - // - or it is a hot swap, but it is not the execution phase yet - // Find out if it is a new connection or a hot swap. 
- // Reject if: - // - duplicate of a connected persistent federate - // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that - // particular federate - // - or it is a hot swap, but it is not the execution phase yet - if ((rti_remote->base.scheduling_nodes[fed_id])->state != NOT_CONNECTED) { - if (!is_transient) { - lf_print_error("RTI received duplicate federate ID: %d.", fed_id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); - } - send_reject(socket_id, FEDERATE_ID_IN_USE); - return -1; - } else if (hot_swap_in_progress || rti_remote->phase != execution_phase) { - lf_print_warning("RTI rejects the connection of transient federate %d, \ + } else if (hot_swap_in_progress || rti_remote->phase != execution_phase) { + lf_print_warning("RTI rejects the connection of transient federate %d, \ because a hot swap is already in progress for federate %d. \n\ Only one hot swap operation is allowed at a time.", - fed_id, hot_swap_federate->enclave.id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); - } - send_reject(socket_id, FEDERATE_ID_IN_USE); - return -1; - } + fed_id, hot_swap_federate->enclave.id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); } + send_reject(socket_id, FEDERATE_ID_IN_USE); + return -1; } } } + } + } - federate_info_t* fed_twin = GET_FED_INFO(fed_id); - federate_info_t* fed; - // If the federate is already connected (making the request a duplicate), and that - // the federate is transient, and it is the execution phase, then mark that a hot - // swap is in progreass and initialize the hot_swap_federate. 
- // Otherwise, proceed with a normal transinet connection - if (fed_twin->enclave.state != NOT_CONNECTED && is_transient && fed_twin->is_transient && - rti_remote->phase == execution_phase && !hot_swap_in_progress) { - // Allocate memory for the new federate and initilize it - hot_swap_federate = (federate_info_t*)malloc(sizeof(federate_info_t)); - initialize_federate(hot_swap_federate, fed_id); - - // Set that hot swap is in progress - hot_swap_in_progress = true; - // free(fed); // Free the old memory to prevent memory leak - fed = hot_swap_federate; - lf_print("RTI: Hot Swap starting for federate %d.", fed_id); - } else { - fed = fed_twin; - fed->is_transient = is_transient; - } + federate_info_t* fed_twin = GET_FED_INFO(fed_id); + federate_info_t* fed; + // If the federate is already connected (making the request a duplicate), and that + // the federate is transient, and it is the execution phase, then mark that a hot + // swap is in progreass and initialize the hot_swap_federate. + // Otherwise, proceed with a normal transinet connection + if (fed_twin->enclave.state != NOT_CONNECTED && is_transient && fed_twin->is_transient && + rti_remote->phase == execution_phase && !hot_swap_in_progress) { + // Allocate memory for the new federate and initilize it + hot_swap_federate = (federate_info_t*)malloc(sizeof(federate_info_t)); + initialize_federate(hot_swap_federate, fed_id); + + // Set that hot swap is in progress + hot_swap_in_progress = true; + // free(fed); // Free the old memory to prevent memory leak + fed = hot_swap_federate; + lf_print("RTI: Hot Swap starting for federate %d.", fed_id); + } else { + fed = fed_twin; + fed->is_transient = is_transient; + } - // The MSG_TYPE_FED_IDS message has the right federation ID. + // The MSG_TYPE_FED_IDS message has the right federation ID. - // Get the peer address from the connected socket_id. Then assign it as the federate's socket server. 
- struct sockaddr_in peer_addr; - socklen_t addr_len = sizeof(peer_addr); - if (getpeername(*socket_id, (struct sockaddr*)&peer_addr, &addr_len) != 0) { - lf_print_error("RTI failed to get peer address."); - } - fed->server_ip_addr = peer_addr.sin_addr; + // Get the peer address from the connected socket_id. Then assign it as the federate's socket server. + struct sockaddr_in peer_addr; + socklen_t addr_len = sizeof(peer_addr); + if (getpeername(*socket_id, (struct sockaddr*)&peer_addr, &addr_len) != 0) { + lf_print_error("RTI failed to get peer address."); + } + fed->server_ip_addr = peer_addr.sin_addr; #if LOG_LEVEL >= LOG_LEVEL_DEBUG - // Create the human readable format and copy that into - // the .server_hostname field of the federate. - char str[INET_ADDRSTRLEN + 1]; - inet_ntop(AF_INET, &fed->server_ip_addr, str, INET_ADDRSTRLEN); - strncpy(fed->server_hostname, str, INET_ADDRSTRLEN); + // Create the human readable format and copy that into + // the .server_hostname field of the federate. + char str[INET_ADDRSTRLEN + 1]; + inet_ntop(AF_INET, &fed->server_ip_addr, str, INET_ADDRSTRLEN); + strncpy(fed->server_hostname, str, INET_ADDRSTRLEN); - LF_PRINT_DEBUG("RTI got address %s from federate %d.", fed->server_hostname, fed_id); + LF_PRINT_DEBUG("RTI got address %s from federate %d.", fed->server_hostname, fed_id); #endif - fed->socket = *socket_id; - - // Set the federate's state as pending - // because it is waiting for the start time to be - // sent by the RTI before beginning its execution. - fed->enclave.state = PENDING; + fed->socket = *socket_id; + + // Set the federate's state as pending + // because it is waiting for the start time to be + // sent by the RTI before beginning its execution. + fed->enclave.state = PENDING; + + LF_PRINT_DEBUG("RTI responding with MSG_TYPE_ACK to federate %d.", fed_id); + // Send an MSG_TYPE_ACK message. 
+ unsigned char ack_message = MSG_TYPE_ACK; + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_ACK, fed_id, NULL); + } + LF_MUTEX_LOCK(&rti_mutex); + if (write_to_socket_close_on_error(&fed->socket, 1, &ack_message)) { + LF_MUTEX_UNLOCK(&rti_mutex); + lf_print_error("RTI failed to write MSG_TYPE_ACK message to federate %d.", fed_id); + return -1; + } + LF_MUTEX_UNLOCK(&rti_mutex); - LF_PRINT_DEBUG("RTI responding with MSG_TYPE_ACK to federate %d.", fed_id); - // Send an MSG_TYPE_ACK message. - unsigned char ack_message = MSG_TYPE_ACK; - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_ACK, fed_id, NULL); - } - LF_MUTEX_LOCK(&rti_mutex); - if (write_to_socket_close_on_error(&fed->socket, 1, &ack_message)) { - LF_MUTEX_UNLOCK(&rti_mutex); - lf_print_error("RTI failed to write MSG_TYPE_ACK message to federate %d.", fed_id); - return -1; - } - LF_MUTEX_UNLOCK(&rti_mutex); + LF_PRINT_DEBUG("RTI sent MSG_TYPE_ACK to federate %d.", fed_id); - LF_PRINT_DEBUG("RTI sent MSG_TYPE_ACK to federate %d.", fed_id); + return (int32_t)fed_id; +} - return (int32_t)fed_id; +/** + * Listen for a MSG_TYPE_NEIGHBOR_STRUCTURE message, and upon receiving it, fill + * out the relevant information in the federate's struct. + * + * In case of a hot swap, check that no changes were made to the connections, compared + * to the first instance that joigned. This means that the first instance to join + * __is__ the reference. + * + * @return 1 on success and 0 on failure. 
+ */ +static int receive_connection_information(int* socket_id, uint16_t fed_id) { + LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_NEIGHBOR_STRUCTURE from federate %d.", fed_id); + unsigned char connection_info_header[MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE]; + read_from_socket_fail_on_error(socket_id, MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE, connection_info_header, NULL, + "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message header from federate %d.", + fed_id); + + if (connection_info_header[0] != MSG_TYPE_NEIGHBOR_STRUCTURE) { + lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " + "Rejecting federate.", + fed_id, connection_info_header[0]); + send_reject(socket_id, UNEXPECTED_MESSAGE); + return 0; + } else { + // In case of a transient federate that is joining again, or a hot swap, then + // check that the connection information did not change. + federate_info_t* fed = GET_FED_INFO(fed_id); + federate_info_t* temp_fed = NULL; + if (lf_tag_compare(fed->effective_start_tag, NEVER_TAG) != 0) { + if (hot_swap_in_progress) { + fed = hot_swap_federate; + } else { + temp_fed = (federate_info_t*)calloc(1, sizeof(federate_info_t)); + initialize_federate(temp_fed, fed_id); + fed = temp_fed; + } + } + // Read the number of upstream and downstream connections + fed->enclave.num_upstream = extract_int32(&(connection_info_header[1])); + fed->enclave.num_downstream = extract_int32(&(connection_info_header[1 + sizeof(int32_t)])); + LF_PRINT_DEBUG("RTI got %d upstreams and %d downstreams from federate %d.", fed->enclave.num_upstream, + fed->enclave.num_downstream, fed_id); + + // Allocate memory for the upstream and downstream pointers + if (fed->enclave.num_upstream > 0) { + fed->enclave.upstream = (uint16_t*)malloc(sizeof(uint16_t) * fed->enclave.num_upstream); + LF_ASSERT_NON_NULL(fed->enclave.upstream); + // Allocate memory for the upstream delay pointers + fed->enclave.upstream_delay = (interval_t*)malloc(sizeof(interval_t) * 
fed->enclave.num_upstream); + LF_ASSERT_NON_NULL(fed->enclave.upstream_delay); + } else { + fed->enclave.upstream = (uint16_t*)NULL; + fed->enclave.upstream_delay = (interval_t*)NULL; + } + if (fed->enclave.num_downstream > 0) { + fed->enclave.downstream = (uint16_t*)malloc(sizeof(uint16_t) * fed->enclave.num_downstream); + LF_ASSERT_NON_NULL(fed->enclave.downstream); + } else { + fed->enclave.downstream = (uint16_t*)NULL; } - /** - * Listen for a MSG_TYPE_NEIGHBOR_STRUCTURE message, and upon receiving it, fill - * out the relevant information in the federate's struct. - * - * In case of a hot swap, check that no changes were made to the connections, compared - * to the first instance that joigned. This means that the first instance to join - * __is__ the reference. - * - * @return 1 on success and 0 on failure. - */ - static int receive_connection_information(int* socket_id, uint16_t fed_id) { - LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_NEIGHBOR_STRUCTURE from federate %d.", fed_id); - unsigned char connection_info_header[MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE]; - read_from_socket_fail_on_error(socket_id, MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE, connection_info_header, NULL, - "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message header from federate %d.", + size_t connections_info_body_size = ((sizeof(uint16_t) + sizeof(int64_t)) * fed->enclave.num_upstream) + + (sizeof(uint16_t) * fed->enclave.num_downstream); + unsigned char* connections_info_body = NULL; + if (connections_info_body_size > 0) { + connections_info_body = (unsigned char*)malloc(connections_info_body_size); + LF_ASSERT_NON_NULL(connections_info_body); + read_from_socket_fail_on_error(socket_id, connections_info_body_size, connections_info_body, NULL, + "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message body from federate %d.", fed_id); + // Keep track of where we are in the buffer + size_t message_head = 0; + // First, read the info about upstream federates + for (int i = 0; i < 
fed->enclave.num_upstream; i++) { + fed->enclave.upstream[i] = extract_uint16(&(connections_info_body[message_head])); + message_head += sizeof(uint16_t); + fed->enclave.upstream_delay[i] = extract_int64(&(connections_info_body[message_head])); + message_head += sizeof(int64_t); + } - if (connection_info_header[0] != MSG_TYPE_NEIGHBOR_STRUCTURE) { - lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " - "Rejecting federate.", - fed_id, connection_info_header[0]); - send_reject(socket_id, UNEXPECTED_MESSAGE); - return 0; + // Next, read the info about downstream federates + for (int i = 0; i < fed->enclave.num_downstream; i++) { + fed->enclave.downstream[i] = extract_uint16(&(connections_info_body[message_head])); + message_head += sizeof(uint16_t); + } + + free(connections_info_body); + } + + // NOTE: In this design, changes in the connections are not allowed. This means that the first + // instance to join __is__ the reference. If this policy is to be changed, then it is in + // the following lines will be updated accordingly. + if (hot_swap_in_progress || temp_fed != NULL) { + if (temp_fed == NULL) { + temp_fed = hot_swap_federate; + } + // Now, compare the previous and the new neighberhood structure + // Start with the number of upstreams and downstreams + bool reject = false; + if ((fed->enclave.num_upstream != temp_fed->enclave.num_upstream) || + (fed->enclave.num_downstream != temp_fed->enclave.num_downstream)) { + reject = true; } else { - // In case of a transient federate that is joining again, or a hot swap, then - // check that the connection information did not change. 
- federate_info_t* fed = GET_FED_INFO(fed_id); - federate_info_t* temp_fed = NULL; - if (lf_tag_compare(fed->effective_start_tag, NEVER_TAG) != 0) { - if (hot_swap_in_progress) { - fed = hot_swap_federate; - } else { - temp_fed = (federate_info_t*)calloc(1, sizeof(federate_info_t)); - initialize_federate(temp_fed, fed_id); - fed = temp_fed; + // Then check all upstreams and their delays + for (int i = 0; i < fed->enclave.num_upstream; i++) { + if ((fed->enclave.upstream[i] != temp_fed->enclave.upstream[i]) || + (fed->enclave.upstream_delay[i] != temp_fed->enclave.upstream_delay[i])) { + reject = true; + break; } } - // Read the number of upstream and downstream connections - fed->enclave.num_upstream = extract_int32(&(connection_info_header[1])); - fed->enclave.num_downstream = extract_int32(&(connection_info_header[1 + sizeof(int32_t)])); - LF_PRINT_DEBUG("RTI got %d upstreams and %d downstreams from federate %d.", fed->enclave.num_upstream, - fed->enclave.num_downstream, fed_id); - - // Allocate memory for the upstream and downstream pointers - if (fed->enclave.num_upstream > 0) { - fed->enclave.upstream = (uint16_t*)malloc(sizeof(uint16_t) * fed->enclave.num_upstream); - LF_ASSERT_NON_NULL(fed->enclave.upstream); - // Allocate memory for the upstream delay pointers - fed->enclave.upstream_delay = (interval_t*)malloc(sizeof(interval_t) * fed->enclave.num_upstream); - LF_ASSERT_NON_NULL(fed->enclave.upstream_delay); - } else { - fed->enclave.upstream = (uint16_t*)NULL; - fed->enclave.upstream_delay = (interval_t*)NULL; + if (!reject) { + // Finally, check all downstream federates + for (int i = 0; i < fed->enclave.num_downstream; i++) { + if (fed->enclave.downstream[i] != temp_fed->enclave.downstream[i]) { + reject = true; + break; + } + } } - if (fed->enclave.num_downstream > 0) { - fed->enclave.downstream = (uint16_t*)malloc(sizeof(uint16_t) * fed->enclave.num_downstream); - LF_ASSERT_NON_NULL(fed->enclave.downstream); - } else { - fed->enclave.downstream = 
(uint16_t*)NULL; + } + if (reject) { + if (temp_fed != hot_swap_federate) { + free(temp_fed); } + return 0; + } + } + } + LF_PRINT_DEBUG("RTI received neighbor structure from federate %d.", fed_id); + return 1; +} - size_t connections_info_body_size = ((sizeof(uint16_t) + sizeof(int64_t)) * fed->enclave.num_upstream) + - (sizeof(uint16_t) * fed->enclave.num_downstream); - unsigned char* connections_info_body = NULL; - if (connections_info_body_size > 0) { - connections_info_body = (unsigned char*)malloc(connections_info_body_size); - LF_ASSERT_NON_NULL(connections_info_body); - read_from_socket_fail_on_error( - socket_id, connections_info_body_size, connections_info_body, NULL, - "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message body from federate %d.", fed_id); - // Keep track of where we are in the buffer - size_t message_head = 0; - // First, read the info about upstream federates - for (int i = 0; i < fed->enclave.num_upstream; i++) { - fed->enclave.upstream[i] = extract_uint16(&(connections_info_body[message_head])); - message_head += sizeof(uint16_t); - fed->enclave.upstream_delay[i] = extract_int64(&(connections_info_body[message_head])); - message_head += sizeof(int64_t); - } +/** + * Listen for a MSG_TYPE_UDP_PORT message, and upon receiving it, set up + * clock synchronization and perform the initial clock synchronization. + * Initial clock synchronization is performed only if the MSG_TYPE_UDP_PORT message + * payload is not UINT16_MAX. If it is also not 0, then this function sets + * up to perform runtime clock synchronization using the UDP port number + * specified in the payload to communicate with the federate's clock + * synchronization logic. + * @param socket_id The socket on which to listen. + * @param fed_id The federate ID. + * @return 1 for success, 0 for failure. 
+ */ +static int receive_udp_message_and_set_up_clock_sync(int* socket_id, uint16_t fed_id) { + // Read the MSG_TYPE_UDP_PORT message from the federate regardless of the status of + // clock synchronization. This message will tell the RTI whether the federate + // is doing clock synchronization, and if it is, what port to use for UDP. + LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_UDP_PORT from federate %d.", fed_id); + unsigned char response[1 + sizeof(uint16_t)]; + read_from_socket_fail_on_error(socket_id, 1 + sizeof(uint16_t), response, NULL, + "RTI failed to read MSG_TYPE_UDP_PORT message from federate %d.", fed_id); + if (response[0] != MSG_TYPE_UDP_PORT) { + lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " + "Rejecting federate.", + fed_id, response[0]); + send_reject(socket_id, UNEXPECTED_MESSAGE); + return 0; + } else { + federate_info_t* fed; + if (hot_swap_in_progress) { + fed = hot_swap_federate; + } else { + fed = GET_FED_INFO(fed_id); + } + if (rti_remote->clock_sync_global_status >= clock_sync_init) { + // If no initial clock sync, no need perform initial clock sync. + uint16_t federate_UDP_port_number = extract_uint16(&(response[1])); - // Next, read the info about downstream federates - for (int i = 0; i < fed->enclave.num_downstream; i++) { - fed->enclave.downstream[i] = extract_uint16(&(connections_info_body[message_head])); - message_head += sizeof(uint16_t); - } + LF_PRINT_DEBUG("RTI got MSG_TYPE_UDP_PORT %u from federate %d.", federate_UDP_port_number, fed_id); - free(connections_info_body); - } + // A port number of UINT16_MAX means initial clock sync should not be performed. + if (federate_UDP_port_number != UINT16_MAX) { + // Perform the initialization clock synchronization with the federate. + // Send the required number of messages for clock synchronization + for (int i = 0; i < rti_remote->clock_sync_exchanges_per_interval; i++) { + // Send the RTI's current physical time T1 to the federate. 
+ send_physical_clock(MSG_TYPE_CLOCK_SYNC_T1, fed, TCP); - // NOTE: In this design, changes in the connections are not allowed. This means that the first - // instance to join __is__ the reference. If this policy is to be changed, then it is in - // the following lines will be updated accordingly. - if (hot_swap_in_progress || temp_fed != NULL) { - if (temp_fed == NULL) { - temp_fed = hot_swap_federate; - } - // Now, compare the previous and the new neighberhood structure - // Start with the number of upstreams and downstreams - bool reject = false; - if ((fed->enclave.num_upstream != temp_fed->enclave.num_upstream) || - (fed->enclave.num_downstream != temp_fed->enclave.num_downstream)) { - reject = true; + // Listen for reply message, which should be T3. + size_t message_size = 1 + sizeof(uint16_t); + unsigned char buffer[message_size]; + read_from_socket_fail_on_error(socket_id, message_size, buffer, NULL, + "Socket to federate %d unexpectedly closed.", fed_id); + if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { + uint16_t fed_id = extract_uint16(&(buffer[1])); + LF_PRINT_DEBUG("RTI received T3 clock sync message from federate %d.", fed_id); + handle_physical_clock_sync_message(fed, TCP); } else { - // Then check all upstreams and their delays - for (int i = 0; i < fed->enclave.num_upstream; i++) { - if ((fed->enclave.upstream[i] != temp_fed->enclave.upstream[i]) || - (fed->enclave.upstream_delay[i] != temp_fed->enclave.upstream_delay[i])) { - reject = true; - break; - } - } - if (!reject) { - // Finally, check all downstream federates - for (int i = 0; i < fed->enclave.num_downstream; i++) { - if (fed->enclave.downstream[i] != temp_fed->enclave.downstream[i]) { - reject = true; - break; - } - } - } - } - if (reject) { - if (temp_fed != hot_swap_federate) { - free(temp_fed); - } + lf_print_error("Unexpected message %u from federate %d.", buffer[0], fed_id); + send_reject(socket_id, UNEXPECTED_MESSAGE); return 0; } } + LF_PRINT_DEBUG("RTI finished initial clock 
synchronization with federate %d.", fed_id); } - LF_PRINT_DEBUG("RTI received neighbor structure from federate %d.", fed_id); - return 1; - } - - /** - * Listen for a MSG_TYPE_UDP_PORT message, and upon receiving it, set up - * clock synchronization and perform the initial clock synchronization. - * Initial clock synchronization is performed only if the MSG_TYPE_UDP_PORT message - * payload is not UINT16_MAX. If it is also not 0, then this function sets - * up to perform runtime clock synchronization using the UDP port number - * specified in the payload to communicate with the federate's clock - * synchronization logic. - * @param socket_id The socket on which to listen. - * @param fed_id The federate ID. - * @return 1 for success, 0 for failure. - */ - static int receive_udp_message_and_set_up_clock_sync(int* socket_id, uint16_t fed_id) { - // Read the MSG_TYPE_UDP_PORT message from the federate regardless of the status of - // clock synchronization. This message will tell the RTI whether the federate - // is doing clock synchronization, and if it is, what port to use for UDP. - LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_UDP_PORT from federate %d.", fed_id); - unsigned char response[1 + sizeof(uint16_t)]; - read_from_socket_fail_on_error(socket_id, 1 + sizeof(uint16_t), response, NULL, - "RTI failed to read MSG_TYPE_UDP_PORT message from federate %d.", fed_id); - if (response[0] != MSG_TYPE_UDP_PORT) { - lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " - "Rejecting federate.", - fed_id, response[0]); - send_reject(socket_id, UNEXPECTED_MESSAGE); - return 0; - } else { - federate_info_t* fed; - if (hot_swap_in_progress) { - fed = hot_swap_federate; - } else { - fed = GET_FED_INFO(fed_id); - } - if (rti_remote->clock_sync_global_status >= clock_sync_init) { - // If no initial clock sync, no need perform initial clock sync. 
- uint16_t federate_UDP_port_number = extract_uint16(&(response[1])); - - LF_PRINT_DEBUG("RTI got MSG_TYPE_UDP_PORT %u from federate %d.", federate_UDP_port_number, fed_id); - - // A port number of UINT16_MAX means initial clock sync should not be performed. - if (federate_UDP_port_number != UINT16_MAX) { - // Perform the initialization clock synchronization with the federate. - // Send the required number of messages for clock synchronization - for (int i = 0; i < rti_remote->clock_sync_exchanges_per_interval; i++) { - // Send the RTI's current physical time T1 to the federate. - send_physical_clock(MSG_TYPE_CLOCK_SYNC_T1, fed, TCP); - - // Listen for reply message, which should be T3. - size_t message_size = 1 + sizeof(uint16_t); - unsigned char buffer[message_size]; - read_from_socket_fail_on_error(socket_id, message_size, buffer, NULL, - "Socket to federate %d unexpectedly closed.", fed_id); - if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { - uint16_t fed_id = extract_uint16(&(buffer[1])); - LF_PRINT_DEBUG("RTI received T3 clock sync message from federate %d.", fed_id); - handle_physical_clock_sync_message(fed, TCP); - } else { - lf_print_error("Unexpected message %u from federate %d.", buffer[0], fed_id); - send_reject(socket_id, UNEXPECTED_MESSAGE); - return 0; - } - } - LF_PRINT_DEBUG("RTI finished initial clock synchronization with federate %d.", fed_id); - } - if (rti_remote->clock_sync_global_status >= clock_sync_on) { - // If no runtime clock sync, no need to set up the UDP port. - if (federate_UDP_port_number > 0) { - // Initialize the UDP_addr field of the federate struct - fed->UDP_addr.sin_family = AF_INET; - fed->UDP_addr.sin_port = htons(federate_UDP_port_number); - fed->UDP_addr.sin_addr = fed->server_ip_addr; - } - } else { - // Disable clock sync after initial round. - fed->clock_synchronization_enabled = false; - } - } else { - // No clock synchronization at all. 
- LF_PRINT_DEBUG("RTI: No clock synchronization for federate %d.", fed_id); - // Clock synchronization is universally disabled via the clock-sync command-line parameter - // (-c off was passed to the RTI). - // Note that the federates are still going to send a - // MSG_TYPE_UDP_PORT message but with a payload (port) of -1. - fed->clock_synchronization_enabled = false; + if (rti_remote->clock_sync_global_status >= clock_sync_on) { + // If no runtime clock sync, no need to set up the UDP port. + if (federate_UDP_port_number > 0) { + // Initialize the UDP_addr field of the federate struct + fed->UDP_addr.sin_family = AF_INET; + fed->UDP_addr.sin_port = htons(federate_UDP_port_number); + fed->UDP_addr.sin_addr = fed->server_ip_addr; } + } else { + // Disable clock sync after initial round. + fed->clock_synchronization_enabled = false; } - return 1; + } else { + // No clock synchronization at all. + LF_PRINT_DEBUG("RTI: No clock synchronization for federate %d.", fed_id); + // Clock synchronization is universally disabled via the clock-sync command-line parameter + // (-c off was passed to the RTI). + // Note that the federates are still going to send a + // MSG_TYPE_UDP_PORT message but with a payload (port) of -1. + fed->clock_synchronization_enabled = false; } + } + return 1; +} #ifdef __RTI_AUTH__ - /** - * Authenticate incoming federate by performing HMAC-based authentication. - * - * @param socket Socket for the incoming federate tryting to authenticate. - * @return True if authentication is successful and false otherwise. - */ - static bool authenticate_federate(int* socket) { - // Wait for MSG_TYPE_FED_NONCE from federate. 
- size_t fed_id_length = sizeof(uint16_t); - unsigned char buffer[1 + fed_id_length + NONCE_LENGTH]; - read_from_socket_fail_on_error(socket, 1 + fed_id_length + NONCE_LENGTH, buffer, NULL, - "Failed to read MSG_TYPE_FED_NONCE"); - if (buffer[0] != MSG_TYPE_FED_NONCE) { - lf_print_error_and_exit("Received unexpected response %u from the FED (see net_common.h).", buffer[0]); - } - unsigned int hmac_length = SHA256_HMAC_LENGTH; - size_t federation_id_length = strnlen(rti_remote->federation_id, 255); - // HMAC tag is created with MSG_TYPE, federate ID, received federate nonce. - unsigned char mac_buf[1 + fed_id_length + NONCE_LENGTH]; - mac_buf[0] = MSG_TYPE_RTI_RESPONSE; - memcpy(&mac_buf[1], &buffer[1], fed_id_length); - memcpy(&mac_buf[1 + fed_id_length], &buffer[1 + fed_id_length], NONCE_LENGTH); - unsigned char hmac_tag[hmac_length]; - unsigned char* ret = HMAC(EVP_sha256(), rti_remote->federation_id, federation_id_length, mac_buf, - 1 + fed_id_length + NONCE_LENGTH, hmac_tag, &hmac_length); - if (ret == NULL) { - lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_RTI_RESPONSE."); - } - // Make buffer for message type, RTI's nonce, and HMAC tag. - unsigned char sender[1 + NONCE_LENGTH + hmac_length]; - sender[0] = MSG_TYPE_RTI_RESPONSE; - unsigned char rti_nonce[NONCE_LENGTH]; - RAND_bytes(rti_nonce, NONCE_LENGTH); - memcpy(&sender[1], rti_nonce, NONCE_LENGTH); - memcpy(&sender[1 + NONCE_LENGTH], hmac_tag, hmac_length); - if (write_to_socket(*socket, 1 + NONCE_LENGTH + hmac_length, sender)) { - lf_print_error("Failed to send nonce to federate."); - } +/** + * Authenticate incoming federate by performing HMAC-based authentication. + * + * @param socket Socket for the incoming federate tryting to authenticate. + * @return True if authentication is successful and false otherwise. + */ +static bool authenticate_federate(int* socket) { + // Wait for MSG_TYPE_FED_NONCE from federate. 
+ size_t fed_id_length = sizeof(uint16_t); + unsigned char buffer[1 + fed_id_length + NONCE_LENGTH]; + read_from_socket_fail_on_error(socket, 1 + fed_id_length + NONCE_LENGTH, buffer, NULL, + "Failed to read MSG_TYPE_FED_NONCE"); + if (buffer[0] != MSG_TYPE_FED_NONCE) { + lf_print_error_and_exit("Received unexpected response %u from the FED (see net_common.h).", buffer[0]); + } + unsigned int hmac_length = SHA256_HMAC_LENGTH; + size_t federation_id_length = strnlen(rti_remote->federation_id, 255); + // HMAC tag is created with MSG_TYPE, federate ID, received federate nonce. + unsigned char mac_buf[1 + fed_id_length + NONCE_LENGTH]; + mac_buf[0] = MSG_TYPE_RTI_RESPONSE; + memcpy(&mac_buf[1], &buffer[1], fed_id_length); + memcpy(&mac_buf[1 + fed_id_length], &buffer[1 + fed_id_length], NONCE_LENGTH); + unsigned char hmac_tag[hmac_length]; + unsigned char* ret = HMAC(EVP_sha256(), rti_remote->federation_id, federation_id_length, mac_buf, + 1 + fed_id_length + NONCE_LENGTH, hmac_tag, &hmac_length); + if (ret == NULL) { + lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_RTI_RESPONSE."); + } + // Make buffer for message type, RTI's nonce, and HMAC tag. 
+ unsigned char sender[1 + NONCE_LENGTH + hmac_length]; + sender[0] = MSG_TYPE_RTI_RESPONSE; + unsigned char rti_nonce[NONCE_LENGTH]; + RAND_bytes(rti_nonce, NONCE_LENGTH); + memcpy(&sender[1], rti_nonce, NONCE_LENGTH); + memcpy(&sender[1 + NONCE_LENGTH], hmac_tag, hmac_length); + if (write_to_socket(*socket, 1 + NONCE_LENGTH + hmac_length, sender)) { + lf_print_error("Failed to send nonce to federate."); + } - // Wait for MSG_TYPE_FED_RESPONSE - unsigned char received[1 + hmac_length]; - read_from_socket_fail_on_error(socket, 1 + hmac_length, received, NULL, "Failed to read federate response."); - if (received[0] != MSG_TYPE_FED_RESPONSE) { - lf_print_error_and_exit("Received unexpected response %u from the federate (see net_common.h).", received[0]); - return false; - } - // HMAC tag is created with MSG_TYPE_FED_RESPONSE and RTI's nonce. - unsigned char mac_buf2[1 + NONCE_LENGTH]; - mac_buf2[0] = MSG_TYPE_FED_RESPONSE; - memcpy(&mac_buf2[1], rti_nonce, NONCE_LENGTH); - unsigned char rti_tag[hmac_length]; - ret = HMAC(EVP_sha256(), rti_remote->federation_id, federation_id_length, mac_buf2, 1 + NONCE_LENGTH, rti_tag, - &hmac_length); - if (ret == NULL) { - lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_FED_RESPONSE."); - } - // Compare received tag and created tag. - if (memcmp(&received[1], rti_tag, hmac_length) != 0) { - // Federation IDs do not match. Send back a HMAC_DOES_NOT_MATCH message. - lf_print_warning("HMAC authentication failed. 
Rejecting the federate."); - send_reject(socket, HMAC_DOES_NOT_MATCH); - return false; - } else { - LF_PRINT_LOG("Federate's HMAC verified."); - return true; - } - } + // Wait for MSG_TYPE_FED_RESPONSE + unsigned char received[1 + hmac_length]; + read_from_socket_fail_on_error(socket, 1 + hmac_length, received, NULL, "Failed to read federate response."); + if (received[0] != MSG_TYPE_FED_RESPONSE) { + lf_print_error_and_exit("Received unexpected response %u from the federate (see net_common.h).", received[0]); + return false; + } + // HMAC tag is created with MSG_TYPE_FED_RESPONSE and RTI's nonce. + unsigned char mac_buf2[1 + NONCE_LENGTH]; + mac_buf2[0] = MSG_TYPE_FED_RESPONSE; + memcpy(&mac_buf2[1], rti_nonce, NONCE_LENGTH); + unsigned char rti_tag[hmac_length]; + ret = HMAC(EVP_sha256(), rti_remote->federation_id, federation_id_length, mac_buf2, 1 + NONCE_LENGTH, rti_tag, + &hmac_length); + if (ret == NULL) { + lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_FED_RESPONSE."); + } + // Compare received tag and created tag. + if (memcmp(&received[1], rti_tag, hmac_length) != 0) { + // Federation IDs do not match. Send back a HMAC_DOES_NOT_MATCH message. + lf_print_warning("HMAC authentication failed. Rejecting the federate."); + send_reject(socket, HMAC_DOES_NOT_MATCH); + return false; + } else { + LF_PRINT_LOG("Federate's HMAC verified."); + return true; + } +} #endif - // FIXME: The socket descriptor here (parameter) is not used. Should be removed? - void lf_connect_to_persistent_federates(int socket_descriptor) { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes - rti_remote.number_of_transient_federates; i++) { - // Wait for an incoming connection request. - struct sockaddr client_fd; - uint32_t client_length = sizeof(client_fd); - // The following blocks until a federate connects. 
- int socket_id = -1; - while (1) { - socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); - if (socket_id >= 0) { - // Got a socket - break; - } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { - lf_print_error_system_failure("RTI failed to accept the socket."); - } else { - // Try again - lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); - continue; - } - } - +// FIXME: The socket descriptor here (parameter) is not used. Should be removed? +void lf_connect_to_persistent_federates(int socket_descriptor) { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates; i++) { + int socket_id = accept_socket(rti_remote->socket_descriptor_TCP, -1); // Wait for the first message from the federate when RTI -a option is on. #ifdef __RTI_AUTH__ - if (rti_remote->authentication_enabled) { - if (!authenticate_federate(&socket_id)) { - lf_print_warning("RTI failed to authenticate the incoming federate."); - // Close the socket. - shutdown(socket_id, SHUT_RDWR); - close(socket_id); - socket_id = -1; - // Ignore the federate that failed authentication. - i--; - continue; - } - } + if (rti_remote->authentication_enabled) { + if (!authenticate_federate(&socket_id)) { + lf_print_warning("RTI failed to authenticate the incoming federate."); + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); + socket_id = -1; + // Ignore the federate that failed authentication. + i--; + continue; + } + } #endif - // The first message from the federate should contain its ID and the federation ID. - int32_t fed_id = receive_and_check_fed_id_message(&socket_id); - if (fed_id >= 0 && socket_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && - receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { - - // Create a thread to communicate with the federate. 
- // This has to be done after clock synchronization is finished - // or that thread may end up attempting to handle incoming clock - // synchronization messages. - federate_info_t* fed = GET_FED_INFO(fed_id); - lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); - - // If the federate is transient, then do not count it. - if (fed->is_transient) { - rti_remote->number_of_connected_transient_federates++; - assert(rti_remote->number_of_connected_transient_federates <= rti_remote->number_of_transient_federates); - i--; - lf_print("RTI: Transient federate %d joined.", fed->enclave.id); - } - } else { - // Received message was rejected. Try again. - i--; - } - } - // All federates have connected. - LF_PRINT_DEBUG("All persistent federates have connected to RTI."); + // The first message from the federate should contain its ID and the federation ID. + int32_t fed_id = receive_and_check_fed_id_message(&socket_id); + if (fed_id >= 0 && socket_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && + receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { - if (rti_remote->clock_sync_global_status >= clock_sync_on) { - // Create the thread that performs periodic PTP clock synchronization sessions - // over the UDP channel, but only if the UDP channel is open and at least one - // federate is performing runtime clock synchronization. - bool clock_sync_enabled = false; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed_info = GET_FED_INFO(i); - if (fed_info->clock_synchronization_enabled) { - clock_sync_enabled = true; - break; - } - } - if (rti_remote->final_port_UDP != UINT16_MAX && clock_sync_enabled) { - lf_thread_create(&rti_remote->clock_thread, clock_synchronization_thread, NULL); - } + // Create a thread to communicate with the federate. 
+ // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. + federate_info_t* fed = GET_FED_INFO(fed_id); + lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + + // If the federate is transient, then do not count it. + if (fed->is_transient) { + rti_remote->number_of_connected_transient_federates++; + assert(rti_remote->number_of_connected_transient_federates <= rti_remote->number_of_transient_federates); + i--; + lf_print("RTI: Transient federate %d joined.", fed->enclave.id); } + } else { + // Received message was rejected. Try again. + i--; } - - /** - * @brief A request for immediate stop to the federate - * - * @param fed: the deferate to stop - */ - void send_stop(federate_info_t * fed) { - // Reply with a stop granted to all federates - unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; - outgoing_buffer[0] = MSG_TYPE_STOP; - lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); - - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP, fed->enclave.id, NULL); + } + // All federates have connected. + LF_PRINT_DEBUG("All persistent federates have connected to RTI."); + + if (rti_remote->clock_sync_global_status >= clock_sync_on) { + // Create the thread that performs periodic PTP clock synchronization sessions + // over the UDP channel, but only if the UDP channel is open and at least one + // federate is performing runtime clock synchronization. 
+ bool clock_sync_enabled = false; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed_info = GET_FED_INFO(i); + if (fed_info->clock_synchronization_enabled) { + clock_sync_enabled = true; + break; } - write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, - "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); - - LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); } + if (rti_remote->final_port_UDP != UINT16_MAX && clock_sync_enabled) { + lf_thread_create(&rti_remote->clock_thread, clock_synchronization_thread, NULL); + } + } +} - /** - * @brief A request for immediate stop to the federate - * - * @param fed: the deferate to stop - */ - void send_stop(federate_info_t * fed) { - // Reply with a stop granted to all federates - unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; - outgoing_buffer[0] = MSG_TYPE_STOP; - lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); +/** + * @brief A request for immediate stop to the federate + * + * @param fed: the deferate to stop + */ +void send_stop(federate_info_t* fed) { + // Reply with a stop granted to all federates + unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; + outgoing_buffer[0] = MSG_TYPE_STOP; + lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); + + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_STOP, fed->enclave.id, NULL); + } + write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, + "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP, fed->enclave.id, NULL); - } - write_to_socket_fail_on_error(&(fed->socket), MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, - "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); + LF_PRINT_LOG("RTI 
sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); +} - LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); - } - - void* lf_connect_to_transient_federates_thread(void* nothing) { - // This loop will continue to accept connections of transient federates, as - // soon as there is room, or enable hot swap - - while (!rti_remote->all_persistent_federates_exited) { - // Continue waiting for an incoming connection requests from transients - // to join, or for hot swap. - // Wait for an incoming connection request. - struct sockaddr client_fd; - uint32_t client_length = sizeof(client_fd); - // The following blocks until a federate connects. - int socket_id = -1; - while (1) { - if (!rti_remote->all_persistent_federates_exited) { - return NULL; - } - socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); - if (socket_id >= 0) { - // Got a socket - break; - } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { - lf_print_error_system_failure("RTI failed to accept the socket."); - } else { - // Try again - lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); - continue; - } - } +void* lf_connect_to_transient_federates_thread(void* nothing) { + // This loop will continue to accept connections of transient federates, as + // soon as there is room, or enable hot swap + while (!rti_remote->all_persistent_federates_exited) { + // Continue waiting for an incoming connection requests from transients + // to join, or for hot swap. + // Wait for an incoming connection request. + // struct sockaddr client_fd; + // uint32_t client_length = sizeof(client_fd); + // // // The following blocks until a federate connects. 
+ // int socket_id = -1; + // while (1) { + // if (!rti_remote->all_persistent_federates_exited) { + // return NULL; + // } + // socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); + // if (socket_id >= 0) { + // // Got a socket + // break; + // } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { + // lf_print_error_system_failure("RTI failed to accept the socket."); + // } else { + // // Try again + // lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); + // continue; + // } + // } + + int socket_id = accept_socket(rti_remote->socket_descriptor_TCP, 1); + // lf_print(">>>>>>>>>>>>>>>>>>>>>>>>>> socket_id %d in 2105 \n", socket_id); // Wait for the first message from the federate when RTI -a option is on. #ifdef __RTI_AUTH__ - if (rti_remote->authentication_enabled) { - if (!authenticate_federate(&socket_id)) { - lf_print_warning("RTI failed to authenticate the incoming federate."); - // Close the socket. - shutdown(socket_id, SHUT_RDWR); - close(socket_id); - socket_id = -1; - // Ignore the federate that failed authentication. - i--; - continue; - } - } + if (rti_remote->authentication_enabled) { + if (!authenticate_federate(&socket_id)) { + lf_print_warning("RTI failed to authenticate the incoming federate."); + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); + socket_id = -1; + continue; + } + } #endif - // The first message from the federate should contain its ID and the federation ID. - // The function also detects if a hot swap request is initiated. - int32_t fed_id = receive_and_check_fed_id_message(&socket_id, (struct sockaddr_in*)&client_fd); + // The first message from the federate should contain its ID and the federation ID. + // The function also detects if a hot swap request is initiated. 
+ int32_t fed_id = receive_and_check_fed_id_message(&socket_id); - if (fed_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && - receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { - LF_MUTEX_LOCK(&rti_mutex); - if (hot_swap_in_progress) { - lf_print("RTI: Hot swap confirmed for federate %d.", fed_id); + // lf_print(">>>>>>>>>>>>>>>>>>>>>>>>>> socket_id %d in 2125 \n", socket_id); - // Then send STOP - federate_info_t* fed_old = GET_FED_INFO(fed_id); - hot_swap_federate->enclave.completed = fed_old->enclave.completed; + if (fed_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && + receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { + LF_MUTEX_LOCK(&rti_mutex); + if (hot_swap_in_progress) { + lf_print("RTI: Hot swap confirmed for federate %d.", fed_id); - LF_PRINT_LOG("RTI: Send MSG_TYPE_STOP to old federate %d.", fed_id); - send_stop(fed_old); - LF_MUTEX_UNLOCK(&rti_mutex); + // Then send STOP + federate_info_t* fed_old = GET_FED_INFO(fed_id); + hot_swap_federate->enclave.completed = fed_old->enclave.completed; - // Wait for the old federate to send MSG_TYPE_RESIGN - LF_PRINT_LOG("RTI: Waiting for old federate %d to send resign.", fed_id); - // FIXME: This is a busy wait! Need instead a lf_cond_wait on a condition variable. - while (!hot_swap_old_resigned) { - } + LF_PRINT_LOG("RTI: Send MSG_TYPE_STOP to old federate %d.", fed_id); + send_stop(fed_old); + LF_MUTEX_UNLOCK(&rti_mutex); - // The latest LTC is the tag at which the old federate resigned. This is useful - // for computing the effective_start_time of the new joining federate. - hot_swap_federate->enclave.completed = fed_old->enclave.completed; + // Wait for the old federate to send MSG_TYPE_RESIGN + LF_PRINT_LOG("RTI: Waiting for old federate %d to send resign.", fed_id); + // FIXME: This is a busy wait! Need instead a lf_cond_wait on a condition variable. 
+ while (!hot_swap_old_resigned) { + } - // Create a thread to communicate with the federate. - // This has to be done after clock synchronization is finished - // or that thread may end up attempting to handle incoming clock - // synchronization messages. - lf_thread_create(&(hot_swap_federate->thread_id), federate_info_thread_TCP, hot_swap_federate); + // The latest LTC is the tag at which the old federate resigned. This is useful + // for computing the effective_start_time of the new joining federate. + hot_swap_federate->enclave.completed = fed_old->enclave.completed; - // Redirect the federate in rti_remote - rti_remote->base.scheduling_nodes[fed_id] = (scheduling_node_t*)hot_swap_federate; + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. + lf_thread_create(&(hot_swap_federate->thread_id), federate_info_thread_TCP, hot_swap_federate); - // Free the old federate memory and reset the Hot wap indicators - // FIXME: Is this enough to free the memory allocated to the federate? - free(fed_old); - lf_mutex_lock(&rti_mutex); - hot_swap_in_progress = false; - lf_mutex_unlock(&rti_mutex); + // Redirect the federate in rti_remote + rti_remote->base.scheduling_nodes[fed_id] = (scheduling_node_t*)hot_swap_federate; - lf_print("RTI: Hot swap succeeded for federate %d.", fed_id); - } else { - lf_mutex_unlock(&rti_mutex); - - // Create a thread to communicate with the federate. - // This has to be done after clock synchronization is finished - // or that thread may end up attempting to handle incoming clock - // synchronization messages. 
- federate_info_t* fed = GET_FED_INFO(fed_id); - lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); - lf_print("RTI: Transient federate %d joined.", fed_id); - } - rti_remote->number_of_connected_transient_federates++; - } else { - // If a hot swap was initialed, but the connection information or/and clock - // synchronization fail, then reset hot_swap_in_profress, and free the memory - // allocated for hot_swap_federate - if (hot_swap_in_progress) { - lf_print("RTI: Hot swap canceled for federate %d.", fed_id); - lf_mutex_lock(&rti_mutex); - hot_swap_in_progress = false; - lf_mutex_unlock(&rti_mutex); - - // FIXME: Is this enough to free the memory of a federate_info_t data structure? - free(hot_swap_federate); - } - } + // Free the old federate memory and reset the Hot wap indicators + // FIXME: Is this enough to free the memory allocated to the federate? + free(fed_old); + lf_mutex_lock(&rti_mutex); + hot_swap_in_progress = false; + lf_mutex_unlock(&rti_mutex); + + lf_print("RTI: Hot swap succeeded for federate %d.", fed_id); + } else { + lf_mutex_unlock(&rti_mutex); + + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages. 
+ federate_info_t* fed = GET_FED_INFO(fed_id); + lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + lf_print("RTI: Transient federate %d joined.", fed_id); } - return NULL; - } + rti_remote->number_of_connected_transient_federates++; + } else { + // If a hot swap was initialed, but the connection information or/and clock + // synchronization fail, then reset hot_swap_in_profress, and free the memory + // allocated for hot_swap_federate + if (hot_swap_in_progress) { + lf_print("RTI: Hot swap canceled for federate %d.", fed_id); + lf_mutex_lock(&rti_mutex); + hot_swap_in_progress = false; + lf_mutex_unlock(&rti_mutex); - /** - * @brief Thread that manages the delayed grants using a priprity queue. - * - * This thread is responsible for managing the priority queue of delayed grants to be issued. - * It waits until the current time matches the highest priority tag time in the queue. - * If reached, it notifies the grant immediately. If, however, the current time has not yet - * reached the highest priority tag and the queue has been updated (either by inserting or - * canceling an entry), the thread stops waiting and restarts the process again. - */ - static void* lf_delayed_grants_thread(void* nothing) { - initialize_lf_thread_id(); - // Hold the mutex when not waiting. - LF_MUTEX_LOCK(&rti_mutex); - while (!rti_remote->all_federates_exited) { - if (pqueue_delayed_grants_size(rti_remote->delayed_grants) > 0) { - // Do not pop, but rather peek. - pqueue_delayed_grant_element_t* next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); - instant_t next_time = next->base.tag.time; - // Wait for expiration, or a signal to stop or terminate. - int ret = lf_clock_cond_timedwait(&updated_delayed_grants, next_time); - if (ret == LF_TIMEOUT) { - // Time reached to send the grant. - // However, the grant may have been canceled while we were waiting. 
- pqueue_delayed_grant_element_t* new_next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); - if (next == new_next) { - pqueue_delayed_grants_pop(rti_remote->delayed_grants); - federate_info_t* fed = GET_FED_INFO(next->fed_id); - if (next->is_provisional) { - notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); - } else { - notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); - } - free(next); - } - } else if (ret != 0) { - // An error occurred. - lf_print_error_and_exit("lf_delayed_grants_thread: lf_clock_cond_timedwait failed with code %d.", ret); - } - } else if (pqueue_delayed_grants_size(rti_remote->delayed_grants) == 0) { - // Wait for something to appear on the queue. - lf_cond_wait(&updated_delayed_grants); - } + // FIXME: Is this enough to free the memory of a federate_info_t data structure? + free(hot_swap_federate); } - // Free any delayed grants that are still on the queue. - pqueue_delayed_grants_free(rti_remote->delayed_grants); - LF_MUTEX_UNLOCK(&rti_mutex); - return NULL; } - /** - * This thread is responsible for managing the priority queue of delayed grants to be issued. - * It waits until the current time matches the highest priority tag time in the queue. - * If reached, it notifies the grant immediately. If, however, the current time has not yet - * reached the highest priority tag and the queue has been updated (either by inserting or - * canceling an entry), the thread stops waiting and restarts the process again. - */ - void* lf_delayed_grants_thread(void* nothing) { - initialize_lf_thread_id(); - // Hold the mutex only when accessing rti_remote->delayed_grants pqueue - while (!rti_remote->all_federates_exited) { - if (pqueue_delayed_grants_size(rti_remote->delayed_grants) > 0) { - // Do not pop, but rather peek. 
- LF_MUTEX_LOCK(&rti_mutex); - pqueue_delayed_grant_element_t* next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); - instant_t next_time = next->base.tag.time; - LF_MUTEX_UNLOCK(&rti_mutex); - // Wait for expiration, or a signal to stop or terminate. - if (lf_clock_cond_timedwait(&updated_delayed_grants, next_time)) { - // Time reached to send the grant. - // However, the grant may have been canceled while we were waiting. - LF_MUTEX_LOCK(&rti_mutex); - pqueue_delayed_grant_element_t* new_next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); - if (next == new_next) { - pqueue_delayed_grants_pop(rti_remote->delayed_grants); - federate_info_t* fed = GET_FED_INFO(next->fed_id); - if (next->is_provisional) { - notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); - // FIXME: Send port absent notification to all federates downstream of absent federates. - } else { - notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); - } - free(next); - } + } + return NULL; +} + +/** + * @brief Thread that manages the delayed grants using a priprity queue. + * + * This thread is responsible for managing the priority queue of delayed grants to be issued. + * It waits until the current time matches the highest priority tag time in the queue. + * If reached, it notifies the grant immediately. If, however, the current time has not yet + * reached the highest priority tag and the queue has been updated (either by inserting or + * canceling an entry), the thread stops waiting and restarts the process again. + */ +static void* lf_delayed_grants_thread(void* nothing) { + initialize_lf_thread_id(); + // Hold the mutex when not waiting. + LF_MUTEX_LOCK(&rti_mutex); + while (!rti_remote->all_federates_exited) { + if (pqueue_delayed_grants_size(rti_remote->delayed_grants) > 0) { + // Do not pop, but rather peek. 
+ pqueue_delayed_grant_element_t* next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); + instant_t next_time = next->base.tag.time; + // Wait for expiration, or a signal to stop or terminate. + int ret = lf_clock_cond_timedwait(&updated_delayed_grants, next_time); + if (ret == LF_TIMEOUT) { + // Time reached to send the grant. + // However, the grant may have been canceled while we were waiting. + pqueue_delayed_grant_element_t* new_next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); + if (next == new_next) { + pqueue_delayed_grants_pop(rti_remote->delayed_grants); + federate_info_t* fed = GET_FED_INFO(next->fed_id); + if (next->is_provisional) { + notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } else { + notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); } - LF_MUTEX_UNLOCK(&rti_mutex); - } else if (pqueue_delayed_grants_size(rti_remote->delayed_grants) == 0) { - // Wait for something to appear on the queue. - lf_cond_wait(&updated_delayed_grants); + free(next); } + } else if (ret != 0) { + // An error occurred. + lf_print_error_and_exit("lf_delayed_grants_thread: lf_clock_cond_timedwait failed with code %d.", ret); } - // Free any delayed grants that are still on the queue. - while (pqueue_delayed_grants_size(rti_remote->delayed_grants) > 0) { - pqueue_delayed_grant_element_t* next = pqueue_delayed_grants_pop(rti_remote->delayed_grants); - free(next); - } - return NULL; + } else if (pqueue_delayed_grants_size(rti_remote->delayed_grants) == 0) { + // Wait for something to appear on the queue. + lf_cond_wait(&updated_delayed_grants); } + } + // Free any delayed grants that are still on the queue. + pqueue_delayed_grants_free(rti_remote->delayed_grants); + LF_MUTEX_UNLOCK(&rti_mutex); + return NULL; +} - void* respond_to_erroneous_connections(void* nothing) { - initialize_lf_thread_id(); - while (true) { - // Wait for an incoming connection request. 
- // The following will block until either a federate attempts to connect - // or close(rti->socket_descriptor_TCP) is called. - int socket_id = accept_socket(rti_remote->socket_descriptor_TCP, -1); - if (socket_id < 0) { - return NULL; - } - if (rti_remote->all_federates_exited) { - return NULL; - } - - lf_print_error("RTI received an unexpected connection request. Federation is running."); - unsigned char response[2]; - response[0] = MSG_TYPE_REJECT; - response[1] = FEDERATION_ID_DOES_NOT_MATCH; - // Ignore errors on this response. - if (write_to_socket(socket_id, 2, response)) { - lf_print_warning("RTI failed to write FEDERATION_ID_DOES_NOT_MATCH to erroneous incoming connection."); - } - // Close the socket. - shutdown(socket_id, SHUT_RDWR); - close(socket_id); - } +void* respond_to_erroneous_connections(void* nothing) { + initialize_lf_thread_id(); + while (true) { + // Wait for an incoming connection request. + // The following will block until either a federate attempts to connect + // or close(rti->socket_descriptor_TCP) is called. + int socket_id = accept_socket(rti_remote->socket_descriptor_TCP, -1); + if (socket_id < 0) { return NULL; } - - void initialize_federate(federate_info_t * fed, uint16_t id) { - initialize_scheduling_node(&(fed->enclave), id); - fed->requested_stop = false; - fed->socket = -1; // No socket. 
- fed->clock_synchronization_enabled = true; - fed->in_transit_message_tags = pqueue_tag_init(10); - strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); - fed->server_ip_addr.s_addr = 0; - fed->server_port = -1; - fed->has_upstream_transient_federates = false; - fed->is_transient = true; - fed->effective_start_tag = NEVER_TAG; + if (rti_remote->all_federates_exited) { + return NULL; } - void reset_transient_federate(federate_info_t * fed) { - // Reset all the timing information from the previous run - fed->enclave.completed = NEVER_TAG; - fed->enclave.last_granted = NEVER_TAG; - fed->enclave.last_provisionally_granted = NEVER_TAG; - fed->enclave.next_event = NEVER_TAG; - // Reset of the federate-related attributes - fed->socket = -1; // No socket. - fed->clock_synchronization_enabled = true; - // FIXME: The following two lines can be improved? - pqueue_tag_free(fed->in_transit_message_tags); - fed->in_transit_message_tags = pqueue_tag_init(10); - strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); - fed->server_ip_addr.s_addr = 0; - fed->server_port = -1; - fed->requested_stop = false; - fed->effective_start_tag = NEVER_TAG; - // invalidate_all_min_delays(); + lf_print_error("RTI received an unexpected connection request. Federation is running."); + unsigned char response[2]; + response[0] = MSG_TYPE_REJECT; + response[1] = FEDERATION_ID_DOES_NOT_MATCH; + // Ignore errors on this response. + if (write_to_socket(socket_id, 2, response)) { + lf_print_warning("RTI failed to write FEDERATION_ID_DOES_NOT_MATCH to erroneous incoming connection."); } + // Close the socket. 
+ shutdown(socket_id, SHUT_RDWR); + close(socket_id); + } + return NULL; +} - int32_t start_rti_server(uint16_t port) { - _lf_initialize_clock(); - // Create the TCP socket server - if (create_server(port, &rti_remote->socket_descriptor_TCP, &rti_remote->final_port_TCP, TCP, true)) { - lf_print_error_system_failure("RTI failed to create TCP server: %s.", strerror(errno)); - }; - lf_print("RTI: Listening for federates."); - // Create the UDP socket server - // Try to get the rti_remote->final_port_TCP + 1 port - if (rti_remote->clock_sync_global_status >= clock_sync_on) { - if (create_server(rti_remote->final_port_TCP + 1, &rti_remote->socket_descriptor_UDP, - &rti_remote->final_port_UDP, UDP, true)) { - lf_print_error_system_failure("RTI failed to create UDP server: %s.", strerror(errno)); - } - } - return rti_remote->socket_descriptor_TCP; - } +void initialize_federate(federate_info_t* fed, uint16_t id) { + initialize_scheduling_node(&(fed->enclave), id); + fed->requested_stop = false; + fed->socket = -1; // No socket. + fed->clock_synchronization_enabled = true; + fed->in_transit_message_tags = pqueue_tag_init(10); + strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); + fed->server_ip_addr.s_addr = 0; + fed->server_port = -1; + fed->has_upstream_transient_federates = false; + fed->is_transient = true; + fed->effective_start_tag = NEVER_TAG; +} - /** - * Iterate over the federates and sets 'has_upstream_transient_federates'. - * Once done, check that no transient federate has an upstream transient federate. - * and compute the number of persistent federates that do have upstream transients, - * which is the maximun number of delayed grants that can be pending at the same time. - * This is useful for initialyzing the queue of delayed grants. 
- - * @return -1, if there is more than one level of transiency, else, the number of - * persistents that have an upstream transient - */ - static int set_has_upstream_transient_federates_parameter_and_check() { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - for (int j = 0; j < fed->enclave.num_upstream; j++) { - federate_info_t* upstream_fed = GET_FED_INFO(fed->enclave.upstream[j]); - if (upstream_fed->is_transient) { - fed->has_upstream_transient_federates = true; - break; - } - } - } +void reset_transient_federate(federate_info_t* fed) { + // Reset all the timing information from the previous run + fed->enclave.completed = NEVER_TAG; + fed->enclave.last_granted = NEVER_TAG; + fed->enclave.last_provisionally_granted = NEVER_TAG; + fed->enclave.next_event = NEVER_TAG; + // Reset of the federate-related attributes + fed->socket = -1; // No socket. + fed->clock_synchronization_enabled = true; + // FIXME: The following two lines can be improved? + pqueue_tag_free(fed->in_transit_message_tags); + fed->in_transit_message_tags = pqueue_tag_init(10); + strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); + fed->server_ip_addr.s_addr = 0; + fed->server_port = -1; + fed->requested_stop = false; + fed->effective_start_tag = NEVER_TAG; + // invalidate_all_min_delays(); +} - // Now check that no transient has an upstream transient - // FIXME: Do we really need this? Or should it be the job of the validator? - int max_number_of_delayed_grants = 0; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (fed->is_transient && fed->has_upstream_transient_federates) { - return -1; - } - if (!fed->is_transient && fed->has_upstream_transient_federates) { - max_number_of_delayed_grants++; - } - } - // Now check that no transient has an upstream transient - // FIXME: Do we really need this? Or should it be the job of the validator? 
- uint16_t max_number_of_delayed_grants = 0; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (fed->is_transient && fed->has_upstream_transient_federates) { - return -1; - } - if (!fed->is_transient && fed->has_upstream_transient_federates) { - max_number_of_delayed_grants++; - } +int32_t start_rti_server(uint16_t port) { + _lf_initialize_clock(); + // Create the TCP socket server + if (create_server(port, &rti_remote->socket_descriptor_TCP, &rti_remote->final_port_TCP, TCP, true)) { + lf_print_error_system_failure("RTI failed to create TCP server: %s.", strerror(errno)); + }; + lf_print("RTI: Listening for federates."); + // Create the UDP socket server + // Try to get the rti_remote->final_port_TCP + 1 port + if (rti_remote->clock_sync_global_status >= clock_sync_on) { + if (create_server(rti_remote->final_port_TCP + 1, &rti_remote->socket_descriptor_UDP, &rti_remote->final_port_UDP, + UDP, true)) { + lf_print_error_system_failure("RTI failed to create UDP server: %s.", strerror(errno)); + } + } + return rti_remote->socket_descriptor_TCP; +} + +/** + * Iterate over the federates and sets 'has_upstream_transient_federates'. + * Once done, check that no transient federate has an upstream transient federate. + * and compute the number of persistent federates that do have upstream transients, + * which is the maximun number of delayed grants that can be pending at the same time. + * This is useful for initialyzing the queue of delayed grants. 
+ + * @return -1, if there is more than one level of transiency, else, the number of + * persistents that have an upstream transient + */ +static int set_has_upstream_transient_federates_parameter_and_check() { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + for (int j = 0; j < fed->enclave.num_upstream; j++) { + federate_info_t* upstream_fed = GET_FED_INFO(fed->enclave.upstream[j]); + if (upstream_fed->is_transient) { + fed->has_upstream_transient_federates = true; + break; } + } + } - return max_number_of_delayed_grants; + // Now check that no transient has an upstream transient + // FIXME: Do we really need this? Or should it be the job of the validator? + uint16_t max_number_of_delayed_grants = 0; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->is_transient && fed->has_upstream_transient_federates) { + return -1; + } + if (!fed->is_transient && fed->has_upstream_transient_federates) { + max_number_of_delayed_grants++; } - return max_number_of_delayed_grants; } + return max_number_of_delayed_grants; +} - void wait_for_federates(int socket_descriptor) { - // Wait for connections from persistent federates and create a thread for each. - lf_connect_to_persistent_federates(socket_descriptor); +void wait_for_federates(int socket_descriptor) { + // Wait for connections from persistent federates and create a thread for each. 
+ lf_connect_to_persistent_federates(socket_descriptor); - // Set the start_time in the RTI trace - if (rti_remote->base.tracing_enabled) { - lf_tracing_set_start_time(start_time); - } + // Set the start_time in the RTI trace + if (rti_remote->base.tracing_enabled) { + lf_tracing_set_start_time(start_time); + } - // Set has_upstream_transient_federates parameter in all federates and check - // that there is no more than one level of transiency - if (rti_remote->number_of_transient_federates > 0) { - int max_number_of_pending_grants = set_has_upstream_transient_federates_parameter_and_check(); - if (max_number_of_pending_grants == -1) { - lf_print_error_and_exit("RTI: Transient federates cannot have transient upstreams!"); - } - rti_remote->delayed_grants = pqueue_delayed_grants_init(max_number_of_pending_grants); + // Set has_upstream_transient_federates parameter in all federates and check + // that there is no more than one level of transiency + if (rti_remote->number_of_transient_federates > 0) { + int max_number_of_pending_grants = set_has_upstream_transient_federates_parameter_and_check(); + if (max_number_of_pending_grants == -1) { + lf_print_error_and_exit("RTI: Transient federates cannot have transient upstreams!"); } + rti_remote->delayed_grants = pqueue_delayed_grants_init(max_number_of_pending_grants); + } - // All persistent federates have connected. - lf_print("RTI: All expected persistent federates have connected. Starting execution."); - if (rti_remote->number_of_transient_federates > 0) { - lf_print("RTI: Transient Federates can join and leave the federation at anytime."); - } + // All persistent federates have connected. + lf_print("RTI: All expected persistent federates have connected. Starting execution."); + if (rti_remote->number_of_transient_federates > 0) { + lf_print("RTI: Transient Federates can join and leave the federation at anytime."); + } - // The socket server will only continue to accept connections from transient - // federates. 
- // In case some other federation's federates are trying to join the wrong - // federation, need to respond. Start a separate thread to do that. - lf_thread_t responder_thread; - lf_thread_t transient_thread; - lf_thread_t delayed_grants_thread; - - // If the federation does not include transient federates, then respond to - // erronous connections. Otherwise, continue to accept transients joining and - // respond to duplicate joing requests. - if (rti_remote->number_of_transient_federates == 0) { - lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); - } else if (rti_remote->number_of_transient_federates > 0) { - lf_thread_create(&transient_thread, lf_connect_to_transient_federates_thread, NULL); - lf_thread_create(&delayed_grants_thread, lf_delayed_grants_thread, NULL); + // The socket server will only continue to accept connections from transient + // federates. + // In case some other federation's federates are trying to join the wrong + // federation, need to respond. Start a separate thread to do that. + lf_thread_t responder_thread; + lf_thread_t transient_thread; + lf_thread_t delayed_grants_thread; + + // If the federation does not include transient federates, then respond to + // erronous connections. Otherwise, continue to accept transients joining and + // respond to duplicate joing requests. + if (rti_remote->number_of_transient_federates == 0) { + lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); + } else if (rti_remote->number_of_transient_federates > 0) { + lf_thread_create(&transient_thread, lf_connect_to_transient_federates_thread, NULL); + lf_thread_create(&delayed_grants_thread, lf_delayed_grants_thread, NULL); + } + + // Wait for persistent federate threads to exit. 
+ void* thread_exit_status; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (!fed->is_transient) { + lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); + lf_thread_join(fed->thread_id, &thread_exit_status); + pqueue_tag_free(fed->in_transit_message_tags); + lf_print("RTI: Persistent federate %d thread exited.", fed->enclave.id); } + } + + rti_remote->all_persistent_federates_exited = true; + rti_remote->phase = shutdown_phase; + lf_print("RTI: All persistent threads exited."); - // Wait for persistent federate threads to exit. - void* thread_exit_status; + // Wait for transient federate threads to exit, if any. + if (rti_remote->number_of_transient_federates > 0) { for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { federate_info_t* fed = GET_FED_INFO(i); - if (!fed->is_transient) { + if (fed->is_transient) { lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); lf_thread_join(fed->thread_id, &thread_exit_status); pqueue_tag_free(fed->in_transit_message_tags); - lf_print("RTI: Persistent federate %d thread exited.", fed->enclave.id); - } - } - - rti_remote->all_persistent_federates_exited = true; - rti_remote->phase = shutdown_phase; - lf_print("RTI: All persistent threads exited."); - - // Wait for transient federate threads to exit, if any. 
- if (rti_remote->number_of_transient_federates > 0) { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - if (fed->is_transient) { - lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); - lf_thread_join(fed->thread_id, &thread_exit_status); - pqueue_tag_free(fed->in_transit_message_tags); - lf_print("RTI: Transient federate %d thread exited.", fed->enclave.id); - } + lf_print("RTI: Transient federate %d thread exited.", fed->enclave.id); } } + } - rti_remote->all_federates_exited = true; + rti_remote->all_federates_exited = true; - // Shutdown and close the socket that is listening for incoming connections - // so that the accept() call in respond_to_erroneous_connections returns. - // That thread should then check rti->all_federates_exited and it should exit. - if (shutdown(socket_descriptor, SHUT_RDWR)) { - LF_PRINT_LOG("On shut down TCP socket, received reply: %s", strerror(errno)); - } - // NOTE: In all common TCP/IP stacks, there is a time period, - // typically between 30 and 120 seconds, called the TIME_WAIT period, - // before the port is released after this close. This is because - // the OS is preventing another program from accidentally receiving - // duplicated packets intended for this program. - close(socket_descriptor); - - if (rti_remote->socket_descriptor_UDP > 0) { - if (shutdown(rti_remote->socket_descriptor_UDP, SHUT_RDWR)) { - LF_PRINT_LOG("On shut down UDP socket, received reply: %s", strerror(errno)); - } - close(rti_remote->socket_descriptor_UDP); - } + // Shutdown and close the socket that is listening for incoming connections + // so that the accept() call in respond_to_erroneous_connections returns. + // That thread should then check rti->all_federates_exited and it should exit. 
+ if (shutdown(socket_descriptor, SHUT_RDWR)) { + LF_PRINT_LOG("On shut down TCP socket, received reply: %s", strerror(errno)); } + // NOTE: In all common TCP/IP stacks, there is a time period, + // typically between 30 and 120 seconds, called the TIME_WAIT period, + // before the port is released after this close. This is because + // the OS is preventing another program from accidentally receiving + // duplicated packets intended for this program. + close(socket_descriptor); + + if (rti_remote->socket_descriptor_UDP > 0) { + if (shutdown(rti_remote->socket_descriptor_UDP, SHUT_RDWR)) { + LF_PRINT_LOG("On shut down UDP socket, received reply: %s", strerror(errno)); + } + close(rti_remote->socket_descriptor_UDP); + } +} - void initialize_RTI(rti_remote_t * rti) { - rti_remote = rti; - - // Initialize thread synchronization primitives - LF_MUTEX_INIT(&rti_mutex); - LF_COND_INIT(&received_start_times, &rti_mutex); - LF_COND_INIT(&sent_start_time, &rti_mutex); - LF_COND_INIT(&updated_delayed_grants, &rti_mutex); - - initialize_rti_common(&rti_remote->base); - rti_remote->base.mutex = &rti_mutex; - - // federation_rti related initializations - rti_remote->max_start_time = 0LL; - rti_remote->num_feds_proposed_start = 0; - rti_remote->all_federates_exited = false; - rti_remote->federation_id = "Unidentified Federation"; - rti_remote->user_specified_port = 0; - rti_remote->final_port_TCP = 0; - rti_remote->socket_descriptor_TCP = -1; - rti_remote->final_port_UDP = UINT16_MAX; - rti_remote->socket_descriptor_UDP = -1; - rti_remote->clock_sync_global_status = clock_sync_init; - rti_remote->clock_sync_period_ns = MSEC(10); - rti_remote->clock_sync_exchanges_per_interval = 10; - rti_remote->authentication_enabled = false; - rti_remote->base.tracing_enabled = false; - rti_remote->stop_in_progress = false; - rti_remote->number_of_transient_federates = 0; - rti_remote->phase = startup_phase; - } - - void free_scheduling_nodes(scheduling_node_t * *scheduling_nodes, uint16_t 
number_of_scheduling_nodes) { - for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) { - // FIXME: Gives error freeing memory not allocated!!!! - scheduling_node_t* node = scheduling_nodes[i]; - if (node->upstream != NULL) - free(node->upstream); - if (node->downstream != NULL) - free(node->downstream); - } - free(scheduling_nodes); +void initialize_RTI(rti_remote_t* rti) { + rti_remote = rti; + + // Initialize thread synchronization primitives + LF_MUTEX_INIT(&rti_mutex); + LF_COND_INIT(&received_start_times, &rti_mutex); + LF_COND_INIT(&sent_start_time, &rti_mutex); + LF_COND_INIT(&updated_delayed_grants, &rti_mutex); + + initialize_rti_common(&rti_remote->base); + rti_remote->base.mutex = &rti_mutex; + + // federation_rti related initializations + rti_remote->max_start_time = 0LL; + rti_remote->num_feds_proposed_start = 0; + rti_remote->all_federates_exited = false; + rti_remote->federation_id = "Unidentified Federation"; + rti_remote->user_specified_port = 0; + rti_remote->final_port_TCP = 0; + rti_remote->socket_descriptor_TCP = -1; + rti_remote->final_port_UDP = UINT16_MAX; + rti_remote->socket_descriptor_UDP = -1; + rti_remote->clock_sync_global_status = clock_sync_init; + rti_remote->clock_sync_period_ns = MSEC(10); + rti_remote->clock_sync_exchanges_per_interval = 10; + rti_remote->authentication_enabled = false; + rti_remote->base.tracing_enabled = false; + rti_remote->stop_in_progress = false; + rti_remote->number_of_transient_federates = 0; + rti_remote->phase = startup_phase; +} + +// The RTI includes clock.c, which requires the following functions that are defined +// in clock-sync.c. But clock-sync.c is not included in the standalone RTI. +// Provide empty implementations of these functions. 
+void clock_sync_add_offset(instant_t* t) { (void)t; } +void clock_sync_subtract_offset(instant_t* t) { (void)t; } + +void free_scheduling_nodes(scheduling_node_t** scheduling_nodes, uint16_t number_of_scheduling_nodes) { + for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) { + // FIXME: Gives error freeing memory not allocated!!!! + scheduling_node_t* node = scheduling_nodes[i]; + if (node->upstream != NULL) + free(node->upstream); + if (node->downstream != NULL) + free(node->downstream); } + free(scheduling_nodes); +} #endif // STANDALONE_RTI diff --git a/core/federated/federate.c b/core/federated/federate.c index aefebf5fe..73c0c416b 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -88,8 +88,8 @@ federate_instance_t _fed = {.socket_TCP_RTI = -1, .has_upstream = false, .has_downstream = false, .received_stop_request_from_rti = false, - .last_sent_LTC = (tag_t){.time = NEVER, .microstep = 0u}, - .last_sent_NET = (tag_t){.time = NEVER, .microstep = 0u}, + .last_sent_LTC = {.time = NEVER, .microstep = 0u}, + .last_sent_NET = {.time = NEVER, .microstep = 0u}, .min_delay_from_physical_action_to_federate_output = NEVER, .is_transient = false}; @@ -2040,7 +2040,7 @@ void lf_connect_to_rti(const char* hostname, int port) { } else if (response == MSG_TYPE_UPSTREAM_DISCONNECTED) { handle_upstream_disconnected_message(); } else { - lf_print_warning("RTI on port %d gave unexpected response %u. Will try again", uport, response); + lf_print_warning("RTI on port %d gave unexpected response %u. 
Will try again", port, response); continue; } } diff --git a/trace/api/types/trace_types.h b/trace/api/types/trace_types.h index 6d8758fa4..3be4d92b1 100644 --- a/trace/api/types/trace_types.h +++ b/trace/api/types/trace_types.h @@ -72,6 +72,8 @@ typedef enum { receive_ADR_AD, receive_ADR_QR, receive_UNIDENTIFIED, + send_STOP, + receive_STOP, NUM_EVENT_TYPES } trace_event_t; @@ -135,6 +137,8 @@ static const char* trace_event_names[] = { "Receiving ADR_AD", "Receiving ADR_QR", "Receiving UNIDENTIFIED", + "Sending STOP", + "Receiving STOP", }; static inline void _suppress_unused_variable_warning_for_static_variable() { (void)trace_event_names; } From b11ce380c4b1efdfaba5a4bef382f05a95b0d65f Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Fri, 3 Jan 2025 15:32:55 +0100 Subject: [PATCH 136/148] Fix segmentation fault and use accept_ocket() when connecting to trabsient federates --- core/federated/RTI/rti_remote.c | 39 ++++++--------------------------- core/federated/federate.c | 6 ++++- 2 files changed, 12 insertions(+), 33 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 13c0cae0c..4731a6f65 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -158,8 +158,10 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque return NULL; for (int i = 1; i <= q->size; i++) { dge = (pqueue_delayed_grant_element_t*)q->d[i]; - if (dge->fed_id == fed_id) { - return dge; + if (dge) { + if (dge->fed_id == fed_id) { + return dge; + } } } return NULL; @@ -168,7 +170,6 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque /** * @brief Insert the delayed grant into the delayed_grants queue and notify. * - * * This function assumes the caller holds the rti_mutex. * @param fed The federate. * @param tag The tag to grant. 
@@ -2078,35 +2079,11 @@ void send_stop(federate_info_t* fed) { } void* lf_connect_to_transient_federates_thread(void* nothing) { - // This loop will continue to accept connections of transient federates, as - // soon as there is room, or enable hot swap + // This loop will continue to accept connections of transient federates, as soon as there is room, or enable hot swap while (!rti_remote->all_persistent_federates_exited) { - // Continue waiting for an incoming connection requests from transients - // to join, or for hot swap. + // Continue waiting for an incoming connection requests from transients to join, or for hot swap. // Wait for an incoming connection request. - // struct sockaddr client_fd; - // uint32_t client_length = sizeof(client_fd); - // // // The following blocks until a federate connects. - // int socket_id = -1; - // while (1) { - // if (!rti_remote->all_persistent_federates_exited) { - // return NULL; - // } - // socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); - // if (socket_id >= 0) { - // // Got a socket - // break; - // } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { - // lf_print_error_system_failure("RTI failed to accept the socket."); - // } else { - // // Try again - // lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); - // continue; - // } - // } - - int socket_id = accept_socket(rti_remote->socket_descriptor_TCP, 1); - // lf_print(">>>>>>>>>>>>>>>>>>>>>>>>>> socket_id %d in 2105 \n", socket_id); + int socket_id = accept_socket(rti_remote->socket_descriptor_TCP, -1); // Wait for the first message from the federate when RTI -a option is on. #ifdef __RTI_AUTH__ @@ -2126,8 +2103,6 @@ void* lf_connect_to_transient_federates_thread(void* nothing) { // The function also detects if a hot swap request is initiated. 
int32_t fed_id = receive_and_check_fed_id_message(&socket_id); - // lf_print(">>>>>>>>>>>>>>>>>>>>>>>>>> socket_id %d in 2125 \n", socket_id); - if (fed_id >= 0 && receive_connection_information(&socket_id, (uint16_t)fed_id) && receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { LF_MUTEX_LOCK(&rti_mutex); diff --git a/core/federated/federate.c b/core/federated/federate.c index 73c0c416b..aad3943fd 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2164,7 +2164,11 @@ void* lf_handle_p2p_connections_from_federates(void* env_arg) { // Extract the ID of the sending federate. uint16_t remote_fed_id = extract_uint16((unsigned char*)&(buffer[1])); bool remote_fed_is_transient = buffer[1 + sizeof(uint16_t)]; - LF_PRINT_DEBUG("Received sending federate ID %d.", remote_fed_id); + if (remote_fed_is_transient) { + LF_PRINT_DEBUG("Received sending federate ID %d, which is transient.", remote_fed_id); + } else { + LF_PRINT_DEBUG("Received sending federate ID %d, which is persistent.", remote_fed_id); + } // Trace the event when tracing is enabled tracepoint_federate_to_federate(receive_FED_ID, _lf_my_fed_id, remote_fed_id, NULL); From bbba28549d38f7d206fd8f2d77cedfe32a264dbe Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Sat, 4 Jan 2025 11:54:39 +0100 Subject: [PATCH 137/148] Fix lingua-franca-ref.txt --- lingua-franca-ref.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingua-franca-ref.txt b/lingua-franca-ref.txt index 23343040e..52199a147 100644 --- a/lingua-franca-ref.txt +++ b/lingua-franca-ref.txt @@ -1 +1 @@ -transient-fed-cycles +transient-fed From 95e67b6290f76e2a9eec62f33434ab2d2a5ef03e Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Sat, 4 Jan 2025 17:13:57 +0100 Subject: [PATCH 138/148] Fix the waiting time to start in decentralized coordination --- core/threaded/reactor_threaded.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git 
a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index ef6634b0b..540b7344e 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -582,31 +582,28 @@ void _lf_initialize_start_tag(environment_t* env) { } _lf_initialize_timers(env); - env->current_tag = effective_start_tag; #if defined FEDERATED_DECENTRALIZED // If we have a non-zero STA offset, then we need to allow messages to arrive // at the start time. To avoid spurious STP violations, we temporarily // set the current time back by the STA offset. - env->current_tag.time -= lf_fed_STA_offset; - LF_PRINT_LOG("Waiting for start time " PRINTF_TIME " plus STA " PRINTF_TIME ".", start_time, lf_fed_STA_offset); + env->current_tag.time = lf_time_subtract(env->current_tag.time, lf_fed_STA_offset); #else // For other than federated decentralized execution, there is no lf_fed_STA_offset variable defined. // To use uniform code below, we define it here as a local variable. instant_t lf_fed_STA_offset = 0; #endif - LF_PRINT_LOG("Waiting for start time " PRINTF_TIME ".", start_time); - - // Wait until the start time. This is required for federates because the startup procedure - // in lf_synchronize_with_other_federates() can decide on a new start_time that is - // larger than the current physical time. - // This wait_until() is deliberately called after most precursor operations - // for tag (0,0) are performed (e.g., injecting startup reactions, etc.). - // This has two benefits: First, the startup overheads will reduce - // the required waiting time. Second, this call releases the mutex lock and allows - // other threads (specifically, federate threads that handle incoming p2p messages - // from other federates) to hold the lock and possibly raise a tag barrier. 
- while (!wait_until(effective_start_tag.time + lf_fed_STA_offset, &env->event_q_changed)) { + LF_PRINT_LOG("Waiting for start time " PRINTF_TIME ".", effective_start_tag.time); + + // Wait until the effective start time. This is required for federates because the startup procedure + // in lf_synchronize_with_other_federates() can decide on a new start_time, or the effective start time if it is a + // transient federate, that is larger than the current physical time. + // This wait_until() is deliberately called after most precursor operations for tag (0,0), or effective_start_tag,q + // are performed (e.g., injecting startup reactions, etc.). This has two benefits: First, the startup overheads will + // reduce the required waiting time. Second, this call releases the mutex lock and allows other threads (specifically, + // federate threads that handle incoming p2p messages from other federates) to hold the lock and possibly raise a tag + // barrier. + while (!wait_until(effective_start_tag.time, &env->event_q_changed)) { }; LF_PRINT_DEBUG("Done waiting for effective start time + STA offset " PRINTF_TIME ".", effective_start_tag.time + lf_fed_STA_offset); From 7727215ce60a0759ceaf55c1cb0cda6ed66fd78c Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Sat, 4 Jan 2025 20:47:06 +0100 Subject: [PATCH 139/148] Small fix to pqueue_delayed_grants_find_by_fed_id() --- core/federated/RTI/rti_remote.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 4731a6f65..4d43f13ea 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -156,7 +156,7 @@ static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pque pqueue_t* _q = (pqueue_t*)q; if (!q || q->size == 1) return NULL; - for (int i = 1; i <= q->size; i++) { + for (int i = 1; i < q->size; i++) { dge = (pqueue_delayed_grant_element_t*)q->d[i]; if (dge) { if (dge->fed_id == fed_id) { From 
fd603964d546ccb03a07053cb50080b73bbfa664 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Sat, 4 Jan 2025 21:57:52 +0100 Subject: [PATCH 140/148] Cleanup --- core/federated/RTI/rti_remote.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index 9bf6c5ca8..1df521aa5 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -396,14 +396,6 @@ void* federate_info_thread_TCP(void* fed); */ void send_reject(int* socket_id, rejection_code_t error_code); -/** - * Wait for one incoming connection request from each (persistent) federate, - * and upon receiving it, create a thread to communicate with that federate. - * Return when all persistent federates have connected. - * @param socket_descriptor The socket on which to accept connections. - */ -void* lf_connect_to_persistent_transient_federates_thread(int socket_descriptor); - /** * Thread to wait for incoming connection request from transient federates. * Upon receiving the connection request, check if a hot swap should start or From 9be359694b19e51ce374dd7b6fa95a84be48ef47 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Sat, 4 Jan 2025 22:51:38 +0100 Subject: [PATCH 141/148] Support backward compatibility when sending the IDS --- core/federated/RTI/rti_remote.c | 16 +++++++---- core/federated/federate.c | 23 +++++++++++----- include/core/federated/network/net_common.h | 30 +++++++++++++++------ 3 files changed, 49 insertions(+), 20 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 4d43f13ea..d7b70ceb8 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1517,7 +1517,7 @@ void send_reject(int* socket_id, rejection_code_t error_code) { */ static int32_t receive_and_check_fed_id_message(int* socket_id) { // Buffer for message ID, federate ID, type (persistent or transient), and federation ID length. 
- size_t length = 1 + sizeof(uint16_t) + 1 + 1; // Message ID, federate ID, length of fedration ID, type. + size_t length = 1 + sizeof(uint16_t) + 1; // Message ID, federate ID and length of fedration ID. unsigned char buffer[length]; // Read bytes from the socket. We need 4 bytes. @@ -1530,7 +1530,7 @@ static int32_t receive_and_check_fed_id_message(int* socket_id) { bool is_transient = false; // First byte received is the message type. - if (buffer[0] != MSG_TYPE_FED_IDS) { + if (buffer[0] != MSG_TYPE_FED_IDS && buffer[0] != MSG_TYPE_TRANSIENT_FED_IDS) { if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); } @@ -1554,15 +1554,21 @@ static int32_t receive_and_check_fed_id_message(int* socket_id) { } else { // Received federate ID. fed_id = extract_uint16(buffer + 1); - is_transient = (buffer[sizeof(uint16_t) + 1] == 1) ? true : false; + // Read the federation ID length, which is one byte. + size_t federation_id_length = (size_t)buffer[sizeof(uint16_t) + 1]; + if (buffer[0] == MSG_TYPE_TRANSIENT_FED_IDS) { + char buf; + read_from_socket_close_on_error(socket_id, 1, &buf); + is_transient = (buf == 1) ? true : false; + } + if (is_transient) { LF_PRINT_LOG("RTI received federate ID: %d, which is transient.", fed_id); } else { LF_PRINT_LOG("RTI received federate ID: %d, which is persistent.", fed_id); } - // Read the federation ID. First read the length, which is one byte. - size_t federation_id_length = (size_t)buffer[sizeof(uint16_t) + 2]; + // Read the federation ID. char federation_id_received[federation_id_length + 1]; // One extra for null terminator. // Next read the actual federation ID. 
if (read_from_socket_close_on_error(socket_id, federation_id_length, (unsigned char*)federation_id_received)) { diff --git a/core/federated/federate.c b/core/federated/federate.c index aad3943fd..8c61affeb 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1960,7 +1960,7 @@ void lf_connect_to_rti(const char* hostname, int port) { while (!CHECK_TIMEOUT(start_connect, CONNECT_TIMEOUT) && !_lf_termination_executed) { // Have connected to an RTI, but not sure it's the right RTI. - // Send a MSG_TYPE_FED_IDS message and wait for a reply. + // Send a MSG_TYPE_FED_IDS or MSG_TYPE_TRANSIENT_FED_IDS message and wait for a reply. // Notify the RTI of the ID of this federate and its federation. #ifdef FEDERATED_AUTHENTICATED @@ -1977,26 +1977,35 @@ void lf_connect_to_rti(const char* hostname, int port) { LF_PRINT_LOG("Connected to an RTI. Sending federation ID for authentication."); #endif - // Send the message type first. unsigned char buffer[5]; - buffer[0] = MSG_TYPE_FED_IDS; + // Send the message type first. + if (_fed.is_transient) { + buffer[0] = MSG_TYPE_TRANSIENT_FED_IDS; + } else { + buffer[0] = MSG_TYPE_FED_IDS; + } + // Next send the federate ID. if (_lf_my_fed_id == UINT16_MAX) { lf_print_error_and_exit("Too many federates! More than %d.", UINT16_MAX - 1); } encode_uint16((uint16_t)_lf_my_fed_id, &buffer[1]); - // Next send the federate type (persistent or transient) - buffer[1 + sizeof(uint16_t)] = _fed.is_transient ? 1 : 0; // Next send the federation ID length. // The federation ID is limited to 255 bytes. 
size_t federation_id_length = strnlen(federation_metadata.federation_id, 255); - buffer[2 + sizeof(uint16_t)] = (unsigned char)(federation_id_length & 0xff); + buffer[1 + sizeof(uint16_t)] = (unsigned char)(federation_id_length & 0xff); // Trace the event when tracing is enabled tracepoint_federate_to_rti(send_FED_ID, _lf_my_fed_id, NULL); + unsigned char size = 1 + sizeof(uint16_t) + 1; + if (_fed.is_transient) { + // Next send the federate type (persistent or transient) + buffer[2 + sizeof(uint16_t)] = _fed.is_transient ? 1 : 0; + size++; + } // No need for a mutex here because no other threads are writing to this socket. - if (write_to_socket(_fed.socket_TCP_RTI, 3 + sizeof(uint16_t), buffer)) { + if (write_to_socket(_fed.socket_TCP_RTI, size, buffer)) { continue; // Try again, possibly on a new port. } diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index 5c341a90e..096ac58e1 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -237,18 +237,15 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define MSG_TYPE_UDP_PORT 254 -/** Byte identifying a message from a federate to an RTI containing - * the federation ID and the federate ID. The message contains, in - * this order: +/** Byte identifying a message from a (persistent) federate to an RTI containing + * the federate ID and the federation ID. The message contains, in this order: * * One byte equal to MSG_TYPE_FED_IDS. * * Two bytes (ushort) giving the federate ID. - * * One byte giving the type of the federate (1 if transient, 0 if persistent) * * One byte (uchar) giving the length N of the federation ID. * * N bytes containing the federation ID. - * Each federate needs to have a unique ID between 0 and - * NUMBER_OF_FEDERATES-1. - * Each federate, when starting up, should send this message - * to the RTI. This is its first message to the RTI. 
+ * Each federate needs to have a unique ID between 0 and NUMBER_OF_FEDERATES-1. + * Each federate, when starting up, should send either this message, or MSG_TYPE_TRANSIENT_FED_IDS + * to the RTI, as its first message to the RTI. * The RTI will respond with either MSG_TYPE_REJECT, MSG_TYPE_ACK, or MSG_TYPE_UDP_PORT. * If the federate is a C target LF program, the generated federate * code does this by calling lf_synchronize_with_other_federates(), @@ -256,6 +253,23 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define MSG_TYPE_FED_IDS 1 +/** Byte identifying a message from a transient federate to an RTI containing + * the federate ID and the federation ID. The message contains, in this order: + * * One byte equal to MSG_TYPE_TRANSIENT_FED_IDS. + * * Two bytes (ushort) giving the federate ID. + * * One byte (uchar) giving the length N of the federation ID. + * * One byte giving the type of the federate (1 if transient, 0 if persistent) + * * N bytes containing the federation ID. + * Each federate needs to have a unique ID between 0 and NUMBER_OF_FEDERATES-1. + * Each federate, when starting up, should send either this message, or MSG_TYPE_FED_IDS + * to the RTI, as its first message to the RTI. + * The RTI will respond with either MSG_TYPE_REJECT, MSG_TYPE_ACK, or MSG_TYPE_UDP_PORT. + * If the federate is a C target LF program, the generated federate + * code does this by calling lf_synchronize_with_other_federates(), + * passing to it its federate ID. + */ +#define MSG_TYPE_TRANSIENT_FED_IDS 103 + /////////// Messages used for authenticated federation. 
/////////////// /** * Byte identifying a message from a federate to an RTI containing From 8d9f15f901407b9bddf652339bba3c57fd6a1703 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Sat, 4 Jan 2025 23:29:06 +0100 Subject: [PATCH 142/148] Fix types when reading from and writing to a socket --- core/federated/RTI/rti_remote.c | 2 +- core/federated/federate.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index d7b70ceb8..865b7b01d 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1557,7 +1557,7 @@ static int32_t receive_and_check_fed_id_message(int* socket_id) { // Read the federation ID length, which is one byte. size_t federation_id_length = (size_t)buffer[sizeof(uint16_t) + 1]; if (buffer[0] == MSG_TYPE_TRANSIENT_FED_IDS) { - char buf; + unsigned char buf; read_from_socket_close_on_error(socket_id, 1, &buf); is_transient = (buf == 1) ? true : false; } diff --git a/core/federated/federate.c b/core/federated/federate.c index 8c61affeb..305d7c3b2 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1019,9 +1019,9 @@ static instant_t get_start_time_from_rti(instant_t my_physical_time) { // Send the timestamp marker first. send_time(MSG_TYPE_TIMESTAMP, my_physical_time); - // Read bytes from the socket. We need 21 (1 + 8 + 8 + 4) bytes. + // Read bytes from the socket. We need first 9 (1 + 8) bytes. // Buffer for message ID plus timestamp. 
- size_t buffer_length = MSG_TYPE_TIMESTAMP_START_LENGTH; + size_t buffer_length = MSG_TYPE_TIMESTAMP_LENGTH; unsigned char buffer[buffer_length]; while (true) { @@ -1998,7 +1998,7 @@ void lf_connect_to_rti(const char* hostname, int port) { // Trace the event when tracing is enabled tracepoint_federate_to_rti(send_FED_ID, _lf_my_fed_id, NULL); - unsigned char size = 1 + sizeof(uint16_t) + 1; + size_t size = 1 + sizeof(uint16_t) + 1; if (_fed.is_transient) { // Next send the federate type (persistent or transient) buffer[2 + sizeof(uint16_t)] = _fed.is_transient ? 1 : 0; From e106ea11b99ea46caa63acb6a9532cc450e60ddd Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Sun, 5 Jan 2025 01:30:15 +0100 Subject: [PATCH 143/148] Support backward compatibility when sending the timestamp (and tag) --- core/federated/RTI/rti_remote.c | 70 +++++++++++---------- core/federated/federate.c | 12 ++-- include/core/federated/network/net_common.h | 18 ++---- 3 files changed, 49 insertions(+), 51 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 865b7b01d..51a309798 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -267,22 +267,24 @@ static void send_upstream_disconnected_locked(federate_info_t* destination, fede } /** - * @brief Mark a federate as disconnected and inform downstream federates. - * @param e The enclave corresponding to the disconnected federate. + * @brief Mark a federate as disconnected and, if this is a transient, inform downstream federates. + * @param fed The disconnected federate. */ -static void notify_federate_disconnected(scheduling_node_t* e) { - e->state = NOT_CONNECTED; +static void notify_federate_disconnected(federate_info_t* fed) { + fed->enclave.state = NOT_CONNECTED; // Notify downstream federates. Need to hold the mutex lock to do this. 
- LF_MUTEX_LOCK(&rti_mutex); - for (int j = 0; j < e->num_downstream; j++) { - federate_info_t* downstream = GET_FED_INFO(e->downstream[j]); - // Ignore this enclave if it no longer connected. - if (downstream->enclave.state != NOT_CONNECTED) { - // Notify the downstream enclave. - send_upstream_disconnected_locked(downstream, GET_FED_INFO(e->id)); + if (fed->is_transient) { + LF_MUTEX_LOCK(&rti_mutex); + for (int j = 0; j < fed->enclave.num_downstream; j++) { + federate_info_t* downstream = GET_FED_INFO(fed->enclave.downstream[j]); + // Ignore this enclave if it no longer connected. + if (downstream->enclave.state != NOT_CONNECTED) { + // Notify the downstream enclave. + send_upstream_disconnected_locked(downstream, fed); + } } + LF_MUTEX_UNLOCK(&rti_mutex); } - LF_MUTEX_UNLOCK(&rti_mutex); } /** @@ -309,7 +311,8 @@ static void notify_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) // to fail. Consider a failure here a soft failure and update the federate's status. if (write_to_socket(((federate_info_t*)e)->socket, message_length, buffer)) { lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); - notify_federate_disconnected(e); + // Mark a federate as disconnected and inform if necessary + notify_federate_disconnected(GET_FED_INFO(e->id)); } else { e->last_granted = tag; LF_PRINT_LOG("RTI sent to federate %d the tag advance grant (TAG) " PRINTF_TAG ".", e->id, tag.time - start_time, @@ -368,7 +371,8 @@ void notify_provisional_tag_advance_grant_immediate(scheduling_node_t* e, tag_t // to fail. Consider a failure here a soft failure and update the federate's status. 
if (write_to_socket(((federate_info_t*)e)->socket, message_length, buffer)) { lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); - notify_federate_disconnected(e); + // Mark a federate as disconnected and inform if necessary + notify_federate_disconnected(GET_FED_INFO(e->id)); } else { e->last_provisionally_granted = tag; LF_PRINT_LOG("RTI sent to federate %d the Provisional Tag Advance Grant (PTAG) " PRINTF_TAG ".", e->id, @@ -972,17 +976,19 @@ static void send_start_tag_locked(federate_info_t* my_fed, instant_t federation_ // Send back to the federate the maximum time plus an offset on a TIMESTAMP_START // message. - // In the startup phase, federates will receive identical start_time and - // effective_start_tag - unsigned char start_time_buffer[MSG_TYPE_TIMESTAMP_START_LENGTH]; - start_time_buffer[0] = MSG_TYPE_TIMESTAMP_START; + // If it is a persistent federate, only the start_time is sent. If, however, it is a transient + // federate, the effective_start_tag is also sent. + size_t buffer_size = (my_fed->is_transient) ? 
MSG_TYPE_TIMESTAMP_TAG_LENGTH : MSG_TYPE_TIMESTAMP_LENGTH; + unsigned char start_time_buffer[buffer_size]; + start_time_buffer[0] = MSG_TYPE_TIMESTAMP; encode_int64(swap_bytes_if_big_endian_int64(federation_start_time), &start_time_buffer[1]); - encode_tag(&(start_time_buffer[1 + sizeof(instant_t)]), federate_start_tag); - + if (my_fed->is_transient) { + encode_tag(&(start_time_buffer[1 + sizeof(instant_t)]), federate_start_tag); + } if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(send_TIMESTAMP, my_fed->enclave.id, &federate_start_tag); } - if (write_to_socket(my_fed->socket, MSG_TYPE_TIMESTAMP_START_LENGTH, start_time_buffer)) { + if (write_to_socket(my_fed->socket, buffer_size, start_time_buffer)) { lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); } else { // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP_START @@ -1306,19 +1312,19 @@ void* clock_synchronization_thread(void* noargs) { */ static void handle_federate_failed(federate_info_t* my_fed) { // Nothing more to do. Close the socket and exit. - LF_MUTEX_LOCK(&rti_mutex); - if (rti_remote->base.tracing_enabled) { tracepoint_rti_from_federate(receive_FAILED, my_fed->enclave.id, NULL); } + // First, mark a federate as disconnected and inform if necessary + notify_federate_disconnected(my_fed); + + LF_MUTEX_LOCK(&rti_mutex); + // Set the flag telling the RTI to exit with an error code when it exits. _lf_federate_reports_error = true; lf_print_error("RTI: Federate %d reports an error and has exited.", my_fed->enclave.id); - notify_federate_disconnected(&my_fed->enclave); - my_fed->enclave.state = NOT_CONNECTED; - // Indicate that there will no further events from this federate. my_fed->enclave.next_event = FOREVER_TAG; @@ -1355,17 +1361,15 @@ static void handle_federate_failed(federate_info_t* my_fed) { */ static void handle_federate_resign(federate_info_t* my_fed) { // Nothing more to do. Close the socket and exit. 
- LF_MUTEX_LOCK(&rti_mutex); - if (rti_remote->base.tracing_enabled) { tracepoint_rti_from_federate(receive_RESIGN, my_fed->enclave.id, NULL); } + // First, mark a federate as disconnected and inform if necessary + notify_federate_disconnected(my_fed); lf_print("RTI: Federate %d has resigned.", my_fed->enclave.id); - my_fed->enclave.state = NOT_CONNECTED; - - notify_federate_disconnected(&my_fed->enclave); + LF_MUTEX_LOCK(&rti_mutex); // Indicate that there will no further events from this federate. my_fed->enclave.next_event = FOREVER_TAG; @@ -1386,7 +1390,6 @@ static void handle_federate_resign(federate_info_t* my_fed) { // // We can now safely close the socket. close(my_fed->socket); // from unistd.h - // notify_federate_disconnected(&my_fed->enclave); // Check downstream federates to see whether they should now be granted a TAG. // To handle cycles, need to create a boolean array to keep // track of which upstream federates have been visited. @@ -1413,8 +1416,7 @@ void* federate_info_thread_TCP(void* fed) { if (read_failed) { // Socket is closed lf_print_error("RTI: Socket to federate %d is closed. Exiting the thread.", my_fed->enclave.id); - my_fed->enclave.state = NOT_CONNECTED; - notify_federate_disconnected(&my_fed->enclave); + notify_federate_disconnected(my_fed); my_fed->socket = -1; // FIXME: We need better error handling here, but do not stop execution here. break; diff --git a/core/federated/federate.c b/core/federated/federate.c index 305d7c3b2..f0410a159 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1019,16 +1019,16 @@ static instant_t get_start_time_from_rti(instant_t my_physical_time) { // Send the timestamp marker first. send_time(MSG_TYPE_TIMESTAMP, my_physical_time); - // Read bytes from the socket. We need first 9 (1 + 8) bytes. + // Read bytes from the socket. We need either 9 butes or 21, depending on the federate type // Buffer for message ID plus timestamp. 
- size_t buffer_length = MSG_TYPE_TIMESTAMP_LENGTH; + size_t buffer_length = (_fed.is_transient) ? MSG_TYPE_TIMESTAMP_TAG_LENGTH : MSG_TYPE_TIMESTAMP_LENGTH; unsigned char buffer[buffer_length]; while (true) { read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, 1, buffer, NULL, "Failed to read MSG_TYPE_TIMESTAMP_START message from RTI."); // First byte received is the message ID. - if (buffer[0] != MSG_TYPE_TIMESTAMP_START) { + if (buffer[0] != MSG_TYPE_TIMESTAMP) { if (buffer[0] == MSG_TYPE_FAILED) { lf_print_error_and_exit("RTI has failed."); } else if (buffer[0] == MSG_TYPE_UPSTREAM_CONNECTED) { @@ -1051,7 +1051,11 @@ static instant_t get_start_time_from_rti(instant_t my_physical_time) { } instant_t timestamp = extract_int64(&(buffer[1])); - effective_start_tag = extract_tag(&(buffer[9])); + if (_fed.is_transient) { + effective_start_tag = extract_tag(&(buffer[9])); + } else { + effective_start_tag = (tag_t){.time = timestamp, .microstep = 0u}; + } // Trace the event when tracing is enabled. // Note that we report in the trace the effective_start_tag. diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index 096ac58e1..767557fe2 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -322,11 +322,13 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /** * Byte identifying a timestamp message, which is 64 bits long. * Each federate sends its starting physical time as a message of this - * type, and the RTI broadcasts to all the federates the starting logical + * type, and the RTI broadcasts to all persistent federates the starting * time as a message of this type. - s*/ + * In case of a joining federate, the RTI will also send the effective start tag. 
+ */ #define MSG_TYPE_TIMESTAMP 2 -#define MSG_TYPE_TIMESTAMP_LENGTH (1 + sizeof(int64_t)) +#define MSG_TYPE_TIMESTAMP_LENGTH (1 + sizeof(instant_t)) +#define MSG_TYPE_TIMESTAMP_TAG_LENGTH (1 + sizeof(instant_t) + sizeof(tag_t)) /** Byte identifying a message to forward to another federate. * The next two bytes will be the ID of the destination port. @@ -649,16 +651,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define MSG_TYPE_UPSTREAM_DISCONNECTED 27 #define MSG_TYPE_UPSTREAM_DISCONNECTED_LENGTH (1 + sizeof(uint16_t)) -/** - * As an answer to MSG_TYPE_TIMESTAMP, the RTI broadcasts to all persistent - * federates, or sends to newly joining transient federate, a message of - * MSG_TYPE_STIMESTAMP_START. It includes the starting time of the federation, - * together with the effective starting logical tag. The latter is useful for - * transient federates. - */ -#define MSG_TYPE_TIMESTAMP_START 28 -#define MSG_TYPE_TIMESTAMP_START_LENGTH (1 + sizeof(instant_t) + sizeof(instant_t) + sizeof(microstep_t)) - /** * Byte sent by the RTI ordering the federate to stop. Upon receiving the message, * the federate will call lf_stop(), which will make it resign at its current_tag From ea2c3ab7b252e0a2f46561a3fd0a627a7d26bb29 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Mon, 13 Jan 2025 13:49:35 +0100 Subject: [PATCH 144/148] Fix tracing the start time in the RTI --- core/federated/RTI/rti_remote.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 51a309798..08718e366 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1048,6 +1048,10 @@ void handle_timestamp(federate_info_t* my_fed) { } // Add an offset to the maximum tag to get everyone starting together. 
start_time = rti_remote->max_start_time + DELAY_START; + // Set the start_time in the RTI trace + if (rti_remote->base.tracing_enabled) { + lf_tracing_set_start_time(start_time); + } my_fed->effective_start_tag = (tag_t){.time = start_time, .microstep = 0u}; // Notify the federate of its start tag. @@ -2353,11 +2357,6 @@ void wait_for_federates(int socket_descriptor) { // Wait for connections from persistent federates and create a thread for each. lf_connect_to_persistent_federates(socket_descriptor); - // Set the start_time in the RTI trace - if (rti_remote->base.tracing_enabled) { - lf_tracing_set_start_time(start_time); - } - // Set has_upstream_transient_federates parameter in all federates and check // that there is no more than one level of transiency if (rti_remote->number_of_transient_federates > 0) { From 7dbd42e973373dc84c7561e7ec2bf6b79cd2e55c Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Fri, 24 Jan 2025 12:07:17 +0100 Subject: [PATCH 145/148] Do not overwrite lft files, but find a name that does not exist --- trace/impl/src/trace_impl.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/trace/impl/src/trace_impl.c b/trace/impl/src/trace_impl.c index f819507c4..ac52c2610 100644 --- a/trace/impl/src/trace_impl.c +++ b/trace/impl/src/trace_impl.c @@ -260,10 +260,37 @@ void lf_tracing_global_init(char* process_name, char* process_names, int fedid, } process_id = fedid; char filename[100]; + + // When tracing transient federates, a new trace file is created for each execution. For this, the function + // checks for file existance. If the file exists, the function appends a number to the file name and checks + // again. 
+ int iter = 0; + bool file_exists = false; + bool new_file = false; if (strcmp(process_name, "rti") == 0) { sprintf(filename, "%s.lft", process_name); } else { - sprintf(filename, "%s_%d.lft", process_name, process_id); + FILE* file; + do { + if (iter == 0) { + sprintf(filename, "%s_%d.lft", process_name, process_id); + } else { + sprintf(filename, "%s_%d_%d.lft", process_name, process_id, iter); + } + file = fopen(filename, "r"); + if (file) { + file_exists = true; + new_file = true; + fclose(file); + iter++; + } else { + file_exists = false; + } + } while (file_exists); + } + if (new_file) { + lf_print_warning("No overwriting! The default file name already exists. A new trace file named %s is created.", + filename); } trace_new(filename); start_trace(&trace, max_num_local_threads); From a85af56ed3decfb42f06b4fb815fffb171f8ab59 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Fri, 24 Jan 2025 14:34:53 +0100 Subject: [PATCH 146/148] Run clang-format --- trace/impl/src/trace_impl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/trace/impl/src/trace_impl.c b/trace/impl/src/trace_impl.c index ac52c2610..4e880407c 100644 --- a/trace/impl/src/trace_impl.c +++ b/trace/impl/src/trace_impl.c @@ -261,8 +261,8 @@ void lf_tracing_global_init(char* process_name, char* process_names, int fedid, process_id = fedid; char filename[100]; - // When tracing transient federates, a new trace file is created for each execution. For this, the function - // checks for file existance. If the file exists, the function appends a number to the file name and checks + // When tracing transient federates, a new trace file is created for each execution. For this, the function + // checks for file existance. If the file exists, the function appends a number to the file name and checks // again. 
int iter = 0; bool file_exists = false; From 57ba24d07c7dfaf33dd837395caf340a247b552a Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Fri, 31 Jan 2025 06:53:06 +0100 Subject: [PATCH 147/148] Fix merge issues --- core/federated/RTI/rti_common.c | 1 + core/federated/RTI/rti_remote.c | 86 ++++++++++----------- include/core/federated/network/net_common.h | 4 +- 3 files changed, 43 insertions(+), 48 deletions(-) diff --git a/core/federated/RTI/rti_common.c b/core/federated/RTI/rti_common.c index 62383a8a4..db063bb53 100644 --- a/core/federated/RTI/rti_common.c +++ b/core/federated/RTI/rti_common.c @@ -42,6 +42,7 @@ void invalidate_min_delays() { node->flags = 0; // All flags cleared because they get set lazily. } free(rti_common->min_delays); + rti_common->min_delays = NULL; } } diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index eb914112e..0792a3777 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -218,8 +218,8 @@ static void notify_grant_delayed(federate_info_t* fed, tag_t tag, bool is_provis */ static int get_num_absent_upstream_transients(federate_info_t* fed) { int num_absent_upstream_transients = 0; - for (int j = 0; j < fed->enclave.num_upstream; j++) { - federate_info_t* upstream = GET_FED_INFO(fed->enclave.upstream[j]); + for (int j = 0; j < fed->enclave.num_immediate_upstreams; j++) { + federate_info_t* upstream = GET_FED_INFO(fed->enclave.immediate_upstreams[j]); // Ignore this enclave if it no longer connected. if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { num_absent_upstream_transients++; @@ -275,8 +275,8 @@ static void notify_federate_disconnected(federate_info_t* fed) { // Notify downstream federates. Need to hold the mutex lock to do this. 
if (fed->is_transient) { LF_MUTEX_LOCK(&rti_mutex); - for (int j = 0; j < fed->enclave.num_downstream; j++) { - federate_info_t* downstream = GET_FED_INFO(fed->enclave.downstream[j]); + for (int j = 0; j < fed->enclave.num_immediate_downstreams; j++) { + federate_info_t* downstream = GET_FED_INFO(fed->enclave.immediate_downstreams[j]); // Ignore this enclave if it no longer connected. if (downstream->enclave.state != NOT_CONNECTED) { // Notify the downstream enclave. @@ -582,9 +582,9 @@ void handle_timed_message(federate_info_t* sending_federate, unsigned char* buff // message from the socket, and return. federate_info_t* fed = GET_FED_INFO(federate_id); interval_t delay = NEVER; - for (int i = 0; i < fed->enclave.num_upstream; i++) { - if (fed->enclave.upstream[i] == sending_federate->enclave.id) { - delay = fed->enclave.upstream_delay[i]; + for (int i = 0; i < fed->enclave.num_immediate_upstreams; i++) { + if (fed->enclave.immediate_upstreams[i] == sending_federate->enclave.id) { + delay = fed->enclave.immediate_upstream_delays[i]; break; } } @@ -996,8 +996,8 @@ static void send_start_tag_locked(federate_info_t* my_fed, instant_t federation_ // Notify my_fed of any upstream transient federates that are connected. // This has to occur before sending the start tag so that my_fed does not begin executing thinking that these // upstream federates are not connected. - for (int i = 0; i < my_fed->enclave.num_upstream; i++) { - federate_info_t* fed = GET_FED_INFO(my_fed->enclave.upstream[i]); + for (int i = 0; i < my_fed->enclave.num_immediate_upstreams; i++) { + federate_info_t* fed = GET_FED_INFO(my_fed->enclave.immediate_upstreams[i]); if (fed->is_transient && fed->enclave.state == GRANTED) { send_upstream_connected_locked(my_fed, fed); } @@ -1029,8 +1029,8 @@ static void send_start_tag_locked(federate_info_t* my_fed, instant_t federation_ // If this is a transient federate, notify its downstream federates that it is now connected. 
if (my_fed->is_transient) { - for (int i = 0; i < my_fed->enclave.num_downstream; i++) { - send_upstream_connected_locked(GET_FED_INFO(my_fed->enclave.downstream[i]), my_fed); + for (int i = 0; i < my_fed->enclave.num_immediate_downstreams; i++) { + send_upstream_connected_locked(GET_FED_INFO(my_fed->enclave.immediate_downstreams[i]), my_fed); } } } @@ -1122,8 +1122,8 @@ void handle_timestamp(federate_info_t* my_fed) { } // Condition 4. Iterate over the downstream federates - for (int j = 0; j < my_fed->enclave.num_downstream; j++) { - federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); + for (int j = 0; j < my_fed->enclave.num_immediate_downstreams; j++) { + federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.immediate_downstreams[j]); // Get the max over the TAG of the downstreams if (lf_tag_compare(downstream->enclave.last_granted, my_fed->effective_start_tag) >= 0) { @@ -1144,8 +1144,8 @@ void handle_timestamp(federate_info_t* my_fed) { // because the effective_start_tag is sent while still holding the mutex. // Iterate over the messages from the upstream federates - for (int j = 0; j < my_fed->enclave.num_upstream; j++) { - federate_info_t* upstream = GET_FED_INFO(my_fed->enclave.upstream[j]); + for (int j = 0; j < my_fed->enclave.num_immediate_upstreams; j++) { + federate_info_t* upstream = GET_FED_INFO(my_fed->enclave.immediate_upstreams[j]); size_t queue_size = pqueue_tag_size(upstream->in_transit_message_tags); if (queue_size != 0) { @@ -1163,8 +1163,8 @@ void handle_timestamp(federate_info_t* my_fed) { // FIXME: Should this be higher-than or equal to? // FIXME: Also, won't the grant simply be lost? // If the joining federate doesn't send anything, the downstream federate won't issue another NET. 
- for (int j = 0; j < my_fed->enclave.num_downstream; j++) { - federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.downstream[j]); + for (int j = 0; j < my_fed->enclave.num_immediate_downstreams; j++) { + federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.immediate_downstreams[j]); // Ignore this federate if it has resigned. if (downstream->enclave.state == NOT_CONNECTED) { @@ -1189,11 +1189,8 @@ void handle_timestamp(federate_info_t* my_fed) { // Whenver a transient joins, invalidate all federates, so that all min_delays_upstream // get re-computed. - // FIXME: Needs to be optimized to only invalidate those affected by the transient - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); - invalidate_min_delays_upstream(&(fed->enclave)); - } + // FIXME: Maybe optimize it to only invalidate those affected by the transient + invalidate_min_delays(); LF_MUTEX_UNLOCK(&rti_mutex); } @@ -1639,12 +1636,6 @@ static int32_t receive_and_check_fed_id_message(int* socket_id) { send_reject(socket_id, FEDERATE_ID_OUT_OF_RANGE); return -1; } else { - // Find out if it is a new connection or a hot swap. - // Reject if: - // - duplicate of a connected persistent federate - // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that - // particular federate - // - or it is a hot swap, but it is not the execution phase yet // Find out if it is a new connection or a hot swap. 
// Reject if: // - duplicate of a connected persistent federate @@ -1839,22 +1830,22 @@ static int receive_connection_information(int* socket_id, uint16_t fed_id) { // Now, compare the previous and the new neighberhood structure // Start with the number of upstreams and downstreams bool reject = false; - if ((fed->enclave.num_upstream != temp_fed->enclave.num_upstream) || - (fed->enclave.num_downstream != temp_fed->enclave.num_downstream)) { + if ((fed->enclave.num_immediate_upstreams != temp_fed->enclave.num_immediate_upstreams) || + (fed->enclave.num_immediate_downstreams != temp_fed->enclave.num_immediate_downstreams)) { reject = true; } else { // Then check all upstreams and their delays - for (int i = 0; i < fed->enclave.num_upstream; i++) { - if ((fed->enclave.upstream[i] != temp_fed->enclave.upstream[i]) || - (fed->enclave.upstream_delay[i] != temp_fed->enclave.upstream_delay[i])) { + for (int i = 0; i < fed->enclave.num_immediate_upstreams; i++) { + if ((fed->enclave.immediate_upstreams[i] != temp_fed->enclave.immediate_upstreams[i]) || + (fed->enclave.immediate_upstream_delays[i] != temp_fed->enclave.immediate_upstream_delays[i])) { reject = true; break; } } if (!reject) { // Finally, check all downstream federates - for (int i = 0; i < fed->enclave.num_downstream; i++) { - if (fed->enclave.downstream[i] != temp_fed->enclave.downstream[i]) { + for (int i = 0; i < fed->enclave.num_immediate_downstreams; i++) { + if (fed->enclave.immediate_downstreams[i] != temp_fed->enclave.immediate_downstreams[i]) { reject = true; break; } @@ -2322,7 +2313,10 @@ void reset_transient_federate(federate_info_t* fed) { fed->server_port = -1; fed->requested_stop = false; fed->effective_start_tag = NEVER_TAG; - // invalidate_all_min_delays(); + // Whenver a transient resigns or leaves, invalidate all federates, so that all min_delays_upstream + // get re-computed. 
+ // FIXME: Maybe optimize it to only invalidate those affected by the transient + invalidate_min_delays(); } int32_t start_rti_server(uint16_t port) { @@ -2356,8 +2350,8 @@ int32_t start_rti_server(uint16_t port) { static int set_has_upstream_transient_federates_parameter_and_check() { for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { federate_info_t* fed = GET_FED_INFO(i); - for (int j = 0; j < fed->enclave.num_upstream; j++) { - federate_info_t* upstream_fed = GET_FED_INFO(fed->enclave.upstream[j]); + for (int j = 0; j < fed->enclave.num_immediate_upstreams; j++) { + federate_info_t* upstream_fed = GET_FED_INFO(fed->enclave.immediate_upstreams[j]); if (upstream_fed->is_transient) { fed->has_upstream_transient_federates = true; break; @@ -2514,14 +2508,14 @@ void free_scheduling_nodes(scheduling_node_t** scheduling_nodes, uint16_t number for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) { // FIXME: Gives error freeing memory not allocated!!!! scheduling_node_t* node = scheduling_nodes[i]; - if (node->immediate_upstreams != NULL) { - free(node->immediate_upstreams); - free(node->immediate_upstream_delays); - } - if (node->immediate_downstreams != NULL) { - free(node->immediate_downstreams); - } - free(node); + if (node->immediate_upstreams != NULL) { + free(node->immediate_upstreams); + free(node->immediate_upstream_delays); + } + if (node->immediate_downstreams != NULL) { + free(node->immediate_downstreams); + } + free(node); } free(scheduling_nodes); } diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index 0e612db1f..d6b214221 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -653,14 +653,14 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * A message the informs a downstream federate that a federate upstream of it * is connected. 
The next 2 bytes are the federate ID of the upstream federate. */ -#define MSG_TYPE_UPSTREAM_CONNECTED 26 +#define MSG_TYPE_UPSTREAM_CONNECTED 27 #define MSG_TYPE_UPSTREAM_CONNECTED_LENGTH (1 + sizeof(uint16_t)) /** * A message the informs a downstream federate that a federate upstream of it * is no longer connected. The next 2 bytes are the federate ID of the upstream federate. */ -#define MSG_TYPE_UPSTREAM_DISCONNECTED 27 +#define MSG_TYPE_UPSTREAM_DISCONNECTED 28 #define MSG_TYPE_UPSTREAM_DISCONNECTED_LENGTH (1 + sizeof(uint16_t)) /** From 7e2971ba3a2f1d4820efccd3108faef589237f84 Mon Sep 17 00:00:00 2001 From: Chadlia Jerad Date: Fri, 31 Jan 2025 07:28:39 +0100 Subject: [PATCH 148/148] Apply formatter --- core/federated/RTI/rti_remote.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 0792a3777..cda3ce9e8 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -2508,14 +2508,14 @@ void free_scheduling_nodes(scheduling_node_t** scheduling_nodes, uint16_t number for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) { // FIXME: Gives error freeing memory not allocated!!!! scheduling_node_t* node = scheduling_nodes[i]; - if (node->immediate_upstreams != NULL) { - free(node->immediate_upstreams); - free(node->immediate_upstream_delays); - } - if (node->immediate_downstreams != NULL) { - free(node->immediate_downstreams); - } - free(node); + if (node->immediate_upstreams != NULL) { + free(node->immediate_upstreams); + free(node->immediate_upstream_delays); + } + if (node->immediate_downstreams != NULL) { + free(node->immediate_downstreams); + } + free(node); } free(scheduling_nodes); }