From 6569050edf1391a5f37b48435a4c507af8460ab8 Mon Sep 17 00:00:00 2001 From: dgsudharsan Date: Mon, 17 Mar 2025 21:07:52 +0000 Subject: [PATCH] [FRR]Fixing BGP session not being stable after flapping --- ...r-peer-connection-error-with-per-bgp.patch | 367 ++++++++ ...85-bgpd-remove-apis-from-bgp_route.h.patch | 26 + ...batch-peer-connection-error-clearing.patch | 874 ++++++++++++++++++ ...conn-error-list-to-connection-struct.patch | 210 +++++ src/sonic-frr/patch/series | 4 + 5 files changed, 1481 insertions(+) create mode 100644 src/sonic-frr/patch/0084-bgpd-Replace-per-peer-connection-error-with-per-bgp.patch create mode 100644 src/sonic-frr/patch/0085-bgpd-remove-apis-from-bgp_route.h.patch create mode 100644 src/sonic-frr/patch/0086-bgpd-batch-peer-connection-error-clearing.patch create mode 100644 src/sonic-frr/patch/0087-zebra-move-peer-conn-error-list-to-connection-struct.patch diff --git a/src/sonic-frr/patch/0084-bgpd-Replace-per-peer-connection-error-with-per-bgp.patch b/src/sonic-frr/patch/0084-bgpd-Replace-per-peer-connection-error-with-per-bgp.patch new file mode 100644 index 000000000000..794ea13b916b --- /dev/null +++ b/src/sonic-frr/patch/0084-bgpd-Replace-per-peer-connection-error-with-per-bgp.patch @@ -0,0 +1,367 @@ +From 655c84f5ca5534323ae8accb8d313615440ab034 Mon Sep 17 00:00:00 2001 +From: Mark Stapp +Date: Thu, 26 Sep 2024 11:09:35 -0400 +Subject: [PATCH 1/4] bgpd: Replace per-peer connection error with per-bgp + +Replace the per-peer connection error with a per-bgp event and +a list. The io pthread enqueues peers per-bgp-instance, and the +error-handing code can process multiple peers if there have been +multiple failures. + +Signed-off-by: Mark Stapp + +diff --git a/bgpd/bgp_io.c b/bgpd/bgp_io.c +index b07e69ac3..4d5e1d9c5 100644 +--- a/bgpd/bgp_io.c ++++ b/bgpd/bgp_io.c +@@ -100,7 +100,6 @@ void bgp_reads_off(struct peer_connection *connection) + + event_cancel_async(fpt->master, &connection->t_read, NULL); + EVENT_OFF(connection->t_process_packet); +- EVENT_OFF(connection->t_process_packet_error); + + UNSET_FLAG(connection->thread_flags, PEER_THREAD_READS_ON); + } +@@ -252,8 +251,7 @@ static void bgp_process_reads(struct event *thread) + /* Handle the error in the main pthread, include the + * specific state change from 'bgp_read'. + */ +- event_add_event(bm->master, bgp_packet_process_error, connection, +- code, &connection->t_process_packet_error); ++ bgp_enqueue_conn_err_peer(peer->bgp, connection->peer, code); + goto done; + } + +diff --git a/bgpd/bgp_packet.c b/bgpd/bgp_packet.c +index 2e682c773..26c00cfe1 100644 +--- a/bgpd/bgp_packet.c ++++ b/bgpd/bgp_packet.c +@@ -4003,35 +4003,60 @@ void bgp_send_delayed_eor(struct bgp *bgp) + } + + /* +- * Task callback to handle socket error encountered in the io pthread. We avoid +- * having the io pthread try to enqueue fsm events or mess with the peer +- * struct. ++ * Task callback in the main pthread to handle socket error ++ * encountered in the io pthread. We avoid having the io pthread try ++ * to enqueue fsm events or mess with the peer struct. 
+ */ ++ ++/* Max number of peers to process without rescheduling */ ++#define BGP_CONN_ERROR_DEQUEUE_MAX 10 ++ + void bgp_packet_process_error(struct event *thread) + { + struct peer_connection *connection; + struct peer *peer; +- int code; ++ struct bgp *bgp; ++ int counter = 0; ++ bool more_p = false; + +- connection = EVENT_ARG(thread); +- peer = connection->peer; +- code = EVENT_VAL(thread); ++ bgp = EVENT_ARG(thread); + +- if (bgp_debug_neighbor_events(peer)) +- zlog_debug("%s [Event] BGP error %d on fd %d", peer->host, code, +- connection->fd); +- +- /* Closed connection or error on the socket */ +- if (peer_established(connection)) { +- if ((CHECK_FLAG(peer->flags, PEER_FLAG_GRACEFUL_RESTART) +- || CHECK_FLAG(peer->flags, +- PEER_FLAG_GRACEFUL_RESTART_HELPER)) +- && CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_MODE)) { +- peer->last_reset = PEER_DOWN_NSF_CLOSE_SESSION; +- SET_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT); +- } else +- peer->last_reset = PEER_DOWN_CLOSE_SESSION; ++ /* Dequeue peers from the error list */ ++ while ((peer = bgp_dequeue_conn_err_peer(bgp, &more_p)) != NULL) { ++ connection = peer->connection; ++ ++ if (bgp_debug_neighbor_events(peer)) ++ zlog_debug("%s [Event] BGP error %d on fd %d", ++ peer->host, peer->connection_errcode, ++ connection->fd); ++ ++ /* Closed connection or error on the socket */ ++ if (peer_established(connection)) { ++ if ((CHECK_FLAG(peer->flags, PEER_FLAG_GRACEFUL_RESTART) ++ || CHECK_FLAG(peer->flags, ++ PEER_FLAG_GRACEFUL_RESTART_HELPER)) ++ && CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_MODE)) { ++ peer->last_reset = PEER_DOWN_NSF_CLOSE_SESSION; ++ SET_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT); ++ } else ++ peer->last_reset = PEER_DOWN_CLOSE_SESSION; ++ } ++ ++ /* No need for keepalives, if enabled */ ++ bgp_keepalives_off(connection); ++ ++ bgp_event_update(connection, peer->connection_errcode); ++ ++ counter++; ++ if (counter >= BGP_CONN_ERROR_DEQUEUE_MAX) ++ break; + } + +- bgp_event_update(connection, code); ++ /* Reschedule event if necessary */ ++ if (more_p) ++ bgp_conn_err_reschedule(bgp); ++ ++ if (bgp_debug_neighbor_events(NULL)) ++ zlog_debug("%s: dequeued and processed %d peers", __func__, ++ counter); + } +diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c +index 591696de7..7159fe4df 100644 +--- a/bgpd/bgpd.c ++++ b/bgpd/bgpd.c +@@ -86,6 +86,9 @@ DEFINE_QOBJ_TYPE(bgp); + DEFINE_QOBJ_TYPE(peer); + DEFINE_HOOK(bgp_inst_delete, (struct bgp *bgp), (bgp)); + ++/* Peers with connection error/failure, per bgp instance */ ++DECLARE_LIST(bgp_peer_conn_errlist, struct peer, conn_err_link); ++ + /* BGP process wide configuration. 
*/ + static struct bgp_master bgp_master; + +@@ -2684,6 +2687,9 @@ int peer_delete(struct peer *peer) + + assert(peer->connection->status != Deleted); + ++ if (bgp_debug_neighbor_events(peer)) ++ zlog_debug("%s: peer %pBP", __func__, peer); ++ + bgp = peer->bgp; + accept_peer = CHECK_FLAG(peer->sflags, PEER_STATUS_ACCEPT_PEER); + +@@ -2701,6 +2707,13 @@ int peer_delete(struct peer *peer) + PEER_THREAD_READS_ON)); + assert(!CHECK_FLAG(peer->thread_flags, PEER_THREAD_KEEPALIVES_ON)); + ++ /* Ensure the peer is removed from the connection error list */ ++ frr_with_mutex (&bgp->peer_errs_mtx) { ++ if (bgp_peer_conn_errlist_anywhere(peer)) ++ bgp_peer_conn_errlist_del(&bgp->peer_conn_errlist, ++ peer); ++ } ++ + if (CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT)) + peer_nsf_stop(peer); + +@@ -3567,6 +3580,10 @@ static struct bgp *bgp_create(as_t *as, const char *name, + memset(&bgp->ebgprequirespolicywarning, 0, + sizeof(bgp->ebgprequirespolicywarning)); + ++ /* Init peer connection error info */ ++ pthread_mutex_init(&bgp->peer_errs_mtx, NULL); ++ bgp_peer_conn_errlist_init(&bgp->peer_conn_errlist); ++ + return bgp; + } + +@@ -4000,6 +4017,18 @@ int bgp_delete(struct bgp *bgp) + if (i != ZEBRA_ROUTE_BGP) + bgp_redistribute_unset(bgp, afi, i, 0); + ++ /* Clear list of peers with connection errors - each ++ * peer will need to check again, in case the io pthread is racing ++ * with us, but this batch cleanup should make the per-peer check ++ * cheaper. ++ */ ++ frr_with_mutex (&bgp->peer_errs_mtx) { ++ do { ++ peer = bgp_peer_conn_errlist_pop( ++ &bgp->peer_conn_errlist); ++ } while (peer != NULL); ++ } ++ + /* Free peers and peer-groups. */ + for (ALL_LIST_ELEMENTS(bgp->group, node, next, group)) + peer_group_delete(group); +@@ -4016,6 +4045,9 @@ int bgp_delete(struct bgp *bgp) + + update_bgp_group_free(bgp); + ++ /* Cancel peer connection errors event */ ++ EVENT_OFF(bgp->t_conn_errors); ++ + /* TODO - Other memory may need to be freed - e.g., NHT */ + + #ifdef ENABLE_BGP_VNC +@@ -4168,6 +4200,9 @@ void bgp_free(struct bgp *bgp) + bgp_srv6_cleanup(bgp); + bgp_confederation_id_unset(bgp); + ++ bgp_peer_conn_errlist_init(&bgp->peer_conn_errlist); ++ pthread_mutex_destroy(&bgp->peer_errs_mtx); ++ + for (int i = 0; i < bgp->confed_peers_cnt; i++) + XFREE(MTYPE_BGP_NAME, bgp->confed_peers[i].as_pretty); + +@@ -8704,6 +8739,59 @@ void bgp_gr_apply_running_config(void) + } + } + ++/* ++ * Enqueue a peer with a connection error to be handled in the main pthread ++ */ ++int bgp_enqueue_conn_err_peer(struct bgp *bgp, struct peer *peer, int errcode) ++{ ++ frr_with_mutex (&bgp->peer_errs_mtx) { ++ peer->connection_errcode = errcode; ++ ++ /* Careful not to double-enqueue */ ++ if (!bgp_peer_conn_errlist_anywhere(peer)) { ++ bgp_peer_conn_errlist_add_tail(&bgp->peer_conn_errlist, ++ peer); ++ } ++ } ++ /* Ensure an event is scheduled */ ++ event_add_event(bm->master, bgp_packet_process_error, bgp, 0, ++ &bgp->t_conn_errors); ++ return 0; ++} ++ ++/* ++ * Dequeue a peer that encountered a connection error; signal whether there ++ * are more queued peers. 
++ */ ++struct peer *bgp_dequeue_conn_err_peer(struct bgp *bgp, bool *more_p) ++{ ++ struct peer *peer = NULL; ++ bool more = false; ++ ++ frr_with_mutex (&bgp->peer_errs_mtx) { ++ peer = bgp_peer_conn_errlist_pop(&bgp->peer_conn_errlist); ++ ++ if (bgp_peer_conn_errlist_const_first( ++ &bgp->peer_conn_errlist) != NULL) ++ more = true; ++ } ++ ++ if (more_p) ++ *more_p = more; ++ ++ return peer; ++} ++ ++/* ++ * Reschedule the connection error event - probably after processing ++ * some of the peers on the list. ++ */ ++void bgp_conn_err_reschedule(struct bgp *bgp) ++{ ++ event_add_event(bm->master, bgp_packet_process_error, bgp, 0, ++ &bgp->t_conn_errors); ++} ++ + printfrr_ext_autoreg_p("BP", printfrr_bp); + static ssize_t printfrr_bp(struct fbuf *buf, struct printfrr_eargs *ea, + const void *ptr) +diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h +index 0341ff68e..bc15c0662 100644 +--- a/bgpd/bgpd.h ++++ b/bgpd/bgpd.h +@@ -353,6 +353,33 @@ struct as_confed { + struct bgp_mplsvpn_nh_label_bind_cache; + PREDECL_RBTREE_UNIQ(bgp_mplsvpn_nh_label_bind_cache); + ++/* List of peers that have connection errors in the io pthread */ ++PREDECL_LIST(bgp_peer_conn_errlist); ++ ++/* List of info about peers that are being cleared from BGP RIBs in a batch */ ++PREDECL_LIST(bgp_clearing_info); ++ ++/* Hash of peers in clearing info object */ ++PREDECL_HASH(bgp_clearing_hash); ++ ++/* Info about a batch of peers that need to be cleared from the RIB. ++ * If many peers need to be cleared, we process them in batches, taking ++ * one walk through the RIB for each batch. ++ */ ++struct bgp_clearing_info { ++ /* Hash of peers */ ++ struct bgp_clearing_hash_head peers; ++ ++ /* Event to schedule/reschedule processing */ ++ struct thread *t_sched; ++ ++ /* RIB dest for rescheduling */ ++ struct bgp_dest *last_dest; ++ ++ /* Linkage for list of batches per-bgp */ ++ struct bgp_clearing_info_item link; ++}; ++ + /* BGP instance structure. */ + struct bgp { + /* AS number of this BGP instance. */ +@@ -827,6 +854,21 @@ struct bgp { + uint16_t tcp_keepalive_intvl; + uint16_t tcp_keepalive_probes; + ++ /* List of peers that have connection errors in the IO pthread */ ++ struct bgp_peer_conn_errlist_head peer_conn_errlist; ++ ++ /* Mutex that guards the connection-errors list */ ++ pthread_mutex_t peer_errs_mtx; ++ ++ /* Event indicating that there have been connection errors; this ++ * is typically signalled in the IO pthread; it's handled in the ++ * main pthread. 
++ */ ++ struct event *t_conn_errors; ++ ++ /* List of batches of peers being cleared from BGP RIBs */ ++ struct bgp_clearing_info_head clearing_list; ++ + struct timeval ebgprequirespolicywarning; + #define FIFTEENMINUTE2USEC (int64_t)15 * 60 * 1000000 + +@@ -1188,7 +1230,6 @@ struct peer_connection { + + struct event *t_routeadv; + struct event *t_process_packet; +- struct event *t_process_packet_error; + + struct event *t_stop_with_notify; + +@@ -1856,6 +1897,15 @@ struct peer { + /* Add-Path Best selected paths number to advertise */ + uint8_t addpath_best_selected[AFI_MAX][SAFI_MAX]; + ++ /* Linkage for list of peers with connection errors from IO pthread */ ++ struct bgp_peer_conn_errlist_item conn_err_link; ++ ++ /* Connection error code */ ++ uint16_t connection_errcode; ++ ++ /* Linkage for hash of clearing peers being cleared in a batch */ ++ struct bgp_clearing_hash_item clear_hash_link; ++ + QOBJ_FIELDS; + }; + DECLARE_QOBJ_TYPE(peer); +@@ -2490,6 +2540,10 @@ void bgp_gr_apply_running_config(void); + int bgp_global_gr_init(struct bgp *bgp); + int bgp_peer_gr_init(struct peer *peer); + ++/* APIs for the per-bgp peer connection error list */ ++int bgp_enqueue_conn_err_peer(struct bgp *bgp, struct peer *peer, int errcode); ++struct peer *bgp_dequeue_conn_err_peer(struct bgp *bgp, bool *more_p); ++void bgp_conn_err_reschedule(struct bgp *bgp); + + #define BGP_GR_ROUTER_DETECT_AND_SEND_CAPABILITY_TO_ZEBRA(_bgp, _peer_list) \ + do { \ +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0085-bgpd-remove-apis-from-bgp_route.h.patch b/src/sonic-frr/patch/0085-bgpd-remove-apis-from-bgp_route.h.patch new file mode 100644 index 000000000000..f5d058cf1a7f --- /dev/null +++ b/src/sonic-frr/patch/0085-bgpd-remove-apis-from-bgp_route.h.patch @@ -0,0 +1,26 @@ +From 8ee8e2511328e8c27ad7c16bd0854cc4335df399 Mon Sep 17 00:00:00 2001 +From: Mark Stapp +Date: Tue, 1 Oct 2024 09:23:26 -0400 +Subject: [PATCH 2/4] bgpd: remove apis from bgp_route.h + +Remove a couple of apis that don't exist. + +Signed-off-by: Mark Stapp + +diff --git a/bgpd/bgp_route.h b/bgpd/bgp_route.h +index 61492980a..39e282712 100644 +--- a/bgpd/bgp_route.h ++++ b/bgpd/bgp_route.h +@@ -885,9 +885,6 @@ extern bool subgroup_announce_check(struct bgp_dest *dest, + const struct prefix *p, struct attr *attr, + struct attr *post_attr); + +-extern void bgp_peer_clear_node_queue_drain_immediate(struct peer *peer); +-extern void bgp_process_queues_drain_immediate(void); +- + /* for encap/vpn */ + extern struct bgp_dest *bgp_safi_node_lookup(struct bgp_table *table, + safi_t safi, +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0086-bgpd-batch-peer-connection-error-clearing.patch b/src/sonic-frr/patch/0086-bgpd-batch-peer-connection-error-clearing.patch new file mode 100644 index 000000000000..17e5bcb6100f --- /dev/null +++ b/src/sonic-frr/patch/0086-bgpd-batch-peer-connection-error-clearing.patch @@ -0,0 +1,874 @@ +From 6134671a2980cf33aca723e81be642a48a69bb93 Mon Sep 17 00:00:00 2001 +From: Mark Stapp +Date: Tue, 1 Oct 2024 16:30:44 -0400 +Subject: [PATCH] bgpd: batch peer connection error clearing + +When peer connections encounter errors, attempt to batch some +of the clearing processing that occurs. Add a new batch object, +add multiple peers to it, if possible. Do one rib walk for the +batch, rather than one walk per peer. Use a handler callback +per batch to check and remove peers' path-infos, rather than +a work-queue and callback per peer. The original clearing code +remains; it's used for single peers. 
+ +Signed-off-by: Mark Stapp + +diff --git a/bgpd/bgp_fsm.c b/bgpd/bgp_fsm.c +index 650301163..254043407 100644 +--- a/bgpd/bgp_fsm.c ++++ b/bgpd/bgp_fsm.c +@@ -1263,7 +1263,8 @@ void bgp_fsm_change_status(struct peer_connection *connection, + * Clearing + * (or Deleted). + */ +- if (!work_queue_is_scheduled(peer->clear_node_queue) && ++ if (!CHECK_FLAG(peer->flags, PEER_FLAG_CLEARING_BATCH) && ++ !work_queue_is_scheduled(peer->clear_node_queue) && + status != Deleted) + BGP_EVENT_ADD(connection, Clearing_Completed); + } +diff --git a/bgpd/bgp_memory.h b/bgpd/bgp_memory.h +index 29584cd78..321fd6118 100644 +--- a/bgpd/bgp_memory.h ++++ b/bgpd/bgp_memory.h +@@ -131,4 +131,6 @@ DECLARE_MTYPE(BGP_NOTIFICATION); + + DECLARE_MTYPE(BGP_SOFT_VERSION); + ++DECLARE_MTYPE(CLEARING_BATCH); ++ + #endif /* _QUAGGA_BGP_MEMORY_H */ +diff --git a/bgpd/bgp_packet.c b/bgpd/bgp_packet.c +index 26c00cfe1..658f82863 100644 +--- a/bgpd/bgp_packet.c ++++ b/bgpd/bgp_packet.c +@@ -4001,62 +4001,3 @@ void bgp_send_delayed_eor(struct bgp *bgp) + for (ALL_LIST_ELEMENTS(bgp->peer, node, nnode, peer)) + bgp_write_proceed_actions(peer); + } +- +-/* +- * Task callback in the main pthread to handle socket error +- * encountered in the io pthread. We avoid having the io pthread try +- * to enqueue fsm events or mess with the peer struct. +- */ +- +-/* Max number of peers to process without rescheduling */ +-#define BGP_CONN_ERROR_DEQUEUE_MAX 10 +- +-void bgp_packet_process_error(struct event *thread) +-{ +- struct peer_connection *connection; +- struct peer *peer; +- struct bgp *bgp; +- int counter = 0; +- bool more_p = false; +- +- bgp = EVENT_ARG(thread); +- +- /* Dequeue peers from the error list */ +- while ((peer = bgp_dequeue_conn_err_peer(bgp, &more_p)) != NULL) { +- connection = peer->connection; +- +- if (bgp_debug_neighbor_events(peer)) +- zlog_debug("%s [Event] BGP error %d on fd %d", +- peer->host, peer->connection_errcode, +- connection->fd); +- +- /* Closed connection or error on the socket */ +- if (peer_established(connection)) { +- if ((CHECK_FLAG(peer->flags, PEER_FLAG_GRACEFUL_RESTART) +- || CHECK_FLAG(peer->flags, +- PEER_FLAG_GRACEFUL_RESTART_HELPER)) +- && CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_MODE)) { +- peer->last_reset = PEER_DOWN_NSF_CLOSE_SESSION; +- SET_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT); +- } else +- peer->last_reset = PEER_DOWN_CLOSE_SESSION; +- } +- +- /* No need for keepalives, if enabled */ +- bgp_keepalives_off(connection); +- +- bgp_event_update(connection, peer->connection_errcode); +- +- counter++; +- if (counter >= BGP_CONN_ERROR_DEQUEUE_MAX) +- break; +- } +- +- /* Reschedule event if necessary */ +- if (more_p) +- bgp_conn_err_reschedule(bgp); +- +- if (bgp_debug_neighbor_events(NULL)) +- zlog_debug("%s: dequeued and processed %d peers", __func__, +- counter); +-} +diff --git a/bgpd/bgp_packet.h b/bgpd/bgp_packet.h +index b67acf205..7fc0189be 100644 +--- a/bgpd/bgp_packet.h ++++ b/bgpd/bgp_packet.h +@@ -74,8 +74,6 @@ extern void bgp_process_packet(struct event *event); + + extern void bgp_send_delayed_eor(struct bgp *bgp); + +-/* Task callback to handle socket error encountered in the io pthread */ +-void bgp_packet_process_error(struct event *thread); + extern struct bgp_notify + bgp_notify_decapsulate_hard_reset(struct bgp_notify *notify); + extern bool bgp_has_graceful_restart_notification(struct peer *peer); +diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c +index 7c596f02a..c6fa5e5b4 100644 +--- a/bgpd/bgp_route.c ++++ b/bgpd/bgp_route.c +@@ -78,6 +78,9 @@ + + #include 
"bgpd/bgp_route_clippy.c" + ++/* Memory for batched clearing of peers from the RIB */ ++DEFINE_MTYPE(BGPD, CLEARING_BATCH, "Clearing batch"); ++ + DEFINE_HOOK(bgp_snmp_update_stats, + (struct bgp_dest *rn, struct bgp_path_info *pi, bool added), + (rn, pi, added)); +@@ -6107,11 +6110,244 @@ void bgp_clear_route(struct peer *peer, afi_t afi, safi_t safi) + peer_unlock(peer); + } + ++/* ++ * Callback scheduled to process prefixes/dests for batch clearing; the ++ * dests were found via a rib walk. ++ * The one-peer version of this uses a per-peer workqueue to manage ++ * rescheduling, but we're just using a fixed limit here. ++ */ ++ ++/* Limit the number of dests we'll process per callback */ ++#define BGP_CLEARING_BATCH_MAX_DESTS 100 ++ ++static void bgp_clear_batch_dests_task(struct event *event) ++{ ++ struct bgp_clearing_info *cinfo = EVENT_ARG(event); ++ struct bgp_dest *dest; ++ struct bgp_path_info *pi, *next; ++ struct bgp_table *table; ++ struct bgp *bgp; ++ afi_t afi; ++ safi_t safi; ++ int counter = 0; ++ ++ bgp = cinfo->bgp; ++ ++next_dest: ++ ++ dest = bgp_clearing_batch_next_dest(cinfo); ++ if (dest == NULL) ++ goto done; ++ ++ table = bgp_dest_table(dest); ++ afi = table->afi; ++ safi = table->safi; ++ ++ /* Have to check every path: it is possible that we have multiple paths ++ * for a prefix from a peer if that peer is using AddPath. ++ * Note that the clearing action may change the pi list; we try to do ++ * a "safe" iteration. ++ */ ++ for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = next) { ++ next = pi ? pi->next : NULL; ++ ++ if (!bgp_clearing_batch_check_peer(cinfo, pi->peer)) ++ continue; ++ ++ /* graceful restart STALE flag set. */ ++ if (((CHECK_FLAG(pi->peer->sflags, PEER_STATUS_NSF_WAIT) ++ && pi->peer->nsf[afi][safi]) ++ || CHECK_FLAG(pi->peer->af_sflags[afi][safi], ++ PEER_STATUS_ENHANCED_REFRESH)) ++ && !CHECK_FLAG(pi->flags, BGP_PATH_STALE) ++ && !CHECK_FLAG(pi->flags, BGP_PATH_UNUSEABLE)) ++ bgp_path_info_set_flag(dest, pi, BGP_PATH_STALE); ++ else { ++ /* If this is an EVPN route, process for ++ * un-import. */ ++ if (safi == SAFI_EVPN) ++ bgp_evpn_unimport_route( ++ bgp, afi, safi, ++ bgp_dest_get_prefix(dest), pi); ++ /* Handle withdraw for VRF route-leaking and L3VPN */ ++ if (SAFI_UNICAST == safi ++ && (bgp->inst_type == BGP_INSTANCE_TYPE_VRF || ++ bgp->inst_type == BGP_INSTANCE_TYPE_DEFAULT)) { ++ vpn_leak_from_vrf_withdraw(bgp_get_default(), ++ bgp, pi); ++ } ++ if (SAFI_MPLS_VPN == safi && ++ bgp->inst_type == BGP_INSTANCE_TYPE_DEFAULT) { ++ vpn_leak_to_vrf_withdraw(pi); ++ } ++ ++ bgp_rib_remove(dest, pi, pi->peer, afi, safi); ++ } ++ } ++ ++ /* Unref this dest and table */ ++ bgp_dest_unlock_node(dest); ++ bgp_table_unlock(table); ++ ++ counter++; ++ if (counter < BGP_CLEARING_BATCH_MAX_DESTS) ++ goto next_dest; ++ ++done: ++ ++ /* If there are still dests to process, reschedule. 
*/ ++ if (bgp_clearing_batch_dests_present(cinfo)) { ++ if (bgp_debug_neighbor_events(NULL)) ++ zlog_debug("%s: Batch %p: Rescheduled after processing %d dests", ++ __func__, cinfo, counter); ++ ++ event_add_event(bm->master, bgp_clear_batch_dests_task, cinfo, ++ 0, &cinfo->t_sched); ++ } else { ++ if (bgp_debug_neighbor_events(NULL)) ++ zlog_debug("%s: Batch %p: Done after processing %d dests", ++ __func__, cinfo, counter); ++ bgp_clearing_batch_completed(cinfo); ++ } ++ ++ return; ++} ++ ++/* ++ * Walk a single table for batch peer clearing processing ++ */ ++static void clear_batch_table_helper(struct bgp_clearing_info *cinfo, ++ struct bgp_table *table) ++{ ++ struct bgp_dest *dest; ++ bool force = (cinfo->bgp->process_queue == NULL); ++ uint32_t examined = 0, queued = 0; ++ ++ for (dest = bgp_table_top(table); dest; dest = bgp_route_next(dest)) { ++ struct bgp_path_info *pi, *next; ++ struct bgp_adj_in *ain; ++ struct bgp_adj_in *ain_next; ++ ++ examined++; ++ ++ ain = dest->adj_in; ++ while (ain) { ++ ain_next = ain->next; ++ ++ if (bgp_clearing_batch_check_peer(cinfo, ain->peer)) ++ bgp_adj_in_remove(&dest, ain); ++ ++ ain = ain_next; ++ ++ assert(dest != NULL); ++ } ++ ++ for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = next) { ++ next = pi->next; ++ if (!bgp_clearing_batch_check_peer(cinfo, pi->peer)) ++ continue; ++ ++ queued++; ++ ++ if (force) { ++ bgp_path_info_reap(dest, pi); ++ } else { ++ /* Unlocked after processing */ ++ bgp_table_lock(bgp_dest_table(dest)); ++ bgp_dest_lock_node(dest); ++ ++ bgp_clearing_batch_add_dest(cinfo, dest); ++ break; ++ } ++ } ++ } ++ ++ if (examined > 0) { ++ if (bgp_debug_neighbor_events(NULL)) ++ zlog_debug("%s: %s/%s: examined %u, queued %u", ++ __func__, afi2str(table->afi), ++ safi2str(table->safi), examined, queued); ++ } ++} ++ ++/* ++ * RIB-walking helper for batch clearing work: walk all tables, identify ++ * dests that are affected by the peers in the batch, enqueue the dests for ++ * async processing. ++ */ ++static void clear_batch_rib_helper(struct bgp_clearing_info *cinfo) ++{ ++ afi_t afi; ++ safi_t safi; ++ struct bgp_dest *dest; ++ struct bgp_table *table; ++ ++ FOREACH_AFI_SAFI (afi, safi) { ++ /* Identify table to be examined */ ++ if (safi != SAFI_MPLS_VPN && safi != SAFI_ENCAP && ++ safi != SAFI_EVPN) { ++ table = cinfo->bgp->rib[afi][safi]; ++ if (!table) ++ continue; ++ ++ clear_batch_table_helper(cinfo, table); ++ } else { ++ for (dest = bgp_table_top(cinfo->bgp->rib[afi][safi]); ++ dest; dest = bgp_route_next(dest)) { ++ table = bgp_dest_get_bgp_table_info(dest); ++ if (!table) ++ continue; ++ ++ /* TODO -- record the tables we've seen ++ * and don't repeat any? ++ */ ++ ++ clear_batch_table_helper(cinfo, table); ++ } ++ } ++ } ++} ++ ++/* ++ * Identify prefixes that need to be cleared for a batch of peers in 'cinfo'. ++ * The actual clearing processing will be done async... ++ */ ++void bgp_clear_route_batch(struct bgp_clearing_info *cinfo) ++{ ++ if (bgp_debug_neighbor_events(NULL)) ++ zlog_debug("%s: BGP %s, batch %p", __func__, ++ cinfo->bgp->name_pretty, cinfo); ++ ++ /* Walk the rib, checking the peers in the batch */ ++ clear_batch_rib_helper(cinfo); ++ ++ /* If we found some prefixes, schedule a task to begin work. */ ++ if (bgp_clearing_batch_dests_present(cinfo)) ++ event_add_event(bm->master, bgp_clear_batch_dests_task, cinfo, ++ 0, &cinfo->t_sched); ++ ++ /* NB -- it's the caller's job to clean up, release refs, etc. 
if ++ * we didn't find any dests ++ */ ++} ++ + void bgp_clear_route_all(struct peer *peer) + { + afi_t afi; + safi_t safi; + ++ /* We may be able to batch multiple peers' clearing work: check ++ * and see. ++ */ ++ if (bgp_clearing_batch_add_peer(peer->bgp, peer)) { ++ if (bgp_debug_neighbor_events(peer)) ++ zlog_debug("%s: peer %pBP batched", __func__, peer); ++ return; ++ } ++ ++ if (bgp_debug_neighbor_events(peer)) ++ zlog_debug("%s: peer %pBP", __func__, peer); ++ + FOREACH_AFI_SAFI (afi, safi) + bgp_clear_route(peer, afi, safi); + +diff --git a/bgpd/bgp_route.h b/bgpd/bgp_route.h +index 39e282712..9f0563c25 100644 +--- a/bgpd/bgp_route.h ++++ b/bgpd/bgp_route.h +@@ -744,6 +744,9 @@ extern void bgp_soft_reconfig_table_task_cancel(const struct bgp *bgp, + extern bool bgp_soft_reconfig_in(struct peer *peer, afi_t afi, safi_t safi); + extern void bgp_clear_route(struct peer *, afi_t, safi_t); + extern void bgp_clear_route_all(struct peer *); ++/* Clear routes for a batch of peers */ ++void bgp_clear_route_batch(struct bgp_clearing_info *cinfo); ++ + extern void bgp_clear_adj_in(struct peer *, afi_t, safi_t); + extern void bgp_clear_stale_route(struct peer *, afi_t, safi_t); + extern void bgp_set_stale_route(struct peer *peer, afi_t afi, safi_t safi); +diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c +index 7159fe4df..af4a235ec 100644 +--- a/bgpd/bgpd.c ++++ b/bgpd/bgpd.c +@@ -87,7 +87,20 @@ DEFINE_QOBJ_TYPE(peer); + DEFINE_HOOK(bgp_inst_delete, (struct bgp *bgp), (bgp)); + + /* Peers with connection error/failure, per bgp instance */ +-DECLARE_LIST(bgp_peer_conn_errlist, struct peer, conn_err_link); ++DECLARE_DLIST(bgp_peer_conn_errlist, struct peer, conn_err_link); ++ ++/* List of info about peers that are being cleared from BGP RIBs in a batch */ ++DECLARE_DLIST(bgp_clearing_info, struct bgp_clearing_info, link); ++ ++/* List of dests that need to be processed in a clearing batch */ ++DECLARE_LIST(bgp_clearing_destlist, struct bgp_clearing_dest, link); ++ ++/* Hash of peers in clearing info object */ ++static int peer_clearing_hash_cmp(const struct peer *p1, const struct peer *p2); ++static uint32_t peer_clearing_hashfn(const struct peer *p1); ++ ++DECLARE_HASH(bgp_clearing_hash, struct peer, clear_hash_link, ++ peer_clearing_hash_cmp, peer_clearing_hashfn); + + /* BGP process wide configuration. 
*/ + static struct bgp_master bgp_master; +@@ -3583,6 +3596,7 @@ static struct bgp *bgp_create(as_t *as, const char *name, + /* Init peer connection error info */ + pthread_mutex_init(&bgp->peer_errs_mtx, NULL); + bgp_peer_conn_errlist_init(&bgp->peer_conn_errlist); ++ bgp_clearing_info_init(&bgp->clearing_list); + + return bgp; + } +@@ -3934,6 +3948,10 @@ int bgp_delete(struct bgp *bgp) + zlog_debug("Zebra Announce Fifo cleanup count before %u and after %u during BGP %s deletion", + cnt_before, cnt_after, bgp->name_pretty); + ++ /* Cleanup for peer connection batching */ ++ while ((cinfo = bgp_clearing_info_first(&bgp->clearing_list)) != NULL) ++ bgp_clearing_batch_completed(cinfo); ++ + bgp_soft_reconfig_table_task_cancel(bgp, NULL, NULL); + + /* make sure we withdraw any exported routes */ +@@ -4048,6 +4066,10 @@ int bgp_delete(struct bgp *bgp) + /* Cancel peer connection errors event */ + EVENT_OFF(bgp->t_conn_errors); + ++ /* Cleanup for peer connection batching */ ++ while ((cinfo = bgp_clearing_info_pop(&bgp->clearing_list)) != NULL) ++ bgp_clearing_batch_completed(cinfo); ++ + /* TODO - Other memory may need to be freed - e.g., NHT */ + + #ifdef ENABLE_BGP_VNC +@@ -8739,8 +8761,303 @@ void bgp_gr_apply_running_config(void) + } + } + ++/* Hash of peers in clearing info object */ ++static int peer_clearing_hash_cmp(const struct peer *p1, const struct peer *p2) ++{ ++ if (p1 == p2) ++ return 0; ++ else if (p1 < p2) ++ return -1; ++ else ++ return 1; ++} ++ ++static uint32_t peer_clearing_hashfn(const struct peer *p1) ++{ ++ return (uint32_t)((intptr_t)p1 & 0xffffffffULL); ++} ++ ++/* ++ * Free a clearing batch: this really just does the memory cleanup; the ++ * clearing code is expected to manage the peer, dest, table, etc refcounts ++ */ ++static void bgp_clearing_batch_free(struct bgp *bgp, ++ struct bgp_clearing_info **pinfo) ++{ ++ struct bgp_clearing_info *cinfo = *pinfo; ++ struct bgp_clearing_dest *destinfo; ++ ++ if (bgp_clearing_info_anywhere(cinfo)) ++ bgp_clearing_info_del(&bgp->clearing_list, cinfo); ++ ++ while ((destinfo = bgp_clearing_destlist_pop(&cinfo->destlist)) != NULL) ++ XFREE(MTYPE_CLEARING_BATCH, destinfo); ++ ++ bgp_clearing_hash_fini(&cinfo->peers); ++ ++ XFREE(MTYPE_CLEARING_BATCH, *pinfo); ++} ++ ++/* ++ * Done with a peer that was part of a clearing batch ++ */ ++static void bgp_clearing_peer_done(struct peer *peer) ++{ ++ UNSET_FLAG(peer->flags, PEER_FLAG_CLEARING_BATCH); ++ ++ /* Tickle FSM to start moving again */ ++ BGP_EVENT_ADD(peer->connection, Clearing_Completed); ++ ++ peer_unlock(peer); /* bgp_clear_route */ ++} ++ ++/* ++ * Initialize a new batch struct for clearing peer(s) from the RIB ++ */ ++static void bgp_clearing_batch_begin(struct bgp *bgp) ++{ ++ struct bgp_clearing_info *cinfo; ++ ++ cinfo = XCALLOC(MTYPE_CLEARING_BATCH, sizeof(struct bgp_clearing_info)); ++ ++ cinfo->bgp = bgp; ++ ++ /* Init hash of peers and list of dests */ ++ bgp_clearing_hash_init(&cinfo->peers); ++ bgp_clearing_destlist_init(&cinfo->destlist); ++ ++ /* Batch is open for more peers */ ++ SET_FLAG(cinfo->flags, BGP_CLEARING_INFO_FLAG_OPEN); ++ ++ bgp_clearing_info_add_head(&bgp->clearing_list, cinfo); ++} ++ ++/* ++ * Close a batch of clearing peers, and begin working on the RIB ++ */ ++static void bgp_clearing_batch_end(struct bgp *bgp) ++{ ++ struct bgp_clearing_info *cinfo; ++ ++ cinfo = bgp_clearing_info_first(&bgp->clearing_list); ++ ++ assert(cinfo != NULL); ++ assert(CHECK_FLAG(cinfo->flags, BGP_CLEARING_INFO_FLAG_OPEN)); ++ ++ /* Batch is closed */ ++ 
UNSET_FLAG(cinfo->flags, BGP_CLEARING_INFO_FLAG_OPEN); ++ ++ /* If we have no peers to examine, just discard the batch info */ ++ if (bgp_clearing_hash_count(&cinfo->peers) == 0) { ++ bgp_clearing_batch_free(bgp, &cinfo); ++ return; ++ } ++ ++ /* Do a RIB walk for the current batch. If it finds dests/prefixes ++ * to work on, this will schedule a task to process ++ * the dests/prefixes in the batch. ++ */ ++ bgp_clear_route_batch(cinfo); ++ ++ /* If we found no prefixes/dests, just discard the batch, ++ * remembering that we're holding a ref for each peer. ++ */ ++ if (bgp_clearing_destlist_count(&cinfo->destlist) == 0) { ++ bgp_clearing_batch_completed(cinfo); ++ } ++} ++ ++/* Check whether a dest's peer is relevant to a clearing batch */ ++bool bgp_clearing_batch_check_peer(struct bgp_clearing_info *cinfo, ++ const struct peer *peer) ++{ ++ struct peer *p; ++ ++ p = bgp_clearing_hash_find(&cinfo->peers, peer); ++ return (p != NULL); ++} ++ ++/* ++ * Check whether a clearing batch has any dests to process ++ */ ++bool bgp_clearing_batch_dests_present(struct bgp_clearing_info *cinfo) ++{ ++ return (bgp_clearing_destlist_count(&cinfo->destlist) > 0); ++} ++ ++/* ++ * Done with a peer clearing batch; deal with refcounts, free memory ++ */ ++void bgp_clearing_batch_completed(struct bgp_clearing_info *cinfo) ++{ ++ struct peer *peer; ++ struct bgp_dest *dest; ++ struct bgp_clearing_dest *destinfo; ++ struct bgp_table *table; ++ ++ /* Ensure event is not scheduled */ ++ event_cancel_event(bm->master, &cinfo->t_sched); ++ ++ /* Remove all peers and un-ref */ ++ while ((peer = bgp_clearing_hash_pop(&cinfo->peers)) != NULL) ++ bgp_clearing_peer_done(peer); ++ ++ /* Remove any dests/prefixes and unlock */ ++ destinfo = bgp_clearing_destlist_pop(&cinfo->destlist); ++ while (destinfo) { ++ dest = destinfo->dest; ++ XFREE(MTYPE_CLEARING_BATCH, destinfo); ++ ++ table = bgp_dest_table(dest); ++ bgp_dest_unlock_node(dest); ++ bgp_table_unlock(table); ++ ++ destinfo = bgp_clearing_destlist_pop(&cinfo->destlist); ++ } ++ ++ /* Free memory */ ++ bgp_clearing_batch_free(cinfo->bgp, &cinfo); ++} ++ ++/* ++ * Add a prefix/dest to a clearing batch ++ */ ++void bgp_clearing_batch_add_dest(struct bgp_clearing_info *cinfo, ++ struct bgp_dest *dest) ++{ ++ struct bgp_clearing_dest *destinfo; ++ ++ destinfo = XCALLOC(MTYPE_CLEARING_BATCH, ++ sizeof(struct bgp_clearing_dest)); ++ ++ destinfo->dest = dest; ++ bgp_clearing_destlist_add_tail(&cinfo->destlist, destinfo); ++} ++ ++/* ++ * Return the next dest for batch clear processing ++ */ ++struct bgp_dest *bgp_clearing_batch_next_dest(struct bgp_clearing_info *cinfo) ++{ ++ struct bgp_clearing_dest *destinfo; ++ struct bgp_dest *dest = NULL; ++ ++ destinfo = bgp_clearing_destlist_pop(&cinfo->destlist); ++ if (destinfo) { ++ dest = destinfo->dest; ++ XFREE(MTYPE_CLEARING_BATCH, destinfo); ++ } ++ ++ return dest; ++} ++ ++/* If a clearing batch is available for 'peer', add it and return 'true', ++ * else return 'false'. 
++ */ ++bool bgp_clearing_batch_add_peer(struct bgp *bgp, struct peer *peer) ++{ ++ struct bgp_clearing_info *cinfo; ++ ++ cinfo = bgp_clearing_info_first(&bgp->clearing_list); ++ ++ if (cinfo && CHECK_FLAG(cinfo->flags, BGP_CLEARING_INFO_FLAG_OPEN)) { ++ if (!CHECK_FLAG(peer->flags, PEER_FLAG_CLEARING_BATCH)) { ++ /* Add a peer ref */ ++ peer_lock(peer); ++ ++ bgp_clearing_hash_add(&cinfo->peers, peer); ++ SET_FLAG(peer->flags, PEER_FLAG_CLEARING_BATCH); ++ } ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * Task callback in the main pthread to handle socket errors ++ * encountered in the io pthread. We avoid having the io pthread try ++ * to enqueue fsm events or mess with the peer struct. ++ */ ++ ++/* TODO -- should this be configurable? */ ++/* Max number of peers to process without rescheduling */ ++#define BGP_CONN_ERROR_DEQUEUE_MAX 10 ++ ++static void bgp_process_conn_error(struct event *event) ++{ ++ struct bgp *bgp; ++ struct peer *peer; ++ struct peer_connection *connection; ++ int counter = 0; ++ size_t list_count = 0; ++ bool more_p = false; ++ ++ bgp = EVENT_ARG(event); ++ ++ frr_with_mutex (&bgp->peer_errs_mtx) { ++ peer = bgp_peer_conn_errlist_pop(&bgp->peer_conn_errlist); ++ ++ list_count = ++ bgp_peer_conn_errlist_count(&bgp->peer_conn_errlist); ++ } ++ ++ /* If we have multiple peers with errors, try to batch some ++ * clearing work. ++ */ ++ if (list_count > 0) ++ bgp_clearing_batch_begin(bgp); ++ ++ /* Dequeue peers from the error list */ ++ while (peer != NULL) { ++ connection = peer->connection; ++ ++ if (bgp_debug_neighbor_events(peer)) ++ zlog_debug("%s [Event] BGP error %d on fd %d", ++ peer->host, peer->connection_errcode, ++ connection->fd); ++ ++ /* Closed connection or error on the socket */ ++ if (peer_established(connection)) { ++ if ((CHECK_FLAG(peer->flags, PEER_FLAG_GRACEFUL_RESTART) ++ || CHECK_FLAG(peer->flags, ++ PEER_FLAG_GRACEFUL_RESTART_HELPER)) ++ && CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_MODE)) { ++ peer->last_reset = PEER_DOWN_NSF_CLOSE_SESSION; ++ SET_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT); ++ } else ++ peer->last_reset = PEER_DOWN_CLOSE_SESSION; ++ } ++ ++ /* No need for keepalives, if enabled */ ++ bgp_keepalives_off(peer->connection); ++ ++ /* Drive into state-machine changes */ ++ bgp_event_update(connection, peer->connection_errcode); ++ ++ counter++; ++ if (counter >= BGP_CONN_ERROR_DEQUEUE_MAX) ++ break; ++ ++ peer = bgp_dequeue_conn_err_peer(bgp, &more_p); ++ } ++ ++ /* Reschedule event if necessary */ ++ if (more_p) ++ bgp_conn_err_reschedule(bgp); ++ ++ /* Done with a clearing batch */ ++ if (list_count > 0) ++ bgp_clearing_batch_end(bgp); ++ ++ if (bgp_debug_neighbor_events(NULL)) ++ zlog_debug("%s: dequeued and processed %d peers", __func__, ++ counter); ++} ++ + /* +- * Enqueue a peer with a connection error to be handled in the main pthread ++ * Enqueue a peer with a connection error to be handled in the main pthread; ++ * this is called from the io pthread. 
+ */ + int bgp_enqueue_conn_err_peer(struct bgp *bgp, struct peer *peer, int errcode) + { +@@ -8754,7 +9071,7 @@ int bgp_enqueue_conn_err_peer(struct bgp *bgp, struct peer *peer, int errcode) + } + } + /* Ensure an event is scheduled */ +- event_add_event(bm->master, bgp_packet_process_error, bgp, 0, ++ event_add_event(bm->master, bgp_process_conn_error, bgp, 0, + &bgp->t_conn_errors); + return 0; + } +@@ -8788,7 +9105,7 @@ struct peer *bgp_dequeue_conn_err_peer(struct bgp *bgp, bool *more_p) + */ + void bgp_conn_err_reschedule(struct bgp *bgp) + { +- event_add_event(bm->master, bgp_packet_process_error, bgp, 0, ++ event_add_event(bm->master, bgp_process_conn_error, bgp, 0, + &bgp->t_conn_errors); + } + +diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h +index bc15c0662..ff3f1b9f1 100644 +--- a/bgpd/bgpd.h ++++ b/bgpd/bgpd.h +@@ -354,32 +354,54 @@ struct bgp_mplsvpn_nh_label_bind_cache; + PREDECL_RBTREE_UNIQ(bgp_mplsvpn_nh_label_bind_cache); + + /* List of peers that have connection errors in the io pthread */ +-PREDECL_LIST(bgp_peer_conn_errlist); ++PREDECL_DLIST(bgp_peer_conn_errlist); + + /* List of info about peers that are being cleared from BGP RIBs in a batch */ +-PREDECL_LIST(bgp_clearing_info); ++PREDECL_DLIST(bgp_clearing_info); + + /* Hash of peers in clearing info object */ + PREDECL_HASH(bgp_clearing_hash); + ++/* List of dests that need to be processed in a clearing batch */ ++PREDECL_LIST(bgp_clearing_destlist); ++ ++struct bgp_clearing_dest { ++ struct bgp_dest *dest; ++ struct bgp_clearing_destlist_item link; ++}; ++ + /* Info about a batch of peers that need to be cleared from the RIB. + * If many peers need to be cleared, we process them in batches, taking +- * one walk through the RIB for each batch. ++ * one walk through the RIB for each batch. This is only used for "all" ++ * afi/safis, typically when processing peer connection errors. + */ + struct bgp_clearing_info { ++ /* Owning bgp instance */ ++ struct bgp *bgp; ++ + /* Hash of peers */ + struct bgp_clearing_hash_head peers; + ++ /* Flags */ ++ uint32_t flags; ++ ++ /* List of dests - wrapped by a small wrapper struct */ ++ struct bgp_clearing_destlist_head destlist; ++ + /* Event to schedule/reschedule processing */ +- struct thread *t_sched; ++ struct event *t_sched; ++ ++ /* TODO -- id, serial number, for debugging/logging? */ + +- /* RIB dest for rescheduling */ +- struct bgp_dest *last_dest; ++ /* TODO -- info for rescheduling the RIB walk? future? */ + +- /* Linkage for list of batches per-bgp */ ++ /* Linkage for list of batches per bgp */ + struct bgp_clearing_info_item link; + }; + ++/* Batch is open, new peers can be added */ ++#define BGP_CLEARING_INFO_FLAG_OPEN (1 << 0) ++ + /* BGP instance structure. */ + struct bgp { + /* AS number of this BGP instance. */ +@@ -1513,6 +1535,8 @@ struct peer { + #define PEER_FLAG_GRACEFUL_SHUTDOWN (1ULL << 35) + #define PEER_FLAG_CAPABILITY_SOFT_VERSION (1ULL << 36) + #define PEER_FLAG_CAPABILITY_FQDN (1ULL << 37) /* fqdn capability */ ++/* Peer is part of a batch clearing its routes */ ++#define PEER_FLAG_CLEARING_BATCH (1ULL << 41) + + /* + *GR-Disabled mode means unset PEER_FLAG_GRACEFUL_RESTART +@@ -2815,6 +2839,23 @@ extern bool bgp_path_attribute_treat_as_withdraw(struct peer *peer, char *buf, + + extern void srv6_function_free(struct bgp_srv6_function *func); + ++/* If a clearing batch is available for 'peer', add it and return 'true', ++ * else return 'false'. 
++ */ ++bool bgp_clearing_batch_add_peer(struct bgp *bgp, struct peer *peer); ++/* Add a prefix/dest to a clearing batch */ ++void bgp_clearing_batch_add_dest(struct bgp_clearing_info *cinfo, ++ struct bgp_dest *dest); ++/* Check whether a dest's peer is relevant to a clearing batch */ ++bool bgp_clearing_batch_check_peer(struct bgp_clearing_info *cinfo, ++ const struct peer *peer); ++/* Check whether a clearing batch has any dests to process */ ++bool bgp_clearing_batch_dests_present(struct bgp_clearing_info *cinfo); ++/* Returns the next dest for batch clear processing */ ++struct bgp_dest *bgp_clearing_batch_next_dest(struct bgp_clearing_info *cinfo); ++/* Done with a peer clearing batch; deal with refcounts, free memory */ ++void bgp_clearing_batch_completed(struct bgp_clearing_info *cinfo); ++ + #ifdef _FRR_ATTRIBUTE_PRINTFRR + /* clang-format off */ + #pragma FRR printfrr_ext "%pBP" (struct peer *) +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0087-zebra-move-peer-conn-error-list-to-connection-struct.patch b/src/sonic-frr/patch/0087-zebra-move-peer-conn-error-list-to-connection-struct.patch new file mode 100644 index 000000000000..05812d075474 --- /dev/null +++ b/src/sonic-frr/patch/0087-zebra-move-peer-conn-error-list-to-connection-struct.patch @@ -0,0 +1,210 @@ +From e9eee90c3f68dc83a92f519841732c951948875b Mon Sep 17 00:00:00 2001 +From: Mark Stapp +Date: Thu, 7 Nov 2024 10:55:19 -0500 +Subject: [PATCH 4/4] zebra: move peer conn error list to connection struct + +Move the peer connection error list to the peer_connection +struct; that seems to line up better with the way that struct +works. + +Signed-off-by: Mark Stapp + +diff --git a/bgpd/bgp_io.c b/bgpd/bgp_io.c +index 4d5e1d9c5..31aba90d2 100644 +--- a/bgpd/bgp_io.c ++++ b/bgpd/bgp_io.c +@@ -251,7 +251,7 @@ static void bgp_process_reads(struct event *thread) + /* Handle the error in the main pthread, include the + * specific state change from 'bgp_read'. 
+ */ +- bgp_enqueue_conn_err_peer(peer->bgp, connection->peer, code); ++ bgp_enqueue_conn_err(peer->bgp, connection, code); + goto done; + } + +diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c +index e3f103724..7d5e2b5c6 100644 +--- a/bgpd/bgpd.c ++++ b/bgpd/bgpd.c +@@ -87,7 +87,7 @@ DEFINE_QOBJ_TYPE(peer); + DEFINE_HOOK(bgp_inst_delete, (struct bgp *bgp), (bgp)); + + /* Peers with connection error/failure, per bgp instance */ +-DECLARE_DLIST(bgp_peer_conn_errlist, struct peer, conn_err_link); ++DECLARE_DLIST(bgp_peer_conn_errlist, struct peer_connection, conn_err_link); + + /* List of info about peers that are being cleared from BGP RIBs in a batch */ + DECLARE_DLIST(bgp_clearing_info, struct bgp_clearing_info, link); +@@ -2722,9 +2722,9 @@ int peer_delete(struct peer *peer) + + /* Ensure the peer is removed from the connection error list */ + frr_with_mutex (&bgp->peer_errs_mtx) { +- if (bgp_peer_conn_errlist_anywhere(peer)) ++ if (bgp_peer_conn_errlist_anywhere(peer->connection)) + bgp_peer_conn_errlist_del(&bgp->peer_conn_errlist, +- peer); ++ peer->connection); + } + + if (CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT)) +@@ -3924,6 +3924,8 @@ int bgp_delete(struct bgp *bgp) + struct bgp_table *dest_table = NULL; + struct graceful_restart_info *gr_info; + uint32_t cnt_before, cnt_after; ++ struct bgp_clearing_info *cinfo; ++ struct peer_connection *connection; + + assert(bgp); + +@@ -4044,9 +4046,9 @@ int bgp_delete(struct bgp *bgp) + */ + frr_with_mutex (&bgp->peer_errs_mtx) { + do { +- peer = bgp_peer_conn_errlist_pop( ++ connection = bgp_peer_conn_errlist_pop( + &bgp->peer_conn_errlist); +- } while (peer != NULL); ++ } while (connection != NULL); + } + + /* Free peers and peer-groups. */ +@@ -8998,7 +9000,7 @@ static void bgp_process_conn_error(struct event *event) + bgp = EVENT_ARG(event); + + frr_with_mutex (&bgp->peer_errs_mtx) { +- peer = bgp_peer_conn_errlist_pop(&bgp->peer_conn_errlist); ++ connection = bgp_peer_conn_errlist_pop(&bgp->peer_conn_errlist); + + list_count = + bgp_peer_conn_errlist_count(&bgp->peer_conn_errlist); +@@ -9011,12 +9013,12 @@ static void bgp_process_conn_error(struct event *event) + bgp_clearing_batch_begin(bgp); + + /* Dequeue peers from the error list */ +- while (peer != NULL) { +- connection = peer->connection; ++ while (connection != NULL) { ++ peer = connection->peer; + + if (bgp_debug_neighbor_events(peer)) + zlog_debug("%s [Event] BGP error %d on fd %d", +- peer->host, peer->connection_errcode, ++ peer->host, connection->connection_errcode, + connection->fd); + + /* Closed connection or error on the socket */ +@@ -9035,13 +9037,13 @@ static void bgp_process_conn_error(struct event *event) + bgp_keepalives_off(peer->connection); + + /* Drive into state-machine changes */ +- bgp_event_update(connection, peer->connection_errcode); ++ bgp_event_update(connection, connection->connection_errcode); + + counter++; + if (counter >= BGP_CONN_ERROR_DEQUEUE_MAX) + break; + +- peer = bgp_dequeue_conn_err_peer(bgp, &more_p); ++ connection = bgp_dequeue_conn_err(bgp, &more_p); + } + + /* Reschedule event if necessary */ +@@ -9058,18 +9060,19 @@ static void bgp_process_conn_error(struct event *event) + } + + /* +- * Enqueue a peer with a connection error to be handled in the main pthread; ++ * Enqueue a connection with an error to be handled in the main pthread; + * this is called from the io pthread. 
+ */ +-int bgp_enqueue_conn_err_peer(struct bgp *bgp, struct peer *peer, int errcode) ++int bgp_enqueue_conn_err(struct bgp *bgp, struct peer_connection *connection, ++ int errcode) + { + frr_with_mutex (&bgp->peer_errs_mtx) { +- peer->connection_errcode = errcode; ++ connection->connection_errcode = errcode; + + /* Careful not to double-enqueue */ +- if (!bgp_peer_conn_errlist_anywhere(peer)) { ++ if (!bgp_peer_conn_errlist_anywhere(connection)) { + bgp_peer_conn_errlist_add_tail(&bgp->peer_conn_errlist, +- peer); ++ connection); + } + } + /* Ensure an event is scheduled */ +@@ -9079,16 +9082,16 @@ int bgp_enqueue_conn_err_peer(struct bgp *bgp, struct peer *peer, int errcode) + } + + /* +- * Dequeue a peer that encountered a connection error; signal whether there ++ * Dequeue a connection that encountered a connection error; signal whether there + * are more queued peers. + */ +-struct peer *bgp_dequeue_conn_err_peer(struct bgp *bgp, bool *more_p) ++struct peer_connection *bgp_dequeue_conn_err(struct bgp *bgp, bool *more_p) + { +- struct peer *peer = NULL; ++ struct peer_connection *connection = NULL; + bool more = false; + + frr_with_mutex (&bgp->peer_errs_mtx) { +- peer = bgp_peer_conn_errlist_pop(&bgp->peer_conn_errlist); ++ connection = bgp_peer_conn_errlist_pop(&bgp->peer_conn_errlist); + + if (bgp_peer_conn_errlist_const_first( + &bgp->peer_conn_errlist) != NULL) +@@ -9098,7 +9101,7 @@ struct peer *bgp_dequeue_conn_err_peer(struct bgp *bgp, bool *more_p) + if (more_p) + *more_p = more; + +- return peer; ++ return connection; + } + + /* +diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h +index ff3f1b9f1..b53ece40d 100644 +--- a/bgpd/bgpd.h ++++ b/bgpd/bgpd.h +@@ -1254,6 +1254,11 @@ struct peer_connection { + struct event *t_process_packet; + + struct event *t_stop_with_notify; ++ /* Linkage for list connections with errors, from IO pthread */ ++ struct bgp_peer_conn_errlist_item conn_err_link; ++ ++ /* Connection error code */ ++ uint16_t connection_errcode; + + union sockunion su; + #define BGP_CONNECTION_SU_UNSPEC(connection) \ +@@ -1921,12 +1926,6 @@ struct peer { + /* Add-Path Best selected paths number to advertise */ + uint8_t addpath_best_selected[AFI_MAX][SAFI_MAX]; + +- /* Linkage for list of peers with connection errors from IO pthread */ +- struct bgp_peer_conn_errlist_item conn_err_link; +- +- /* Connection error code */ +- uint16_t connection_errcode; +- + /* Linkage for hash of clearing peers being cleared in a batch */ + struct bgp_clearing_hash_item clear_hash_link; + +@@ -2565,8 +2564,9 @@ int bgp_global_gr_init(struct bgp *bgp); + int bgp_peer_gr_init(struct peer *peer); + + /* APIs for the per-bgp peer connection error list */ +-int bgp_enqueue_conn_err_peer(struct bgp *bgp, struct peer *peer, int errcode); +-struct peer *bgp_dequeue_conn_err_peer(struct bgp *bgp, bool *more_p); ++int bgp_enqueue_conn_err(struct bgp *bgp, struct peer_connection *connection, ++ int errcode); ++struct peer_connection *bgp_dequeue_conn_err(struct bgp *bgp, bool *more_p); + void bgp_conn_err_reschedule(struct bgp *bgp); + + #define BGP_GR_ROUTER_DETECT_AND_SEND_CAPABILITY_TO_ZEBRA(_bgp, _peer_list) \ +-- +2.43.2 + diff --git a/src/sonic-frr/patch/series b/src/sonic-frr/patch/series index 6f7c40d653af..7b16c04e89fe 100644 --- a/src/sonic-frr/patch/series +++ b/src/sonic-frr/patch/series @@ -64,3 +64,7 @@ 0081-bgpd-Optimize-evaluate-paths-for-a-peer-going-down.patch 0082-Revert-bgpd-upon-if-event-evaluate-bnc-with-matching.patch 
0083-staticd-add-cli-to-support-steering-of-ipv4-traffic-over-srv6-sid-list.patch +0084-bgpd-Replace-per-peer-connection-error-with-per-bgp.patch +0085-bgpd-remove-apis-from-bgp_route.h.patch +0086-bgpd-batch-peer-connection-error-clearing.patch +0087-zebra-move-peer-conn-error-list-to-connection-struct.patch
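
Note on the mechanism (editor's illustrative sketch, not part of the patches above):
patches 0084 and 0086 replace the old one-event-per-peer socket-error handling with
a per-bgp-instance pattern: the io pthread records a failed connection on a
mutex-guarded list and schedules a single event in the main pthread, which drains
the list in bounded batches (BGP_CONN_ERROR_DEQUEUE_MAX peers per pass) and
reschedules itself while work remains. The sketch below shows that shape as a
self-contained C program, run single-threaded for brevity; every identifier in it
(err_queue, enqueue_err, drain_batch) is hypothetical, and the real code uses FRR's
typesafe DLIST macros, frr_with_mutex, and the event loop rather than this
hand-rolled queue.

/*
 * Minimal sketch of the producer/consumer discipline introduced by
 * 0084/0086.  Hypothetical names throughout; not FRR APIs.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define DEQUEUE_MAX 10 /* mirrors BGP_CONN_ERROR_DEQUEUE_MAX */

struct err_peer {
	int fd;
	int errcode;
	struct err_peer *next;
};

struct err_queue {
	pthread_mutex_t mtx;
	struct err_peer *head, *tail;
};

/* io-pthread side: record the error under the mutex, cf.
 * bgp_enqueue_conn_err_peer(); the real code also schedules the
 * main-pthread event here (event_add_event(..., &bgp->t_conn_errors)).
 */
static void enqueue_err(struct err_queue *q, int fd, int errcode)
{
	struct err_peer *p = calloc(1, sizeof(*p));

	p->fd = fd;
	p->errcode = errcode;
	pthread_mutex_lock(&q->mtx);
	if (q->tail)
		q->tail->next = p;
	else
		q->head = p;
	q->tail = p;
	pthread_mutex_unlock(&q->mtx);
}

/* main-pthread side: drain a bounded batch and report whether peers
 * remain, cf. bgp_dequeue_conn_err_peer() + bgp_packet_process_error().
 * A nonzero return means the caller should "reschedule" this task.
 */
static int drain_batch(struct err_queue *q)
{
	int counter = 0, more = 0;

	for (;;) {
		struct err_peer *p;

		pthread_mutex_lock(&q->mtx);
		p = q->head;
		if (p) {
			q->head = p->next;
			if (!q->head)
				q->tail = NULL;
		}
		more = (q->head != NULL);
		pthread_mutex_unlock(&q->mtx);

		if (!p)
			break;
		/* stand-in for the FSM work: bgp_event_update() etc. */
		printf("error %d on fd %d\n", p->errcode, p->fd);
		free(p);
		if (++counter >= DEQUEUE_MAX)
			break; /* real code: bgp_conn_err_reschedule() */
	}
	return more;
}

int main(void)
{
	struct err_queue q = { .mtx = PTHREAD_MUTEX_INITIALIZER };

	for (int i = 0; i < 25; i++)
		enqueue_err(&q, 100 + i, 104);
	while (drain_batch(&q))
		; /* keep "rescheduling" until the list is empty */
	return 0;
}

The bounded drain is the key design choice: when many sessions fail at once (for
example after a fabric-wide link flap, the scenario in the outer commit subject),
one unbounded callback would monopolize the main pthread, whereas rescheduling
after each small batch lets keepalives, timers, and the surviving sessions be
serviced in between — which is what keeps the flapped BGP sessions stable while
they are cleared and re-established.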