From 6569050edf1391a5f37b48435a4c507af8460ab8 Mon Sep 17 00:00:00 2001 From: dgsudharsan Date: Mon, 17 Mar 2025 21:07:52 +0000 Subject: [PATCH] [FRR]Fixing BGP session not being stable after flapping --- ...r-peer-connection-error-with-per-bgp.patch | 367 ++++++++ ...85-bgpd-remove-apis-from-bgp_route.h.patch | 26 + ...batch-peer-connection-error-clearing.patch | 874 ++++++++++++++++++ ...conn-error-list-to-connection-struct.patch | 210 +++++ src/sonic-frr/patch/series | 4 + 5 files changed, 1481 insertions(+) create mode 100644 src/sonic-frr/patch/0084-bgpd-Replace-per-peer-connection-error-with-per-bgp.patch create mode 100644 src/sonic-frr/patch/0085-bgpd-remove-apis-from-bgp_route.h.patch create mode 100644 src/sonic-frr/patch/0086-bgpd-batch-peer-connection-error-clearing.patch create mode 100644 src/sonic-frr/patch/0087-zebra-move-peer-conn-error-list-to-connection-struct.patch diff --git a/src/sonic-frr/patch/0084-bgpd-Replace-per-peer-connection-error-with-per-bgp.patch b/src/sonic-frr/patch/0084-bgpd-Replace-per-peer-connection-error-with-per-bgp.patch new file mode 100644 index 000000000000..794ea13b916b --- /dev/null +++ b/src/sonic-frr/patch/0084-bgpd-Replace-per-peer-connection-error-with-per-bgp.patch @@ -0,0 +1,367 @@ +From 655c84f5ca5534323ae8accb8d313615440ab034 Mon Sep 17 00:00:00 2001 +From: Mark Stapp +Date: Thu, 26 Sep 2024 11:09:35 -0400 +Subject: [PATCH 1/4] bgpd: Replace per-peer connection error with per-bgp + +Replace the per-peer connection error with a per-bgp event and +a list. The io pthread enqueues peers per-bgp-instance, and the +error-handing code can process multiple peers if there have been +multiple failures. + +Signed-off-by: Mark Stapp + +diff --git a/bgpd/bgp_io.c b/bgpd/bgp_io.c +index b07e69ac3..4d5e1d9c5 100644 +--- a/bgpd/bgp_io.c ++++ b/bgpd/bgp_io.c +@@ -100,7 +100,6 @@ void bgp_reads_off(struct peer_connection *connection) + + event_cancel_async(fpt->master, &connection->t_read, NULL); + EVENT_OFF(connection->t_process_packet); +- EVENT_OFF(connection->t_process_packet_error); + + UNSET_FLAG(connection->thread_flags, PEER_THREAD_READS_ON); + } +@@ -252,8 +251,7 @@ static void bgp_process_reads(struct event *thread) + /* Handle the error in the main pthread, include the + * specific state change from 'bgp_read'. + */ +- event_add_event(bm->master, bgp_packet_process_error, connection, +- code, &connection->t_process_packet_error); ++ bgp_enqueue_conn_err_peer(peer->bgp, connection->peer, code); + goto done; + } + +diff --git a/bgpd/bgp_packet.c b/bgpd/bgp_packet.c +index 2e682c773..26c00cfe1 100644 +--- a/bgpd/bgp_packet.c ++++ b/bgpd/bgp_packet.c +@@ -4003,35 +4003,60 @@ void bgp_send_delayed_eor(struct bgp *bgp) + } + + /* +- * Task callback to handle socket error encountered in the io pthread. We avoid +- * having the io pthread try to enqueue fsm events or mess with the peer +- * struct. ++ * Task callback in the main pthread to handle socket error ++ * encountered in the io pthread. We avoid having the io pthread try ++ * to enqueue fsm events or mess with the peer struct. 
+ */ ++ ++/* Max number of peers to process without rescheduling */ ++#define BGP_CONN_ERROR_DEQUEUE_MAX 10 ++ + void bgp_packet_process_error(struct event *thread) + { + struct peer_connection *connection; + struct peer *peer; +- int code; ++ struct bgp *bgp; ++ int counter = 0; ++ bool more_p = false; + +- connection = EVENT_ARG(thread); +- peer = connection->peer; +- code = EVENT_VAL(thread); ++ bgp = EVENT_ARG(thread); + +- if (bgp_debug_neighbor_events(peer)) +- zlog_debug("%s [Event] BGP error %d on fd %d", peer->host, code, +- connection->fd); +- +- /* Closed connection or error on the socket */ +- if (peer_established(connection)) { +- if ((CHECK_FLAG(peer->flags, PEER_FLAG_GRACEFUL_RESTART) +- || CHECK_FLAG(peer->flags, +- PEER_FLAG_GRACEFUL_RESTART_HELPER)) +- && CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_MODE)) { +- peer->last_reset = PEER_DOWN_NSF_CLOSE_SESSION; +- SET_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT); +- } else +- peer->last_reset = PEER_DOWN_CLOSE_SESSION; ++ /* Dequeue peers from the error list */ ++ while ((peer = bgp_dequeue_conn_err_peer(bgp, &more_p)) != NULL) { ++ connection = peer->connection; ++ ++ if (bgp_debug_neighbor_events(peer)) ++ zlog_debug("%s [Event] BGP error %d on fd %d", ++ peer->host, peer->connection_errcode, ++ connection->fd); ++ ++ /* Closed connection or error on the socket */ ++ if (peer_established(connection)) { ++ if ((CHECK_FLAG(peer->flags, PEER_FLAG_GRACEFUL_RESTART) ++ || CHECK_FLAG(peer->flags, ++ PEER_FLAG_GRACEFUL_RESTART_HELPER)) ++ && CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_MODE)) { ++ peer->last_reset = PEER_DOWN_NSF_CLOSE_SESSION; ++ SET_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT); ++ } else ++ peer->last_reset = PEER_DOWN_CLOSE_SESSION; ++ } ++ ++ /* No need for keepalives, if enabled */ ++ bgp_keepalives_off(connection); ++ ++ bgp_event_update(connection, peer->connection_errcode); ++ ++ counter++; ++ if (counter >= BGP_CONN_ERROR_DEQUEUE_MAX) ++ break; + } + +- bgp_event_update(connection, code); ++ /* Reschedule event if necessary */ ++ if (more_p) ++ bgp_conn_err_reschedule(bgp); ++ ++ if (bgp_debug_neighbor_events(NULL)) ++ zlog_debug("%s: dequeued and processed %d peers", __func__, ++ counter); + } +diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c +index 591696de7..7159fe4df 100644 +--- a/bgpd/bgpd.c ++++ b/bgpd/bgpd.c +@@ -86,6 +86,9 @@ DEFINE_QOBJ_TYPE(bgp); + DEFINE_QOBJ_TYPE(peer); + DEFINE_HOOK(bgp_inst_delete, (struct bgp *bgp), (bgp)); + ++/* Peers with connection error/failure, per bgp instance */ ++DECLARE_LIST(bgp_peer_conn_errlist, struct peer, conn_err_link); ++ + /* BGP process wide configuration. 
*/ + static struct bgp_master bgp_master; + +@@ -2684,6 +2687,9 @@ int peer_delete(struct peer *peer) + + assert(peer->connection->status != Deleted); + ++ if (bgp_debug_neighbor_events(peer)) ++ zlog_debug("%s: peer %pBP", __func__, peer); ++ + bgp = peer->bgp; + accept_peer = CHECK_FLAG(peer->sflags, PEER_STATUS_ACCEPT_PEER); + +@@ -2701,6 +2707,13 @@ int peer_delete(struct peer *peer) + PEER_THREAD_READS_ON)); + assert(!CHECK_FLAG(peer->thread_flags, PEER_THREAD_KEEPALIVES_ON)); + ++ /* Ensure the peer is removed from the connection error list */ ++ frr_with_mutex (&bgp->peer_errs_mtx) { ++ if (bgp_peer_conn_errlist_anywhere(peer)) ++ bgp_peer_conn_errlist_del(&bgp->peer_conn_errlist, ++ peer); ++ } ++ + if (CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT)) + peer_nsf_stop(peer); + +@@ -3567,6 +3580,10 @@ static struct bgp *bgp_create(as_t *as, const char *name, + memset(&bgp->ebgprequirespolicywarning, 0, + sizeof(bgp->ebgprequirespolicywarning)); + ++ /* Init peer connection error info */ ++ pthread_mutex_init(&bgp->peer_errs_mtx, NULL); ++ bgp_peer_conn_errlist_init(&bgp->peer_conn_errlist); ++ + return bgp; + } + +@@ -4000,6 +4017,18 @@ int bgp_delete(struct bgp *bgp) + if (i != ZEBRA_ROUTE_BGP) + bgp_redistribute_unset(bgp, afi, i, 0); + ++ /* Clear list of peers with connection errors - each ++ * peer will need to check again, in case the io pthread is racing ++ * with us, but this batch cleanup should make the per-peer check ++ * cheaper. ++ */ ++ frr_with_mutex (&bgp->peer_errs_mtx) { ++ do { ++ peer = bgp_peer_conn_errlist_pop( ++ &bgp->peer_conn_errlist); ++ } while (peer != NULL); ++ } ++ + /* Free peers and peer-groups. */ + for (ALL_LIST_ELEMENTS(bgp->group, node, next, group)) + peer_group_delete(group); +@@ -4016,6 +4045,9 @@ int bgp_delete(struct bgp *bgp) + + update_bgp_group_free(bgp); + ++ /* Cancel peer connection errors event */ ++ EVENT_OFF(bgp->t_conn_errors); ++ + /* TODO - Other memory may need to be freed - e.g., NHT */ + + #ifdef ENABLE_BGP_VNC +@@ -4168,6 +4200,9 @@ void bgp_free(struct bgp *bgp) + bgp_srv6_cleanup(bgp); + bgp_confederation_id_unset(bgp); + ++ bgp_peer_conn_errlist_init(&bgp->peer_conn_errlist); ++ pthread_mutex_destroy(&bgp->peer_errs_mtx); ++ + for (int i = 0; i < bgp->confed_peers_cnt; i++) + XFREE(MTYPE_BGP_NAME, bgp->confed_peers[i].as_pretty); + +@@ -8704,6 +8739,59 @@ void bgp_gr_apply_running_config(void) + } + } + ++/* ++ * Enqueue a peer with a connection error to be handled in the main pthread ++ */ ++int bgp_enqueue_conn_err_peer(struct bgp *bgp, struct peer *peer, int errcode) ++{ ++ frr_with_mutex (&bgp->peer_errs_mtx) { ++ peer->connection_errcode = errcode; ++ ++ /* Careful not to double-enqueue */ ++ if (!bgp_peer_conn_errlist_anywhere(peer)) { ++ bgp_peer_conn_errlist_add_tail(&bgp->peer_conn_errlist, ++ peer); ++ } ++ } ++ /* Ensure an event is scheduled */ ++ event_add_event(bm->master, bgp_packet_process_error, bgp, 0, ++ &bgp->t_conn_errors); ++ return 0; ++} ++ ++/* ++ * Dequeue a peer that encountered a connection error; signal whether there ++ * are more queued peers. 
++ */ ++struct peer *bgp_dequeue_conn_err_peer(struct bgp *bgp, bool *more_p) ++{ ++ struct peer *peer = NULL; ++ bool more = false; ++ ++ frr_with_mutex (&bgp->peer_errs_mtx) { ++ peer = bgp_peer_conn_errlist_pop(&bgp->peer_conn_errlist); ++ ++ if (bgp_peer_conn_errlist_const_first( ++ &bgp->peer_conn_errlist) != NULL) ++ more = true; ++ } ++ ++ if (more_p) ++ *more_p = more; ++ ++ return peer; ++} ++ ++/* ++ * Reschedule the connection error event - probably after processing ++ * some of the peers on the list. ++ */ ++void bgp_conn_err_reschedule(struct bgp *bgp) ++{ ++ event_add_event(bm->master, bgp_packet_process_error, bgp, 0, ++ &bgp->t_conn_errors); ++} ++ + printfrr_ext_autoreg_p("BP", printfrr_bp); + static ssize_t printfrr_bp(struct fbuf *buf, struct printfrr_eargs *ea, + const void *ptr) +diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h +index 0341ff68e..bc15c0662 100644 +--- a/bgpd/bgpd.h ++++ b/bgpd/bgpd.h +@@ -353,6 +353,33 @@ struct as_confed { + struct bgp_mplsvpn_nh_label_bind_cache; + PREDECL_RBTREE_UNIQ(bgp_mplsvpn_nh_label_bind_cache); + ++/* List of peers that have connection errors in the io pthread */ ++PREDECL_LIST(bgp_peer_conn_errlist); ++ ++/* List of info about peers that are being cleared from BGP RIBs in a batch */ ++PREDECL_LIST(bgp_clearing_info); ++ ++/* Hash of peers in clearing info object */ ++PREDECL_HASH(bgp_clearing_hash); ++ ++/* Info about a batch of peers that need to be cleared from the RIB. ++ * If many peers need to be cleared, we process them in batches, taking ++ * one walk through the RIB for each batch. ++ */ ++struct bgp_clearing_info { ++ /* Hash of peers */ ++ struct bgp_clearing_hash_head peers; ++ ++ /* Event to schedule/reschedule processing */ ++ struct thread *t_sched; ++ ++ /* RIB dest for rescheduling */ ++ struct bgp_dest *last_dest; ++ ++ /* Linkage for list of batches per-bgp */ ++ struct bgp_clearing_info_item link; ++}; ++ + /* BGP instance structure. */ + struct bgp { + /* AS number of this BGP instance. */ +@@ -827,6 +854,21 @@ struct bgp { + uint16_t tcp_keepalive_intvl; + uint16_t tcp_keepalive_probes; + ++ /* List of peers that have connection errors in the IO pthread */ ++ struct bgp_peer_conn_errlist_head peer_conn_errlist; ++ ++ /* Mutex that guards the connection-errors list */ ++ pthread_mutex_t peer_errs_mtx; ++ ++ /* Event indicating that there have been connection errors; this ++ * is typically signalled in the IO pthread; it's handled in the ++ * main pthread. 
++ */ ++ struct event *t_conn_errors; ++ ++ /* List of batches of peers being cleared from BGP RIBs */ ++ struct bgp_clearing_info_head clearing_list; ++ + struct timeval ebgprequirespolicywarning; + #define FIFTEENMINUTE2USEC (int64_t)15 * 60 * 1000000 + +@@ -1188,7 +1230,6 @@ struct peer_connection { + + struct event *t_routeadv; + struct event *t_process_packet; +- struct event *t_process_packet_error; + + struct event *t_stop_with_notify; + +@@ -1856,6 +1897,15 @@ struct peer { + /* Add-Path Best selected paths number to advertise */ + uint8_t addpath_best_selected[AFI_MAX][SAFI_MAX]; + ++ /* Linkage for list of peers with connection errors from IO pthread */ ++ struct bgp_peer_conn_errlist_item conn_err_link; ++ ++ /* Connection error code */ ++ uint16_t connection_errcode; ++ ++ /* Linkage for hash of clearing peers being cleared in a batch */ ++ struct bgp_clearing_hash_item clear_hash_link; ++ + QOBJ_FIELDS; + }; + DECLARE_QOBJ_TYPE(peer); +@@ -2490,6 +2540,10 @@ void bgp_gr_apply_running_config(void); + int bgp_global_gr_init(struct bgp *bgp); + int bgp_peer_gr_init(struct peer *peer); + ++/* APIs for the per-bgp peer connection error list */ ++int bgp_enqueue_conn_err_peer(struct bgp *bgp, struct peer *peer, int errcode); ++struct peer *bgp_dequeue_conn_err_peer(struct bgp *bgp, bool *more_p); ++void bgp_conn_err_reschedule(struct bgp *bgp); + + #define BGP_GR_ROUTER_DETECT_AND_SEND_CAPABILITY_TO_ZEBRA(_bgp, _peer_list) \ + do { \ +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0085-bgpd-remove-apis-from-bgp_route.h.patch b/src/sonic-frr/patch/0085-bgpd-remove-apis-from-bgp_route.h.patch new file mode 100644 index 000000000000..f5d058cf1a7f --- /dev/null +++ b/src/sonic-frr/patch/0085-bgpd-remove-apis-from-bgp_route.h.patch @@ -0,0 +1,26 @@ +From 8ee8e2511328e8c27ad7c16bd0854cc4335df399 Mon Sep 17 00:00:00 2001 +From: Mark Stapp +Date: Tue, 1 Oct 2024 09:23:26 -0400 +Subject: [PATCH 2/4] bgpd: remove apis from bgp_route.h + +Remove a couple of apis that don't exist. + +Signed-off-by: Mark Stapp + +diff --git a/bgpd/bgp_route.h b/bgpd/bgp_route.h +index 61492980a..39e282712 100644 +--- a/bgpd/bgp_route.h ++++ b/bgpd/bgp_route.h +@@ -885,9 +885,6 @@ extern bool subgroup_announce_check(struct bgp_dest *dest, + const struct prefix *p, struct attr *attr, + struct attr *post_attr); + +-extern void bgp_peer_clear_node_queue_drain_immediate(struct peer *peer); +-extern void bgp_process_queues_drain_immediate(void); +- + /* for encap/vpn */ + extern struct bgp_dest *bgp_safi_node_lookup(struct bgp_table *table, + safi_t safi, +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0086-bgpd-batch-peer-connection-error-clearing.patch b/src/sonic-frr/patch/0086-bgpd-batch-peer-connection-error-clearing.patch new file mode 100644 index 000000000000..17e5bcb6100f --- /dev/null +++ b/src/sonic-frr/patch/0086-bgpd-batch-peer-connection-error-clearing.patch @@ -0,0 +1,874 @@ +From 6134671a2980cf33aca723e81be642a48a69bb93 Mon Sep 17 00:00:00 2001 +From: Mark Stapp +Date: Tue, 1 Oct 2024 16:30:44 -0400 +Subject: [PATCH] bgpd: batch peer connection error clearing + +When peer connections encounter errors, attempt to batch some +of the clearing processing that occurs. Add a new batch object, +add multiple peers to it, if possible. Do one rib walk for the +batch, rather than one walk per peer. Use a handler callback +per batch to check and remove peers' path-infos, rather than +a work-queue and callback per peer. The original clearing code +remains; it's used for single peers. 
+ +Signed-off-by: Mark Stapp + +diff --git a/bgpd/bgp_fsm.c b/bgpd/bgp_fsm.c +index 650301163..254043407 100644 +--- a/bgpd/bgp_fsm.c ++++ b/bgpd/bgp_fsm.c +@@ -1263,7 +1263,8 @@ void bgp_fsm_change_status(struct peer_connection *connection, + * Clearing + * (or Deleted). + */ +- if (!work_queue_is_scheduled(peer->clear_node_queue) && ++ if (!CHECK_FLAG(peer->flags, PEER_FLAG_CLEARING_BATCH) && ++ !work_queue_is_scheduled(peer->clear_node_queue) && + status != Deleted) + BGP_EVENT_ADD(connection, Clearing_Completed); + } +diff --git a/bgpd/bgp_memory.h b/bgpd/bgp_memory.h +index 29584cd78..321fd6118 100644 +--- a/bgpd/bgp_memory.h ++++ b/bgpd/bgp_memory.h +@@ -131,4 +131,6 @@ DECLARE_MTYPE(BGP_NOTIFICATION); + + DECLARE_MTYPE(BGP_SOFT_VERSION); + ++DECLARE_MTYPE(CLEARING_BATCH); ++ + #endif /* _QUAGGA_BGP_MEMORY_H */ +diff --git a/bgpd/bgp_packet.c b/bgpd/bgp_packet.c +index 26c00cfe1..658f82863 100644 +--- a/bgpd/bgp_packet.c ++++ b/bgpd/bgp_packet.c +@@ -4001,62 +4001,3 @@ void bgp_send_delayed_eor(struct bgp *bgp) + for (ALL_LIST_ELEMENTS(bgp->peer, node, nnode, peer)) + bgp_write_proceed_actions(peer); + } +- +-/* +- * Task callback in the main pthread to handle socket error +- * encountered in the io pthread. We avoid having the io pthread try +- * to enqueue fsm events or mess with the peer struct. +- */ +- +-/* Max number of peers to process without rescheduling */ +-#define BGP_CONN_ERROR_DEQUEUE_MAX 10 +- +-void bgp_packet_process_error(struct event *thread) +-{ +- struct peer_connection *connection; +- struct peer *peer; +- struct bgp *bgp; +- int counter = 0; +- bool more_p = false; +- +- bgp = EVENT_ARG(thread); +- +- /* Dequeue peers from the error list */ +- while ((peer = bgp_dequeue_conn_err_peer(bgp, &more_p)) != NULL) { +- connection = peer->connection; +- +- if (bgp_debug_neighbor_events(peer)) +- zlog_debug("%s [Event] BGP error %d on fd %d", +- peer->host, peer->connection_errcode, +- connection->fd); +- +- /* Closed connection or error on the socket */ +- if (peer_established(connection)) { +- if ((CHECK_FLAG(peer->flags, PEER_FLAG_GRACEFUL_RESTART) +- || CHECK_FLAG(peer->flags, +- PEER_FLAG_GRACEFUL_RESTART_HELPER)) +- && CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_MODE)) { +- peer->last_reset = PEER_DOWN_NSF_CLOSE_SESSION; +- SET_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT); +- } else +- peer->last_reset = PEER_DOWN_CLOSE_SESSION; +- } +- +- /* No need for keepalives, if enabled */ +- bgp_keepalives_off(connection); +- +- bgp_event_update(connection, peer->connection_errcode); +- +- counter++; +- if (counter >= BGP_CONN_ERROR_DEQUEUE_MAX) +- break; +- } +- +- /* Reschedule event if necessary */ +- if (more_p) +- bgp_conn_err_reschedule(bgp); +- +- if (bgp_debug_neighbor_events(NULL)) +- zlog_debug("%s: dequeued and processed %d peers", __func__, +- counter); +-} +diff --git a/bgpd/bgp_packet.h b/bgpd/bgp_packet.h +index b67acf205..7fc0189be 100644 +--- a/bgpd/bgp_packet.h ++++ b/bgpd/bgp_packet.h +@@ -74,8 +74,6 @@ extern void bgp_process_packet(struct event *event); + + extern void bgp_send_delayed_eor(struct bgp *bgp); + +-/* Task callback to handle socket error encountered in the io pthread */ +-void bgp_packet_process_error(struct event *thread); + extern struct bgp_notify + bgp_notify_decapsulate_hard_reset(struct bgp_notify *notify); + extern bool bgp_has_graceful_restart_notification(struct peer *peer); +diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c +index 7c596f02a..c6fa5e5b4 100644 +--- a/bgpd/bgp_route.c ++++ b/bgpd/bgp_route.c +@@ -78,6 +78,9 @@ + + #include 
"bgpd/bgp_route_clippy.c" + ++/* Memory for batched clearing of peers from the RIB */ ++DEFINE_MTYPE(BGPD, CLEARING_BATCH, "Clearing batch"); ++ + DEFINE_HOOK(bgp_snmp_update_stats, + (struct bgp_dest *rn, struct bgp_path_info *pi, bool added), + (rn, pi, added)); +@@ -6107,11 +6110,244 @@ void bgp_clear_route(struct peer *peer, afi_t afi, safi_t safi) + peer_unlock(peer); + } + ++/* ++ * Callback scheduled to process prefixes/dests for batch clearing; the ++ * dests were found via a rib walk. ++ * The one-peer version of this uses a per-peer workqueue to manage ++ * rescheduling, but we're just using a fixed limit here. ++ */ ++ ++/* Limit the number of dests we'll process per callback */ ++#define BGP_CLEARING_BATCH_MAX_DESTS 100 ++ ++static void bgp_clear_batch_dests_task(struct event *event) ++{ ++ struct bgp_clearing_info *cinfo = EVENT_ARG(event); ++ struct bgp_dest *dest; ++ struct bgp_path_info *pi, *next; ++ struct bgp_table *table; ++ struct bgp *bgp; ++ afi_t afi; ++ safi_t safi; ++ int counter = 0; ++ ++ bgp = cinfo->bgp; ++ ++next_dest: ++ ++ dest = bgp_clearing_batch_next_dest(cinfo); ++ if (dest == NULL) ++ goto done; ++ ++ table = bgp_dest_table(dest); ++ afi = table->afi; ++ safi = table->safi; ++ ++ /* Have to check every path: it is possible that we have multiple paths ++ * for a prefix from a peer if that peer is using AddPath. ++ * Note that the clearing action may change the pi list; we try to do ++ * a "safe" iteration. ++ */ ++ for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = next) { ++ next = pi ? pi->next : NULL; ++ ++ if (!bgp_clearing_batch_check_peer(cinfo, pi->peer)) ++ continue; ++ ++ /* graceful restart STALE flag set. */ ++ if (((CHECK_FLAG(pi->peer->sflags, PEER_STATUS_NSF_WAIT) ++ && pi->peer->nsf[afi][safi]) ++ || CHECK_FLAG(pi->peer->af_sflags[afi][safi], ++ PEER_STATUS_ENHANCED_REFRESH)) ++ && !CHECK_FLAG(pi->flags, BGP_PATH_STALE) ++ && !CHECK_FLAG(pi->flags, BGP_PATH_UNUSEABLE)) ++ bgp_path_info_set_flag(dest, pi, BGP_PATH_STALE); ++ else { ++ /* If this is an EVPN route, process for ++ * un-import. */ ++ if (safi == SAFI_EVPN) ++ bgp_evpn_unimport_route( ++ bgp, afi, safi, ++ bgp_dest_get_prefix(dest), pi); ++ /* Handle withdraw for VRF route-leaking and L3VPN */ ++ if (SAFI_UNICAST == safi ++ && (bgp->inst_type == BGP_INSTANCE_TYPE_VRF || ++ bgp->inst_type == BGP_INSTANCE_TYPE_DEFAULT)) { ++ vpn_leak_from_vrf_withdraw(bgp_get_default(), ++ bgp, pi); ++ } ++ if (SAFI_MPLS_VPN == safi && ++ bgp->inst_type == BGP_INSTANCE_TYPE_DEFAULT) { ++ vpn_leak_to_vrf_withdraw(pi); ++ } ++ ++ bgp_rib_remove(dest, pi, pi->peer, afi, safi); ++ } ++ } ++ ++ /* Unref this dest and table */ ++ bgp_dest_unlock_node(dest); ++ bgp_table_unlock(table); ++ ++ counter++; ++ if (counter < BGP_CLEARING_BATCH_MAX_DESTS) ++ goto next_dest; ++ ++done: ++ ++ /* If there are still dests to process, reschedule. 
*/ ++ if (bgp_clearing_batch_dests_present(cinfo)) { ++ if (bgp_debug_neighbor_events(NULL)) ++ zlog_debug("%s: Batch %p: Rescheduled after processing %d dests", ++ __func__, cinfo, counter); ++ ++ event_add_event(bm->master, bgp_clear_batch_dests_task, cinfo, ++ 0, &cinfo->t_sched); ++ } else { ++ if (bgp_debug_neighbor_events(NULL)) ++ zlog_debug("%s: Batch %p: Done after processing %d dests", ++ __func__, cinfo, counter); ++ bgp_clearing_batch_completed(cinfo); ++ } ++ ++ return; ++} ++ ++/* ++ * Walk a single table for batch peer clearing processing ++ */ ++static void clear_batch_table_helper(struct bgp_clearing_info *cinfo, ++ struct bgp_table *table) ++{ ++ struct bgp_dest *dest; ++ bool force = (cinfo->bgp->process_queue == NULL); ++ uint32_t examined = 0, queued = 0; ++ ++ for (dest = bgp_table_top(table); dest; dest = bgp_route_next(dest)) { ++ struct bgp_path_info *pi, *next; ++ struct bgp_adj_in *ain; ++ struct bgp_adj_in *ain_next; ++ ++ examined++; ++ ++ ain = dest->adj_in; ++ while (ain) { ++ ain_next = ain->next; ++ ++ if (bgp_clearing_batch_check_peer(cinfo, ain->peer)) ++ bgp_adj_in_remove(&dest, ain); ++ ++ ain = ain_next; ++ ++ assert(dest != NULL); ++ } ++ ++ for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = next) { ++ next = pi->next; ++ if (!bgp_clearing_batch_check_peer(cinfo, pi->peer)) ++ continue; ++ ++ queued++; ++ ++ if (force) { ++ bgp_path_info_reap(dest, pi); ++ } else { ++ /* Unlocked after processing */ ++ bgp_table_lock(bgp_dest_table(dest)); ++ bgp_dest_lock_node(dest); ++ ++ bgp_clearing_batch_add_dest(cinfo, dest); ++ break; ++ } ++ } ++ } ++ ++ if (examined > 0) { ++ if (bgp_debug_neighbor_events(NULL)) ++ zlog_debug("%s: %s/%s: examined %u, queued %u", ++ __func__, afi2str(table->afi), ++ safi2str(table->safi), examined, queued); ++ } ++} ++ ++/* ++ * RIB-walking helper for batch clearing work: walk all tables, identify ++ * dests that are affected by the peers in the batch, enqueue the dests for ++ * async processing. ++ */ ++static void clear_batch_rib_helper(struct bgp_clearing_info *cinfo) ++{ ++ afi_t afi; ++ safi_t safi; ++ struct bgp_dest *dest; ++ struct bgp_table *table; ++ ++ FOREACH_AFI_SAFI (afi, safi) { ++ /* Identify table to be examined */ ++ if (safi != SAFI_MPLS_VPN && safi != SAFI_ENCAP && ++ safi != SAFI_EVPN) { ++ table = cinfo->bgp->rib[afi][safi]; ++ if (!table) ++ continue; ++ ++ clear_batch_table_helper(cinfo, table); ++ } else { ++ for (dest = bgp_table_top(cinfo->bgp->rib[afi][safi]); ++ dest; dest = bgp_route_next(dest)) { ++ table = bgp_dest_get_bgp_table_info(dest); ++ if (!table) ++ continue; ++ ++ /* TODO -- record the tables we've seen ++ * and don't repeat any? ++ */ ++ ++ clear_batch_table_helper(cinfo, table); ++ } ++ } ++ } ++} ++ ++/* ++ * Identify prefixes that need to be cleared for a batch of peers in 'cinfo'. ++ * The actual clearing processing will be done async... ++ */ ++void bgp_clear_route_batch(struct bgp_clearing_info *cinfo) ++{ ++ if (bgp_debug_neighbor_events(NULL)) ++ zlog_debug("%s: BGP %s, batch %p", __func__, ++ cinfo->bgp->name_pretty, cinfo); ++ ++ /* Walk the rib, checking the peers in the batch */ ++ clear_batch_rib_helper(cinfo); ++ ++ /* If we found some prefixes, schedule a task to begin work. */ ++ if (bgp_clearing_batch_dests_present(cinfo)) ++ event_add_event(bm->master, bgp_clear_batch_dests_task, cinfo, ++ 0, &cinfo->t_sched); ++ ++ /* NB -- it's the caller's job to clean up, release refs, etc. 
if ++ * we didn't find any dests ++ */ ++} ++ + void bgp_clear_route_all(struct peer *peer) + { + afi_t afi; + safi_t safi; + ++ /* We may be able to batch multiple peers' clearing work: check ++ * and see. ++ */ ++ if (bgp_clearing_batch_add_peer(peer->bgp, peer)) { ++ if (bgp_debug_neighbor_events(peer)) ++ zlog_debug("%s: peer %pBP batched", __func__, peer); ++ return; ++ } ++ ++ if (bgp_debug_neighbor_events(peer)) ++ zlog_debug("%s: peer %pBP", __func__, peer); ++ + FOREACH_AFI_SAFI (afi, safi) + bgp_clear_route(peer, afi, safi); + +diff --git a/bgpd/bgp_route.h b/bgpd/bgp_route.h +index 39e282712..9f0563c25 100644 +--- a/bgpd/bgp_route.h ++++ b/bgpd/bgp_route.h +@@ -744,6 +744,9 @@ extern void bgp_soft_reconfig_table_task_cancel(const struct bgp *bgp, + extern bool bgp_soft_reconfig_in(struct peer *peer, afi_t afi, safi_t safi); + extern void bgp_clear_route(struct peer *, afi_t, safi_t); + extern void bgp_clear_route_all(struct peer *); ++/* Clear routes for a batch of peers */ ++void bgp_clear_route_batch(struct bgp_clearing_info *cinfo); ++ + extern void bgp_clear_adj_in(struct peer *, afi_t, safi_t); + extern void bgp_clear_stale_route(struct peer *, afi_t, safi_t); + extern void bgp_set_stale_route(struct peer *peer, afi_t afi, safi_t safi); +diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c +index 7159fe4df..af4a235ec 100644 +--- a/bgpd/bgpd.c ++++ b/bgpd/bgpd.c +@@ -87,7 +87,20 @@ DEFINE_QOBJ_TYPE(peer); + DEFINE_HOOK(bgp_inst_delete, (struct bgp *bgp), (bgp)); + + /* Peers with connection error/failure, per bgp instance */ +-DECLARE_LIST(bgp_peer_conn_errlist, struct peer, conn_err_link); ++DECLARE_DLIST(bgp_peer_conn_errlist, struct peer, conn_err_link); ++ ++/* List of info about peers that are being cleared from BGP RIBs in a batch */ ++DECLARE_DLIST(bgp_clearing_info, struct bgp_clearing_info, link); ++ ++/* List of dests that need to be processed in a clearing batch */ ++DECLARE_LIST(bgp_clearing_destlist, struct bgp_clearing_dest, link); ++ ++/* Hash of peers in clearing info object */ ++static int peer_clearing_hash_cmp(const struct peer *p1, const struct peer *p2); ++static uint32_t peer_clearing_hashfn(const struct peer *p1); ++ ++DECLARE_HASH(bgp_clearing_hash, struct peer, clear_hash_link, ++ peer_clearing_hash_cmp, peer_clearing_hashfn); + + /* BGP process wide configuration. 
*/ + static struct bgp_master bgp_master; +@@ -3583,6 +3596,7 @@ static struct bgp *bgp_create(as_t *as, const char *name, + /* Init peer connection error info */ + pthread_mutex_init(&bgp->peer_errs_mtx, NULL); + bgp_peer_conn_errlist_init(&bgp->peer_conn_errlist); ++ bgp_clearing_info_init(&bgp->clearing_list); + + return bgp; + } +@@ -3934,6 +3948,10 @@ int bgp_delete(struct bgp *bgp) + zlog_debug("Zebra Announce Fifo cleanup count before %u and after %u during BGP %s deletion", + cnt_before, cnt_after, bgp->name_pretty); + ++ /* Cleanup for peer connection batching */ ++ while ((cinfo = bgp_clearing_info_first(&bgp->clearing_list)) != NULL) ++ bgp_clearing_batch_completed(cinfo); ++ + bgp_soft_reconfig_table_task_cancel(bgp, NULL, NULL); + + /* make sure we withdraw any exported routes */ +@@ -4048,6 +4066,10 @@ int bgp_delete(struct bgp *bgp) + /* Cancel peer connection errors event */ + EVENT_OFF(bgp->t_conn_errors); + ++ /* Cleanup for peer connection batching */ ++ while ((cinfo = bgp_clearing_info_pop(&bgp->clearing_list)) != NULL) ++ bgp_clearing_batch_completed(cinfo); ++ + /* TODO - Other memory may need to be freed - e.g., NHT */ + + #ifdef ENABLE_BGP_VNC +@@ -8739,8 +8761,303 @@ void bgp_gr_apply_running_config(void) + } + } + ++/* Hash of peers in clearing info object */ ++static int peer_clearing_hash_cmp(const struct peer *p1, const struct peer *p2) ++{ ++ if (p1 == p2) ++ return 0; ++ else if (p1 < p2) ++ return -1; ++ else ++ return 1; ++} ++ ++static uint32_t peer_clearing_hashfn(const struct peer *p1) ++{ ++ return (uint32_t)((intptr_t)p1 & 0xffffffffULL); ++} ++ ++/* ++ * Free a clearing batch: this really just does the memory cleanup; the ++ * clearing code is expected to manage the peer, dest, table, etc refcounts ++ */ ++static void bgp_clearing_batch_free(struct bgp *bgp, ++ struct bgp_clearing_info **pinfo) ++{ ++ struct bgp_clearing_info *cinfo = *pinfo; ++ struct bgp_clearing_dest *destinfo; ++ ++ if (bgp_clearing_info_anywhere(cinfo)) ++ bgp_clearing_info_del(&bgp->clearing_list, cinfo); ++ ++ while ((destinfo = bgp_clearing_destlist_pop(&cinfo->destlist)) != NULL) ++ XFREE(MTYPE_CLEARING_BATCH, destinfo); ++ ++ bgp_clearing_hash_fini(&cinfo->peers); ++ ++ XFREE(MTYPE_CLEARING_BATCH, *pinfo); ++} ++ ++/* ++ * Done with a peer that was part of a clearing batch ++ */ ++static void bgp_clearing_peer_done(struct peer *peer) ++{ ++ UNSET_FLAG(peer->flags, PEER_FLAG_CLEARING_BATCH); ++ ++ /* Tickle FSM to start moving again */ ++ BGP_EVENT_ADD(peer->connection, Clearing_Completed); ++ ++ peer_unlock(peer); /* bgp_clear_route */ ++} ++ ++/* ++ * Initialize a new batch struct for clearing peer(s) from the RIB ++ */ ++static void bgp_clearing_batch_begin(struct bgp *bgp) ++{ ++ struct bgp_clearing_info *cinfo; ++ ++ cinfo = XCALLOC(MTYPE_CLEARING_BATCH, sizeof(struct bgp_clearing_info)); ++ ++ cinfo->bgp = bgp; ++ ++ /* Init hash of peers and list of dests */ ++ bgp_clearing_hash_init(&cinfo->peers); ++ bgp_clearing_destlist_init(&cinfo->destlist); ++ ++ /* Batch is open for more peers */ ++ SET_FLAG(cinfo->flags, BGP_CLEARING_INFO_FLAG_OPEN); ++ ++ bgp_clearing_info_add_head(&bgp->clearing_list, cinfo); ++} ++ ++/* ++ * Close a batch of clearing peers, and begin working on the RIB ++ */ ++static void bgp_clearing_batch_end(struct bgp *bgp) ++{ ++ struct bgp_clearing_info *cinfo; ++ ++ cinfo = bgp_clearing_info_first(&bgp->clearing_list); ++ ++ assert(cinfo != NULL); ++ assert(CHECK_FLAG(cinfo->flags, BGP_CLEARING_INFO_FLAG_OPEN)); ++ ++ /* Batch is closed */ ++ 
UNSET_FLAG(cinfo->flags, BGP_CLEARING_INFO_FLAG_OPEN); ++ ++ /* If we have no peers to examine, just discard the batch info */ ++ if (bgp_clearing_hash_count(&cinfo->peers) == 0) { ++ bgp_clearing_batch_free(bgp, &cinfo); ++ return; ++ } ++ ++ /* Do a RIB walk for the current batch. If it finds dests/prefixes ++ * to work on, this will schedule a task to process ++ * the dests/prefixes in the batch. ++ */ ++ bgp_clear_route_batch(cinfo); ++ ++ /* If we found no prefixes/dests, just discard the batch, ++ * remembering that we're holding a ref for each peer. ++ */ ++ if (bgp_clearing_destlist_count(&cinfo->destlist) == 0) { ++ bgp_clearing_batch_completed(cinfo); ++ } ++} ++ ++/* Check whether a dest's peer is relevant to a clearing batch */ ++bool bgp_clearing_batch_check_peer(struct bgp_clearing_info *cinfo, ++ const struct peer *peer) ++{ ++ struct peer *p; ++ ++ p = bgp_clearing_hash_find(&cinfo->peers, peer); ++ return (p != NULL); ++} ++ ++/* ++ * Check whether a clearing batch has any dests to process ++ */ ++bool bgp_clearing_batch_dests_present(struct bgp_clearing_info *cinfo) ++{ ++ return (bgp_clearing_destlist_count(&cinfo->destlist) > 0); ++} ++ ++/* ++ * Done with a peer clearing batch; deal with refcounts, free memory ++ */ ++void bgp_clearing_batch_completed(struct bgp_clearing_info *cinfo) ++{ ++ struct peer *peer; ++ struct bgp_dest *dest; ++ struct bgp_clearing_dest *destinfo; ++ struct bgp_table *table; ++ ++ /* Ensure event is not scheduled */ ++ event_cancel_event(bm->master, &cinfo->t_sched); ++ ++ /* Remove all peers and un-ref */ ++ while ((peer = bgp_clearing_hash_pop(&cinfo->peers)) != NULL) ++ bgp_clearing_peer_done(peer); ++ ++ /* Remove any dests/prefixes and unlock */ ++ destinfo = bgp_clearing_destlist_pop(&cinfo->destlist); ++ while (destinfo) { ++ dest = destinfo->dest; ++ XFREE(MTYPE_CLEARING_BATCH, destinfo); ++ ++ table = bgp_dest_table(dest); ++ bgp_dest_unlock_node(dest); ++ bgp_table_unlock(table); ++ ++ destinfo = bgp_clearing_destlist_pop(&cinfo->destlist); ++ } ++ ++ /* Free memory */ ++ bgp_clearing_batch_free(cinfo->bgp, &cinfo); ++} ++ ++/* ++ * Add a prefix/dest to a clearing batch ++ */ ++void bgp_clearing_batch_add_dest(struct bgp_clearing_info *cinfo, ++ struct bgp_dest *dest) ++{ ++ struct bgp_clearing_dest *destinfo; ++ ++ destinfo = XCALLOC(MTYPE_CLEARING_BATCH, ++ sizeof(struct bgp_clearing_dest)); ++ ++ destinfo->dest = dest; ++ bgp_clearing_destlist_add_tail(&cinfo->destlist, destinfo); ++} ++ ++/* ++ * Return the next dest for batch clear processing ++ */ ++struct bgp_dest *bgp_clearing_batch_next_dest(struct bgp_clearing_info *cinfo) ++{ ++ struct bgp_clearing_dest *destinfo; ++ struct bgp_dest *dest = NULL; ++ ++ destinfo = bgp_clearing_destlist_pop(&cinfo->destlist); ++ if (destinfo) { ++ dest = destinfo->dest; ++ XFREE(MTYPE_CLEARING_BATCH, destinfo); ++ } ++ ++ return dest; ++} ++ ++/* If a clearing batch is available for 'peer', add it and return 'true', ++ * else return 'false'. 
++ */ ++bool bgp_clearing_batch_add_peer(struct bgp *bgp, struct peer *peer) ++{ ++ struct bgp_clearing_info *cinfo; ++ ++ cinfo = bgp_clearing_info_first(&bgp->clearing_list); ++ ++ if (cinfo && CHECK_FLAG(cinfo->flags, BGP_CLEARING_INFO_FLAG_OPEN)) { ++ if (!CHECK_FLAG(peer->flags, PEER_FLAG_CLEARING_BATCH)) { ++ /* Add a peer ref */ ++ peer_lock(peer); ++ ++ bgp_clearing_hash_add(&cinfo->peers, peer); ++ SET_FLAG(peer->flags, PEER_FLAG_CLEARING_BATCH); ++ } ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * Task callback in the main pthread to handle socket errors ++ * encountered in the io pthread. We avoid having the io pthread try ++ * to enqueue fsm events or mess with the peer struct. ++ */ ++ ++/* TODO -- should this be configurable? */ ++/* Max number of peers to process without rescheduling */ ++#define BGP_CONN_ERROR_DEQUEUE_MAX 10 ++ ++static void bgp_process_conn_error(struct event *event) ++{ ++ struct bgp *bgp; ++ struct peer *peer; ++ struct peer_connection *connection; ++ int counter = 0; ++ size_t list_count = 0; ++ bool more_p = false; ++ ++ bgp = EVENT_ARG(event); ++ ++ frr_with_mutex (&bgp->peer_errs_mtx) { ++ peer = bgp_peer_conn_errlist_pop(&bgp->peer_conn_errlist); ++ ++ list_count = ++ bgp_peer_conn_errlist_count(&bgp->peer_conn_errlist); ++ } ++ ++ /* If we have multiple peers with errors, try to batch some ++ * clearing work. ++ */ ++ if (list_count > 0) ++ bgp_clearing_batch_begin(bgp); ++ ++ /* Dequeue peers from the error list */ ++ while (peer != NULL) { ++ connection = peer->connection; ++ ++ if (bgp_debug_neighbor_events(peer)) ++ zlog_debug("%s [Event] BGP error %d on fd %d", ++ peer->host, peer->connection_errcode, ++ connection->fd); ++ ++ /* Closed connection or error on the socket */ ++ if (peer_established(connection)) { ++ if ((CHECK_FLAG(peer->flags, PEER_FLAG_GRACEFUL_RESTART) ++ || CHECK_FLAG(peer->flags, ++ PEER_FLAG_GRACEFUL_RESTART_HELPER)) ++ && CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_MODE)) { ++ peer->last_reset = PEER_DOWN_NSF_CLOSE_SESSION; ++ SET_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT); ++ } else ++ peer->last_reset = PEER_DOWN_CLOSE_SESSION; ++ } ++ ++ /* No need for keepalives, if enabled */ ++ bgp_keepalives_off(peer->connection); ++ ++ /* Drive into state-machine changes */ ++ bgp_event_update(connection, peer->connection_errcode); ++ ++ counter++; ++ if (counter >= BGP_CONN_ERROR_DEQUEUE_MAX) ++ break; ++ ++ peer = bgp_dequeue_conn_err_peer(bgp, &more_p); ++ } ++ ++ /* Reschedule event if necessary */ ++ if (more_p) ++ bgp_conn_err_reschedule(bgp); ++ ++ /* Done with a clearing batch */ ++ if (list_count > 0) ++ bgp_clearing_batch_end(bgp); ++ ++ if (bgp_debug_neighbor_events(NULL)) ++ zlog_debug("%s: dequeued and processed %d peers", __func__, ++ counter); ++} ++ + /* +- * Enqueue a peer with a connection error to be handled in the main pthread ++ * Enqueue a peer with a connection error to be handled in the main pthread; ++ * this is called from the io pthread. 
+ */ + int bgp_enqueue_conn_err_peer(struct bgp *bgp, struct peer *peer, int errcode) + { +@@ -8754,7 +9071,7 @@ int bgp_enqueue_conn_err_peer(struct bgp *bgp, struct peer *peer, int errcode) + } + } + /* Ensure an event is scheduled */ +- event_add_event(bm->master, bgp_packet_process_error, bgp, 0, ++ event_add_event(bm->master, bgp_process_conn_error, bgp, 0, + &bgp->t_conn_errors); + return 0; + } +@@ -8788,7 +9105,7 @@ struct peer *bgp_dequeue_conn_err_peer(struct bgp *bgp, bool *more_p) + */ + void bgp_conn_err_reschedule(struct bgp *bgp) + { +- event_add_event(bm->master, bgp_packet_process_error, bgp, 0, ++ event_add_event(bm->master, bgp_process_conn_error, bgp, 0, + &bgp->t_conn_errors); + } + +diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h +index bc15c0662..ff3f1b9f1 100644 +--- a/bgpd/bgpd.h ++++ b/bgpd/bgpd.h +@@ -354,32 +354,54 @@ struct bgp_mplsvpn_nh_label_bind_cache; + PREDECL_RBTREE_UNIQ(bgp_mplsvpn_nh_label_bind_cache); + + /* List of peers that have connection errors in the io pthread */ +-PREDECL_LIST(bgp_peer_conn_errlist); ++PREDECL_DLIST(bgp_peer_conn_errlist); + + /* List of info about peers that are being cleared from BGP RIBs in a batch */ +-PREDECL_LIST(bgp_clearing_info); ++PREDECL_DLIST(bgp_clearing_info); + + /* Hash of peers in clearing info object */ + PREDECL_HASH(bgp_clearing_hash); + ++/* List of dests that need to be processed in a clearing batch */ ++PREDECL_LIST(bgp_clearing_destlist); ++ ++struct bgp_clearing_dest { ++ struct bgp_dest *dest; ++ struct bgp_clearing_destlist_item link; ++}; ++ + /* Info about a batch of peers that need to be cleared from the RIB. + * If many peers need to be cleared, we process them in batches, taking +- * one walk through the RIB for each batch. ++ * one walk through the RIB for each batch. This is only used for "all" ++ * afi/safis, typically when processing peer connection errors. + */ + struct bgp_clearing_info { ++ /* Owning bgp instance */ ++ struct bgp *bgp; ++ + /* Hash of peers */ + struct bgp_clearing_hash_head peers; + ++ /* Flags */ ++ uint32_t flags; ++ ++ /* List of dests - wrapped by a small wrapper struct */ ++ struct bgp_clearing_destlist_head destlist; ++ + /* Event to schedule/reschedule processing */ +- struct thread *t_sched; ++ struct event *t_sched; ++ ++ /* TODO -- id, serial number, for debugging/logging? */ + +- /* RIB dest for rescheduling */ +- struct bgp_dest *last_dest; ++ /* TODO -- info for rescheduling the RIB walk? future? */ + +- /* Linkage for list of batches per-bgp */ ++ /* Linkage for list of batches per bgp */ + struct bgp_clearing_info_item link; + }; + ++/* Batch is open, new peers can be added */ ++#define BGP_CLEARING_INFO_FLAG_OPEN (1 << 0) ++ + /* BGP instance structure. */ + struct bgp { + /* AS number of this BGP instance. */ +@@ -1513,6 +1535,8 @@ struct peer { + #define PEER_FLAG_GRACEFUL_SHUTDOWN (1ULL << 35) + #define PEER_FLAG_CAPABILITY_SOFT_VERSION (1ULL << 36) + #define PEER_FLAG_CAPABILITY_FQDN (1ULL << 37) /* fqdn capability */ ++/* Peer is part of a batch clearing its routes */ ++#define PEER_FLAG_CLEARING_BATCH (1ULL << 41) + + /* + *GR-Disabled mode means unset PEER_FLAG_GRACEFUL_RESTART +@@ -2815,6 +2839,23 @@ extern bool bgp_path_attribute_treat_as_withdraw(struct peer *peer, char *buf, + + extern void srv6_function_free(struct bgp_srv6_function *func); + ++/* If a clearing batch is available for 'peer', add it and return 'true', ++ * else return 'false'. 
++ */ ++bool bgp_clearing_batch_add_peer(struct bgp *bgp, struct peer *peer); ++/* Add a prefix/dest to a clearing batch */ ++void bgp_clearing_batch_add_dest(struct bgp_clearing_info *cinfo, ++ struct bgp_dest *dest); ++/* Check whether a dest's peer is relevant to a clearing batch */ ++bool bgp_clearing_batch_check_peer(struct bgp_clearing_info *cinfo, ++ const struct peer *peer); ++/* Check whether a clearing batch has any dests to process */ ++bool bgp_clearing_batch_dests_present(struct bgp_clearing_info *cinfo); ++/* Returns the next dest for batch clear processing */ ++struct bgp_dest *bgp_clearing_batch_next_dest(struct bgp_clearing_info *cinfo); ++/* Done with a peer clearing batch; deal with refcounts, free memory */ ++void bgp_clearing_batch_completed(struct bgp_clearing_info *cinfo); ++ + #ifdef _FRR_ATTRIBUTE_PRINTFRR + /* clang-format off */ + #pragma FRR printfrr_ext "%pBP" (struct peer *) +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0087-zebra-move-peer-conn-error-list-to-connection-struct.patch b/src/sonic-frr/patch/0087-zebra-move-peer-conn-error-list-to-connection-struct.patch new file mode 100644 index 000000000000..05812d075474 --- /dev/null +++ b/src/sonic-frr/patch/0087-zebra-move-peer-conn-error-list-to-connection-struct.patch @@ -0,0 +1,210 @@ +From e9eee90c3f68dc83a92f519841732c951948875b Mon Sep 17 00:00:00 2001 +From: Mark Stapp +Date: Thu, 7 Nov 2024 10:55:19 -0500 +Subject: [PATCH 4/4] zebra: move peer conn error list to connection struct + +Move the peer connection error list to the peer_connection +struct; that seems to line up better with the way that struct +works. + +Signed-off-by: Mark Stapp + +diff --git a/bgpd/bgp_io.c b/bgpd/bgp_io.c +index 4d5e1d9c5..31aba90d2 100644 +--- a/bgpd/bgp_io.c ++++ b/bgpd/bgp_io.c +@@ -251,7 +251,7 @@ static void bgp_process_reads(struct event *thread) + /* Handle the error in the main pthread, include the + * specific state change from 'bgp_read'. 
+ */ +- bgp_enqueue_conn_err_peer(peer->bgp, connection->peer, code); ++ bgp_enqueue_conn_err(peer->bgp, connection, code); + goto done; + } + +diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c +index e3f103724..7d5e2b5c6 100644 +--- a/bgpd/bgpd.c ++++ b/bgpd/bgpd.c +@@ -87,7 +87,7 @@ DEFINE_QOBJ_TYPE(peer); + DEFINE_HOOK(bgp_inst_delete, (struct bgp *bgp), (bgp)); + + /* Peers with connection error/failure, per bgp instance */ +-DECLARE_DLIST(bgp_peer_conn_errlist, struct peer, conn_err_link); ++DECLARE_DLIST(bgp_peer_conn_errlist, struct peer_connection, conn_err_link); + + /* List of info about peers that are being cleared from BGP RIBs in a batch */ + DECLARE_DLIST(bgp_clearing_info, struct bgp_clearing_info, link); +@@ -2722,9 +2722,9 @@ int peer_delete(struct peer *peer) + + /* Ensure the peer is removed from the connection error list */ + frr_with_mutex (&bgp->peer_errs_mtx) { +- if (bgp_peer_conn_errlist_anywhere(peer)) ++ if (bgp_peer_conn_errlist_anywhere(peer->connection)) + bgp_peer_conn_errlist_del(&bgp->peer_conn_errlist, +- peer); ++ peer->connection); + } + + if (CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT)) +@@ -3924,6 +3924,8 @@ int bgp_delete(struct bgp *bgp) + struct bgp_table *dest_table = NULL; + struct graceful_restart_info *gr_info; + uint32_t cnt_before, cnt_after; ++ struct bgp_clearing_info *cinfo; ++ struct peer_connection *connection; + + assert(bgp); + +@@ -4044,9 +4046,9 @@ int bgp_delete(struct bgp *bgp) + */ + frr_with_mutex (&bgp->peer_errs_mtx) { + do { +- peer = bgp_peer_conn_errlist_pop( ++ connection = bgp_peer_conn_errlist_pop( + &bgp->peer_conn_errlist); +- } while (peer != NULL); ++ } while (connection != NULL); + } + + /* Free peers and peer-groups. */ +@@ -8998,7 +9000,7 @@ static void bgp_process_conn_error(struct event *event) + bgp = EVENT_ARG(event); + + frr_with_mutex (&bgp->peer_errs_mtx) { +- peer = bgp_peer_conn_errlist_pop(&bgp->peer_conn_errlist); ++ connection = bgp_peer_conn_errlist_pop(&bgp->peer_conn_errlist); + + list_count = + bgp_peer_conn_errlist_count(&bgp->peer_conn_errlist); +@@ -9011,12 +9013,12 @@ static void bgp_process_conn_error(struct event *event) + bgp_clearing_batch_begin(bgp); + + /* Dequeue peers from the error list */ +- while (peer != NULL) { +- connection = peer->connection; ++ while (connection != NULL) { ++ peer = connection->peer; + + if (bgp_debug_neighbor_events(peer)) + zlog_debug("%s [Event] BGP error %d on fd %d", +- peer->host, peer->connection_errcode, ++ peer->host, connection->connection_errcode, + connection->fd); + + /* Closed connection or error on the socket */ +@@ -9035,13 +9037,13 @@ static void bgp_process_conn_error(struct event *event) + bgp_keepalives_off(peer->connection); + + /* Drive into state-machine changes */ +- bgp_event_update(connection, peer->connection_errcode); ++ bgp_event_update(connection, connection->connection_errcode); + + counter++; + if (counter >= BGP_CONN_ERROR_DEQUEUE_MAX) + break; + +- peer = bgp_dequeue_conn_err_peer(bgp, &more_p); ++ connection = bgp_dequeue_conn_err(bgp, &more_p); + } + + /* Reschedule event if necessary */ +@@ -9058,18 +9060,19 @@ static void bgp_process_conn_error(struct event *event) + } + + /* +- * Enqueue a peer with a connection error to be handled in the main pthread; ++ * Enqueue a connection with an error to be handled in the main pthread; + * this is called from the io pthread. 
+ */ +-int bgp_enqueue_conn_err_peer(struct bgp *bgp, struct peer *peer, int errcode) ++int bgp_enqueue_conn_err(struct bgp *bgp, struct peer_connection *connection, ++ int errcode) + { + frr_with_mutex (&bgp->peer_errs_mtx) { +- peer->connection_errcode = errcode; ++ connection->connection_errcode = errcode; + + /* Careful not to double-enqueue */ +- if (!bgp_peer_conn_errlist_anywhere(peer)) { ++ if (!bgp_peer_conn_errlist_anywhere(connection)) { + bgp_peer_conn_errlist_add_tail(&bgp->peer_conn_errlist, +- peer); ++ connection); + } + } + /* Ensure an event is scheduled */ +@@ -9079,16 +9082,16 @@ int bgp_enqueue_conn_err_peer(struct bgp *bgp, struct peer *peer, int errcode) + } + + /* +- * Dequeue a peer that encountered a connection error; signal whether there ++ * Dequeue a connection that encountered a connection error; signal whether there + * are more queued peers. + */ +-struct peer *bgp_dequeue_conn_err_peer(struct bgp *bgp, bool *more_p) ++struct peer_connection *bgp_dequeue_conn_err(struct bgp *bgp, bool *more_p) + { +- struct peer *peer = NULL; ++ struct peer_connection *connection = NULL; + bool more = false; + + frr_with_mutex (&bgp->peer_errs_mtx) { +- peer = bgp_peer_conn_errlist_pop(&bgp->peer_conn_errlist); ++ connection = bgp_peer_conn_errlist_pop(&bgp->peer_conn_errlist); + + if (bgp_peer_conn_errlist_const_first( + &bgp->peer_conn_errlist) != NULL) +@@ -9098,7 +9101,7 @@ struct peer *bgp_dequeue_conn_err_peer(struct bgp *bgp, bool *more_p) + if (more_p) + *more_p = more; + +- return peer; ++ return connection; + } + + /* +diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h +index ff3f1b9f1..b53ece40d 100644 +--- a/bgpd/bgpd.h ++++ b/bgpd/bgpd.h +@@ -1254,6 +1254,11 @@ struct peer_connection { + struct event *t_process_packet; + + struct event *t_stop_with_notify; ++ /* Linkage for list connections with errors, from IO pthread */ ++ struct bgp_peer_conn_errlist_item conn_err_link; ++ ++ /* Connection error code */ ++ uint16_t connection_errcode; + + union sockunion su; + #define BGP_CONNECTION_SU_UNSPEC(connection) \ +@@ -1921,12 +1926,6 @@ struct peer { + /* Add-Path Best selected paths number to advertise */ + uint8_t addpath_best_selected[AFI_MAX][SAFI_MAX]; + +- /* Linkage for list of peers with connection errors from IO pthread */ +- struct bgp_peer_conn_errlist_item conn_err_link; +- +- /* Connection error code */ +- uint16_t connection_errcode; +- + /* Linkage for hash of clearing peers being cleared in a batch */ + struct bgp_clearing_hash_item clear_hash_link; + +@@ -2565,8 +2564,9 @@ int bgp_global_gr_init(struct bgp *bgp); + int bgp_peer_gr_init(struct peer *peer); + + /* APIs for the per-bgp peer connection error list */ +-int bgp_enqueue_conn_err_peer(struct bgp *bgp, struct peer *peer, int errcode); +-struct peer *bgp_dequeue_conn_err_peer(struct bgp *bgp, bool *more_p); ++int bgp_enqueue_conn_err(struct bgp *bgp, struct peer_connection *connection, ++ int errcode); ++struct peer_connection *bgp_dequeue_conn_err(struct bgp *bgp, bool *more_p); + void bgp_conn_err_reschedule(struct bgp *bgp); + + #define BGP_GR_ROUTER_DETECT_AND_SEND_CAPABILITY_TO_ZEBRA(_bgp, _peer_list) \ +-- +2.43.2 + diff --git a/src/sonic-frr/patch/series b/src/sonic-frr/patch/series index 6f7c40d653af..7b16c04e89fe 100644 --- a/src/sonic-frr/patch/series +++ b/src/sonic-frr/patch/series @@ -64,3 +64,7 @@ 0081-bgpd-Optimize-evaluate-paths-for-a-peer-going-down.patch 0082-Revert-bgpd-upon-if-event-evaluate-bnc-with-matching.patch 
0083-staticd-add-cli-to-support-steering-of-ipv4-traffic-over-srv6-sid-list.patch +0084-bgpd-Replace-per-peer-connection-error-with-per-bgp.patch +0085-bgpd-remove-apis-from-bgp_route.h.patch +0086-bgpd-batch-peer-connection-error-clearing.patch +0087-zebra-move-peer-conn-error-list-to-connection-struct.patch
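
Note on the mechanism (editor's illustrative sketch, not part of the patches above):
patches 0084 and 0086 replace the old one-event-per-peer socket-error handling with
a per-bgp-instance pattern: the io pthread records a failed connection on a
mutex-guarded list and schedules a single event in the main pthread, which drains
the list in bounded batches (BGP_CONN_ERROR_DEQUEUE_MAX peers per pass) and
reschedules itself while work remains. The sketch below shows that shape as a
self-contained C program, run single-threaded for brevity; every identifier in it
(err_queue, enqueue_err, drain_batch) is hypothetical, and the real code uses FRR's
typesafe DLIST macros, frr_with_mutex, and the event loop rather than this
hand-rolled queue.

/*
 * Minimal sketch of the producer/consumer discipline introduced by
 * 0084/0086.  Hypothetical names throughout; not FRR APIs.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define DEQUEUE_MAX 10 /* mirrors BGP_CONN_ERROR_DEQUEUE_MAX */

struct err_peer {
	int fd;
	int errcode;
	struct err_peer *next;
};

struct err_queue {
	pthread_mutex_t mtx;
	struct err_peer *head, *tail;
};

/* io-pthread side: record the error under the mutex, cf.
 * bgp_enqueue_conn_err_peer(); the real code also schedules the
 * main-pthread event here (event_add_event(..., &bgp->t_conn_errors)).
 */
static void enqueue_err(struct err_queue *q, int fd, int errcode)
{
	struct err_peer *p = calloc(1, sizeof(*p));

	p->fd = fd;
	p->errcode = errcode;
	pthread_mutex_lock(&q->mtx);
	if (q->tail)
		q->tail->next = p;
	else
		q->head = p;
	q->tail = p;
	pthread_mutex_unlock(&q->mtx);
}

/* main-pthread side: drain a bounded batch and report whether peers
 * remain, cf. bgp_dequeue_conn_err_peer() + bgp_packet_process_error().
 * A nonzero return means the caller should "reschedule" this task.
 */
static int drain_batch(struct err_queue *q)
{
	int counter = 0, more = 0;

	for (;;) {
		struct err_peer *p;

		pthread_mutex_lock(&q->mtx);
		p = q->head;
		if (p) {
			q->head = p->next;
			if (!q->head)
				q->tail = NULL;
		}
		more = (q->head != NULL);
		pthread_mutex_unlock(&q->mtx);

		if (!p)
			break;
		/* stand-in for the FSM work: bgp_event_update() etc. */
		printf("error %d on fd %d\n", p->errcode, p->fd);
		free(p);
		if (++counter >= DEQUEUE_MAX)
			break; /* real code: bgp_conn_err_reschedule() */
	}
	return more;
}

int main(void)
{
	struct err_queue q = { .mtx = PTHREAD_MUTEX_INITIALIZER };

	for (int i = 0; i < 25; i++)
		enqueue_err(&q, 100 + i, 104);
	while (drain_batch(&q))
		; /* keep "rescheduling" until the list is empty */
	return 0;
}

The bounded drain is the key design choice: when many sessions fail at once (for
example after a fabric-wide link flap, the scenario in the outer commit subject),
one unbounded callback would monopolize the main pthread, whereas rescheduling
after each small batch lets keepalives, timers, and the surviving sessions be
serviced in between — which is what keeps the flapped BGP sessions stable while
they are cleared and re-established.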