Skip to content

Commit 1100ed4

Browse files
leftwoAlan Hanson
and
Alan Hanson
authored
loop with DNS lookup when trying for SwitchLocation (#7779)
Currently, callers of `map_switch_zone_addrs()` first get the IP for `ServiceName::Dendrite` from DNS, then loop (forever) trying to translate that IP into a `SwitchLocation`. Under normal conditions, this is fine. However, if a sled has been expunged, or a new sled is being added, it's possible that what is returned in: ``` let switch_zone_addresses = match resolver .lookup_all_ipv6(ServiceName::Dendrite) .await ``` Will change. If that changes happens after we start looping in `map_switch_zone_addrs()`, then the loop will go on forever looking for something that is no longer correct. To fix this we put the `lookup_all_ipv6` into the loop by using the function `switch_zone_address_mappings()` instead. `switch_zone_address_mappings()`'s loop includes the call to lookup addresses in DNS will call `map_switch_zone_addrs()`. This allows us to include the DNS lookup inside the loop. Most places where we called `map_switch_zone_addrs()` were also using the same `lookup_all_ipv6()` call, so transitioning them to call `switch_zone_address_mappings()` will just drop right in. A fix for #7739 --------- Co-authored-by: Alan Hanson <alan@oxide.computer>
1 parent 705208a commit 1100ed4

File tree

5 files changed

+72
-82
lines changed

5 files changed

+72
-82
lines changed

nexus/src/app/background/tasks/bfd.rs

+5-12
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@
66
//! (BFD) sessions.
77
88
use crate::app::{
9-
background::tasks::networking::build_mgd_clients, map_switch_zone_addrs,
9+
background::tasks::networking::build_mgd_clients,
10+
switch_zone_address_mappings,
1011
};
1112

1213
use crate::app::background::BackgroundTask;
1314
use futures::FutureExt;
1415
use futures::future::BoxFuture;
1516
use internal_dns_resolver::Resolver;
16-
use internal_dns_types::names::ServiceName;
1717
use mg_admin_client::types::{BfdPeerConfig, SessionMode};
1818
use nexus_db_model::{BfdMode, BfdSession};
1919
use nexus_db_queries::{context::OpContext, db::DataStore};
@@ -117,12 +117,8 @@ impl BackgroundTask for BfdManager {
117117

118118
let mut current: HashSet<BfdSessionKey> = HashSet::new();
119119

120-
let switch_zone_addresses = match self
121-
.resolver
122-
.lookup_all_ipv6(ServiceName::Dendrite)
123-
.await
124-
{
125-
Ok(addrs) => addrs,
120+
let mappings = match switch_zone_address_mappings(&self.resolver, log).await {
121+
Ok(mappings) => mappings,
126122
Err(e) => {
127123
error!(log, "failed to resolve addresses for Dendrite services"; "error" => %e);
128124
return json!({
@@ -131,13 +127,10 @@ impl BackgroundTask for BfdManager {
131127
"failed to resolve addresses for Dendrite services: {:#}",
132128
e
133129
)
134-
});
130+
});
135131
},
136132
};
137133

138-
let mappings =
139-
map_switch_zone_addrs(log, switch_zone_addresses).await;
140-
141134
let mgd_clients = build_mgd_clients(mappings, log);
142135

143136
for (location, c) in &mgd_clients {

nexus/src/app/background/tasks/nat_cleanup.rs

+10-13
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,14 @@
66
//! Responsible for cleaning up soft deleted entries once they
77
//! have been propagated to running dpd instances.
88
9-
use crate::app::map_switch_zone_addrs;
9+
use crate::app::switch_zone_address_mappings;
1010

1111
use super::networking::build_dpd_clients;
1212
use crate::app::background::BackgroundTask;
1313
use chrono::{Duration, Utc};
1414
use futures::FutureExt;
1515
use futures::future::BoxFuture;
1616
use internal_dns_resolver::Resolver;
17-
use internal_dns_types::names::ServiceName;
1817
use nexus_db_queries::context::OpContext;
1918
use nexus_db_queries::db::DataStore;
2019
use serde_json::json;
@@ -65,27 +64,25 @@ impl BackgroundTask for Ipv4NatGarbageCollector {
6564
}
6665
};
6766

68-
let switch_zone_addresses = match self
69-
.resolver
70-
.lookup_all_ipv6(ServiceName::Dendrite)
71-
.await
67+
let mappings = match
68+
switch_zone_address_mappings(&self.resolver, log).await
7269
{
73-
Ok(addrs) => addrs,
70+
Ok(mappings) => mappings,
7471
Err(e) => {
75-
error!(log, "failed to resolve addresses for Dendrite services"; "error" => %e);
72+
error!(
73+
log,
74+
"failed to resolve addresses for Dendrite services"; "error" => %e
75+
);
7676
return json!({
7777
"error":
7878
format!(
7979
"failed to resolve addresses for Dendrite services: {:#}",
8080
e
8181
)
82-
});
83-
},
82+
});
83+
}
8484
};
8585

86-
let mappings =
87-
map_switch_zone_addrs(log, switch_zone_addresses).await;
88-
8986
let dpd_clients = build_dpd_clients(&mappings, log);
9087

9188
for (_location, client) in dpd_clients {

nexus/src/app/background/tasks/sync_service_zone_nat.rs

+5-11
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,14 @@
55
//! Background task for detecting changes to service zone locations and
66
//! updating the NAT rpw table accordingly
77
8-
use crate::app::map_switch_zone_addrs;
8+
use crate::app::switch_zone_address_mappings;
99

1010
use super::networking::build_dpd_clients;
1111
use crate::app::background::BackgroundTask;
1212
use anyhow::Context;
1313
use futures::FutureExt;
1414
use futures::future::BoxFuture;
1515
use internal_dns_resolver::Resolver;
16-
use internal_dns_types::names::ServiceName;
1716
use nexus_db_model::Ipv4NatValues;
1817
use nexus_db_queries::context::OpContext;
1918
use nexus_db_queries::db::DataStore;
@@ -318,12 +317,10 @@ impl BackgroundTask for ServiceZoneNatTracker {
318317
// notify dpd if we've added any new records
319318
if result > 0 {
320319

321-
let switch_zone_addresses = match self
322-
.resolver
323-
.lookup_all_ipv6(ServiceName::Dendrite)
324-
.await
320+
let mappings = match
321+
switch_zone_address_mappings(&self.resolver, log).await
325322
{
326-
Ok(addrs) => addrs,
323+
Ok(mappings) => mappings,
327324
Err(e) => {
328325
error!(log, "failed to resolve addresses for Dendrite services"; "error" => %e);
329326
return json!({
@@ -332,13 +329,10 @@ impl BackgroundTask for ServiceZoneNatTracker {
332329
"failed to resolve addresses for Dendrite services: {:#}",
333330
e
334331
)
335-
});
332+
});
336333
},
337334
};
338335

339-
let mappings =
340-
map_switch_zone_addrs(log, switch_zone_addresses).await;
341-
342336
let dpd_clients = build_dpd_clients(&mappings, log);
343337

344338
for (_location, client) in dpd_clients {

nexus/src/app/background/tasks/sync_switch_configuration.rs

+8-13
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,12 @@ use crate::app::{
99
background::tasks::networking::{
1010
api_to_dpd_port_settings, build_dpd_clients, build_mgd_clients,
1111
},
12-
map_switch_zone_addrs,
12+
switch_zone_address_mappings,
1313
};
1414
use oxnet::Ipv4Net;
1515
use slog::o;
1616

1717
use internal_dns_resolver::Resolver;
18-
use internal_dns_types::names::ServiceName;
1918
use ipnetwork::IpNetwork;
2019
use nexus_db_model::{
2120
AddressLotBlock, BgpConfig, BootstoreConfig, INFRA_LOT, LoopbackAddress,
@@ -322,23 +321,19 @@ impl BackgroundTask for SwitchPortSettingsManager {
322321

323322
// lookup switch zones via DNS
324323
// TODO https://github.com/oxidecomputer/omicron/issues/5201
325-
let switch_zone_addresses = match self
326-
.resolver
327-
.lookup_all_ipv6(ServiceName::Dendrite)
328-
.await
324+
let mappings = match
325+
switch_zone_address_mappings(&self.resolver, &log).await
329326
{
330-
Ok(addrs) => addrs,
327+
Ok(mappings) => mappings,
331328
Err(e) => {
332-
error!(log, "failed to resolve addresses for Dendrite services";
333-
"error" => %DisplayErrorChain::new(&e));
329+
error!(
330+
log,
331+
"failed to resolve addresses for Dendrite services";
332+
"error" => %e);
334333
continue;
335334
},
336335
};
337336

338-
// TODO https://github.com/oxidecomputer/omicron/issues/5201
339-
let mappings =
340-
map_switch_zone_addrs(&log, switch_zone_addresses).await;
341-
342337
// TODO https://github.com/oxidecomputer/omicron/issues/5201
343338
// build sled agent clients
344339
let sled_agent_clients = build_sled_agent_clients(&mappings, &log);

nexus/src/app/mod.rs

+44-33
Original file line numberDiff line numberDiff line change
@@ -1030,21 +1030,35 @@ pub(crate) async fn lldpd_clients(
10301030
Ok(clients)
10311031
}
10321032

1033+
// Look up Dendrite addresses in DNS then determine the switch location for
1034+
// each address DNS returns. If we fail to lookup ServiceName::Dendrite, we
1035+
// return error, otherwise we keep looping until we can determine the switch
1036+
// location of all addresses returned to us from the lookup.
10331037
async fn switch_zone_address_mappings(
10341038
resolver: &internal_dns_resolver::Resolver,
10351039
log: &slog::Logger,
10361040
) -> Result<HashMap<SwitchLocation, Ipv6Addr>, String> {
1037-
let switch_zone_addresses = match resolver
1038-
.lookup_all_ipv6(ServiceName::Dendrite)
1039-
.await
1040-
{
1041-
Ok(addrs) => addrs,
1042-
Err(e) => {
1043-
error!(log, "failed to resolve addresses for Dendrite services"; "error" => %e);
1044-
return Err(e.to_string());
1041+
loop {
1042+
let switch_zone_addresses = match resolver
1043+
.lookup_all_ipv6(ServiceName::Dendrite)
1044+
.await
1045+
{
1046+
Ok(addrs) => addrs,
1047+
Err(e) => {
1048+
error!(log, "failed to resolve addresses for Dendrite services"; "error" => %e);
1049+
return Err(e.to_string());
1050+
}
1051+
};
1052+
match map_switch_zone_addrs(&log, switch_zone_addresses).await {
1053+
Ok(mappings) => {
1054+
return Ok(mappings);
1055+
}
1056+
Err(e) => {
1057+
warn!(log, "Failed to map switch zone addr: {e}, retrying");
1058+
tokio::time::sleep(std::time::Duration::from_secs(2)).await;
1059+
}
10451060
}
1046-
};
1047-
Ok(map_switch_zone_addrs(&log, switch_zone_addresses).await)
1061+
}
10481062
}
10491063

10501064
// TODO: #3596 Allow updating of Nexus from `handoff_to_nexus()`
@@ -1058,7 +1072,7 @@ async fn switch_zone_address_mappings(
10581072
async fn map_switch_zone_addrs(
10591073
log: &Logger,
10601074
switch_zone_addresses: Vec<Ipv6Addr>,
1061-
) -> HashMap<SwitchLocation, Ipv6Addr> {
1075+
) -> Result<HashMap<SwitchLocation, Ipv6Addr>, String> {
10621076
use gateway_client::Client as MgsClient;
10631077
info!(log, "Determining switch slots managed by switch zones");
10641078
let mut switch_zone_addrs = HashMap::new();
@@ -1069,28 +1083,25 @@ async fn map_switch_zone_addrs(
10691083
);
10701084

10711085
info!(log, "determining switch slot managed by dendrite zone"; "zone_address" => #?addr);
1072-
// TODO: #3599 Use retry function instead of looping on a fixed timer
1073-
let switch_slot = loop {
1074-
match mgs_client.sp_local_switch_id().await {
1075-
Ok(switch) => {
1076-
info!(
1077-
log,
1078-
"identified switch slot for dendrite zone";
1079-
"slot" => #?switch,
1080-
"zone_address" => #?addr
1081-
);
1082-
break switch.slot;
1083-
}
1084-
Err(e) => {
1085-
warn!(
1086-
log,
1087-
"failed to identify switch slot for dendrite, will retry in 2 seconds";
1088-
"zone_address" => #?addr,
1089-
"reason" => #?e
1090-
);
1091-
}
1086+
let switch_slot = match mgs_client.sp_local_switch_id().await {
1087+
Ok(switch) => {
1088+
info!(
1089+
log,
1090+
"identified switch slot for dendrite zone";
1091+
"slot" => #?switch,
1092+
"zone_address" => #?addr
1093+
);
1094+
switch.slot
1095+
}
1096+
Err(e) => {
1097+
warn!(
1098+
log,
1099+
"failed to identify switch slot for dendrite";
1100+
"zone_address" => #?addr,
1101+
"reason" => #?e
1102+
);
1103+
return Err(e.to_string());
10921104
}
1093-
tokio::time::sleep(std::time::Duration::from_secs(2)).await;
10941105
};
10951106

10961107
match switch_slot {
@@ -1113,5 +1124,5 @@ async fn map_switch_zone_addrs(
11131124
"completed mapping dendrite zones to switch slots";
11141125
"mappings" => #?switch_zone_addrs
11151126
);
1116-
switch_zone_addrs
1127+
Ok(switch_zone_addrs)
11171128
}

0 commit comments

Comments
 (0)