
Commit c31dc2f

improved TUF artifact replication robustness (#7519)
Closes #7399. Nexus now owns and maintains a generation number for the set of artifacts the system wants fully replicated, which Sled Agent uses to prevent conflicts. The generation number is stored in a new singleton table modeled on the existing db_metadata singleton. I wrote up `docs/tuf-artifact-replication.adoc` to provide a top-level overview of the system and of the conflicts this refactor seeks to prevent. The Sled Agent artifact store APIs are modified: two new APIs get and put an "artifact configuration", the list of wanted artifacts together with its generation number. The list request also returns the current generation number, and the PUT and "copy from depot" requests require an up-to-date generation number in the query string. The delete API is removed in favor of Sled Agent managing deletions on its own whenever the configuration is updated.
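As a rough sketch of the configuration-and-generation scheme described above (illustrative only; the type and function names below are assumptions, not the actual omicron definitions):

    use std::collections::BTreeSet;

    /// Sketch of an artifact configuration: the set of artifact hashes a
    /// sled should hold, versioned by a Nexus-owned generation number.
    #[derive(Clone, PartialEq, Eq)]
    pub struct ArtifactConfig {
        pub generation: u64,
        pub artifacts: BTreeSet<String>, // hex-encoded SHA-256 hashes
    }

    /// The rule Sled Agent enforces on a config put: re-putting an identical
    /// configuration is accepted (idempotent); any other change must carry a
    /// strictly larger generation number.
    pub fn accept_config(
        current: &ArtifactConfig,
        new: &ArtifactConfig,
    ) -> Result<(), String> {
        if new == current {
            return Ok(());
        }
        if new.generation <= current.generation {
            return Err(format!(
                "stale config: generation {} <= current {}",
                new.generation, current.generation
            ));
        }
        Ok(())
    }

See `docs/tuf-artifact-replication.adoc` in this diff for the full description of these rules.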
1 parent 592e8b0 commit c31dc2f

28 files changed: +1540 -634 lines changed

common/src/api/external/mod.rs (+22 -1)

@@ -715,7 +715,6 @@ impl From<ByteCount> for i64 {
     Ord,
     PartialEq,
     PartialOrd,
-    Serialize,
     Diffable,
 )]
 #[daft(leaf)]
@@ -766,6 +765,17 @@ impl<'de> Deserialize<'de> for Generation {
     }
 }

+// This is the equivalent of applying `#[serde(transparent)]`, but that has a
+// side effect of changing the JsonSchema derive to no longer emit a schema.
+impl Serialize for Generation {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        self.0.serialize(serializer)
+    }
+}
+
 impl Display for Generation {
     fn fmt(&self, f: &mut Formatter<'_>) -> FormatResult {
         f.write_str(&self.0.to_string())
@@ -821,6 +831,17 @@ impl FromStr for Generation {
     }
 }

+impl slog::Value for Generation {
+    fn serialize(
+        &self,
+        _rec: &slog::Record,
+        key: slog::Key,
+        serializer: &mut dyn slog::Serializer,
+    ) -> slog::Result {
+        serializer.emit_u64(key, self.0)
+    }
+}
+
 #[derive(Debug, thiserror::Error)]
 #[error("negative generation number")]
 pub struct GenerationNegativeError(());

dev-tools/omdb/src/bin/omdb/nexus.rs (+4 -4)

@@ -2196,26 +2196,26 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) {
 fn print_task_tuf_artifact_replication(details: &serde_json::Value) {
     fn print_counters(counters: TufArtifactReplicationCounters) {
         const ROWS: &[&str] = &[
+            "put config ok:",
+            "put config err:",
             "list ok:",
             "list err:",
             "put ok:",
             "put err:",
             "copy ok:",
             "copy err:",
-            "delete ok:",
-            "delete err:",
         ];
         const WIDTH: usize = const_max_len(ROWS);

         for (label, value) in ROWS.iter().zip([
+            counters.put_config_ok,
+            counters.put_config_err,
             counters.list_ok,
             counters.list_err,
             counters.put_ok,
             counters.put_err,
             counters.copy_ok,
             counters.copy_err,
-            counters.delete_ok,
-            counters.delete_err,
         ]) {
             println!(" {label:<WIDTH$} {value:>3}");
         }

dev-tools/omdb/tests/successes.out (+32 -32)

@@ -742,23 +742,23 @@ task: "tuf_artifact_replication"
 request ringbuf:
 <REDACTED_SECTION>
 last run:
-list ok: <LIST_OK_REDACTED>
-list err: 0
-put ok: 0
-put err: 0
-copy ok: 0
-copy err: 0
-delete ok: 0
-delete err: 0
+put config ok: <PUT_CONFIG_OK_REDACTED>
+put config err: 0
+list ok: <LIST_OK_REDACTED>
+list err: 0
+put ok: 0
+put err: 0
+copy ok: 0
+copy err: 0
 lifetime:
-list ok: <LIST_OK_REDACTED>
-list err: 0
-put ok: 0
-put err: 0
-copy ok: 0
-copy err: 0
-delete ok: 0
-delete err: 0
+put config ok: <PUT_CONFIG_OK_REDACTED>
+put config err: 0
+list ok: <LIST_OK_REDACTED>
+list err: 0
+put ok: 0
+put err: 0
+copy ok: 0
+copy err: 0
 local repos: 0

 task: "v2p_manager"
@@ -1241,23 +1241,23 @@ task: "tuf_artifact_replication"
 request ringbuf:
 <REDACTED_SECTION>
 last run:
-list ok: <LIST_OK_REDACTED>
-list err: 0
-put ok: 0
-put err: 0
-copy ok: 0
-copy err: 0
-delete ok: 0
-delete err: 0
+put config ok: <PUT_CONFIG_OK_REDACTED>
+put config err: 0
+list ok: <LIST_OK_REDACTED>
+list err: 0
+put ok: 0
+put err: 0
+copy ok: 0
+copy err: 0
 lifetime:
-list ok: <LIST_OK_REDACTED>
-list err: 0
-put ok: 0
-put err: 0
-copy ok: 0
-copy err: 0
-delete ok: 0
-delete err: 0
+put config ok: <PUT_CONFIG_OK_REDACTED>
+put config err: 0
+list ok: <LIST_OK_REDACTED>
+list err: 0
+put ok: 0
+put err: 0
+copy ok: 0
+copy err: 0
 local repos: 0

 task: "v2p_manager"

dev-tools/omdb/tests/test_all_output.rs (+1)

@@ -227,6 +227,7 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) {
     // execution. These redactions work around the issue described in
     // https://github.com/oxidecomputer/omicron/issues/7417.
     redactor
+        .field("put config ok:", r"\d+")
         .field("list ok:", r"\d+")
         .section(&["task: \"tuf_artifact_replication\"", "request ringbuf:"]);

docs/tuf-artifact-replication.adoc (+186)

@@ -0,0 +1,186 @@
+:showtitle:
+:numbered:
+
+= TUF Artifact Replication (a.k.a. TUF Repo Depot)
+
+The final output of our release process is a TUF repo consisting of all
+of the artifacts the product requires to run. For the update system to
+work, it needs access to those artifacts. There are some constraining
+factors:
+
+* Nexus is the only way into the system for these artifacts (either
+through direct upload from an operator, or a download initiated by
+Nexus from a service outside of the system).
+* Nexus has no persistent local storage, nor can it directly use the
+artifacts (OS and zone images, firmware, etc.) even if it did store
+them.
+* Sled Agent is generally what will directly use the artifacts (except
+for SP and ROT images, which MGS needs), and it can also manage its
+own local storage.
+
+Thus Nexus needs to accept artifacts from outside of the system and
+immediately offload them to individual Sled Agents for persistent
+storage and later use.
+
+We have chosen (see <<rfd424>>) the simplest possible implementation:
+every Sled Agent stores a copy of every artifact on each of its M.2
+devices. This is storage-inefficient but means that a Sled Agent can
+directly use those resources to create zones from updated images,
+install an updated OS, or manage the installation of updates on other
+components, without Nexus having to ensure that it distributed an
+artifact to a sled _before_ telling it to use it. A Nexus background
+task periodically ensures that all sleds have all artifacts.
+
+== Sled Agent implementation
+
+Sled Agent stores artifacts as a content-addressed store on an *update*
+dataset on each M.2 device: the file name of each stored artifact is its
+SHA-256 hash.
+
+It also stores an _artifact configuration_ in memory: a list of all
+artifact hashes that the sled should store, and a generation number.
+The generation number is owned by Nexus, which increments the generation
+number when the set of TUF repos on the system changes. Sled Agent
+prevents modifying the configuration without an increase in the
+generation number.
+
+Sled Agent offers the following APIs on the underlay network, intended
+for Nexus:
+
+* `artifact_config_get`: Get the current artifact configuration.
+* `artifact_config_put`: Put the artifact configuration that should be
+in effect. This API is idempotent (putting the same configuration does
+not change anything). Modified configurations must also increase the
+generation number.
+* `artifact_list`: List the artifacts present in the artifact
+configuration along with the count of available copies of each
+artifact across the *update* datasets. Also includes the current
+generation number.
+* `artifact_put`: Put the request body into the artifact store.
+Rejects the request if the artifact does not belong to the current
+configuration.
+* `artifact_copy_from_depot`: Sends a request to another Sled Agent (via
+the *TUF Repo Depot API*; see below) to fetch an artifact. The base
+URL for the source sled is chosen by the requester. This API responds
+after a successful HTTP response from the source sled and the copy
+proceeds asynchronously. Rejects the request if the artifact does not
+belong to the current configuration.
+
+Sled Agent also spawns another Dropshot API server called the *TUF Repo
+Depot API* which offers one API on the underlay network, intended for
+other Sled Agents:
+
+* `artifact_get_by_sha256`: Get the content of an artifact.
+
+In an asynchronous task called the _delete reconciler_, Sled Agent
+periodically scans the *update* datasets for artifacts that are not
+part of the present configuration and deletes them. Prior to each
+filesystem operation the task checks the configuration for presence of
+that artifact hash. The delete reconciler then waits for an artifact
+configuration change before running again.
+
+== Nexus implementation
+
+Nexus has a `tuf_artifact_replication` background task which runs this
+reliable persistent workflow:
+
+1. Collect the artifact configuration (the list of artifact hashes, and
+the current generation number) from the database.
+2. Call `artifact_config_put` on all sleds. Stop if any sled rejects the
+configuration (our information is already out of date).
+3. Call `artifact_list` on all sleds. Stop if any sled informs us of a
+newer generation number.
+4. Delete any local copies of repositories where all artifacts are
+sufficiently replicated across sleds. ("Sufficiently replicated"
+currently means that at least 3 sleds each have at least one copy.)
+5. For any artifacts this Nexus has a local copy of, send `artifact_put`
+requests to N random sleds, where N is the number of puts required to
+sufficiently replicate the artifact.
+6. Send `artifact_copy_from_depot` requests to all remaining sleds
+missing copies of an artifact. Nexus chooses the source sled randomly
+out of the list of sleds that have a copy of the artifact.
+
+In each task execution, Nexus will attempt to do all possible work
+that leads to every sled having a copy of every artifact. In the absence
+of random I/O errors, a repository will be fully replicated across
+all sleds in the system in the first execution, and the Nexus-local
+copy of the repository will be deleted in the second execution.
+`artifact_copy_from_depot` requests that require the presence of an
+artifact on a sled that does not yet have it are scheduled after all
+`artifact_put` requests complete.
+
+== Preventing conflicts and loss of artifacts
+
+The artifact configuration is used to prevent conflicts that may be
+caused by two Nexus instances running the `tuf_artifact_replication`
+background task simultaneously with different information. The worst
+case scenario for a conflict is the total loss of an artifact across the
+system, although there are lesser evils as well. This section describes
+a number of possible faults and the mitigations taken.
+
+=== Recently-uploaded repositories and artifact deletion
+
+When Sled Agent receives an artifact configuration change, the delete
+reconciler task begins scanning the *update* datasets for artifacts that
+are no longer required and deletes them.
+
+Nexus maintains its local copy of recently-uploaded repositories
+until it confirms (via the `artifact_list` operation) that all of the
+artifacts in the repository are sufficiently replicated (currently, at
+least 3 sleds each have at least 1 copy).
+
+If the `artifact_list` operation listed any artifacts that could be
+deleted asynchronously, Nexus could incorrectly assume that an artifact
+is sufficiently replicated when it is not. This could happen if a
+repository is deleted, and another repository containing the same
+artifact is uploaded while another Nexus is running the background task.
+
+The artifact configuration is designed to mitigate this. The
+`artifact_list` operation filters the list of artifacts to contain
+only artifacts present in the current configuration. The delete
+reconciler decides whether to delete a file by re-checking the current
+configuration.
+
+When Nexus receives the `artifact_list` response, it verifies that
+the generation number reported is the same as the configuration it put
+earlier in the same task execution. Because the response only contains
+artifacts belonging to the current configuration, and that list of
+artifacts is based on the same configuration Nexus believes is current,
+it can trust that none of those artifacts are about to be deleted and
+safely delete local copies of sufficiently-replicated artifacts.
+
+=== Loss of all sleds with the only copy
+
+There are two potential situations where we could lose the only copy of
+an artifact. The first is a Nexus instance crashing or being replaced
+before a local artifact can be put to any sleds. Crashes are difficult
+to mitigate, as artifacts are currently stored in randomly-named
+temporary directories that are non-trivial to recover on startup;
+consequently there is no mitigation for this problem today. During
+graceful removal of Nexus zones, a quiesced Nexus (see <<rfd459>> and
+<<omicron5677>>) should remain alive until all local artifacts are
+sufficiently replicated.
+
+The second potential situation is a loss of all sleds that an artifact
+is copied to after Nexus deletes its local copy. This is mostly
+mitigated by Nexus attempting to fully replicate all artifacts onto
+all sleds in every execution of the background task; if there are no
+I/O errors, it only takes one task execution to ensure a repository is
+present across the entire system.
+
+=== Unnecessary work
+
+`artifact_put` and `artifact_copy_from_depot` requests include the
+current generation as a query string parameter. If the generation does
+not match the current configuration, or the artifact is not present in
+the configuration, Sled Agent rejects the request.
+
+[bibliography]
+== References
+
+* [[[rfd424]]] Oxide Computer Company.
+https://rfd.shared.oxide.computer/rfd/424[TUF Repo Depot].
+* [[[rfd459]]] Oxide Computer Company.
+https://rfd.shared.oxide.computer/rfd/459[Control plane component lifecycle].
+* [[[omicron5677]]] oxidecomputer/omicron.
+https://github.com/oxidecomputer/omicron/issues/5677[nexus 'quiesce' support].
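To make the replication arithmetic in the Nexus workflow above concrete, here is a small sketch (hypothetical helper names; the 3-copy threshold is the one stated in the document):

    /// "Sufficiently replicated" per the document: at least 3 sleds each
    /// hold at least one copy. (Hypothetical helpers, not Nexus code.)
    const SUFFICIENT_REPLICATION: usize = 3;

    /// How many `artifact_put` requests Nexus should issue for an artifact
    /// that `sleds_with_copy` sleds already report having.
    fn puts_needed(sleds_with_copy: usize) -> usize {
        SUFFICIENT_REPLICATION.saturating_sub(sleds_with_copy)
    }

    /// Whether Nexus may delete its local copy of a repository: every
    /// artifact in it must already be sufficiently replicated. Each entry
    /// is the number of sleds reporting a copy of one artifact.
    fn can_drop_local_repo(sleds_with_copy_per_artifact: &[usize]) -> bool {
        sleds_with_copy_per_artifact
            .iter()
            .all(|&sleds| sleds >= SUFFICIENT_REPLICATION)
    }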

nexus/db-model/src/schema.rs (+8)

@@ -1393,6 +1393,7 @@ table! {
         time_created -> Timestamptz,
         sha256 -> Text,
         artifact_size -> Int8,
+        generation_added -> Int8,
     }
 }

@@ -1411,6 +1412,13 @@ allow_tables_to_appear_in_same_query!(
 joinable!(tuf_repo_artifact -> tuf_repo (tuf_repo_id));
 joinable!(tuf_repo_artifact -> tuf_artifact (tuf_artifact_id));

+table! {
+    tuf_generation (singleton) {
+        singleton -> Bool,
+        generation -> Int8,
+    }
+}
+
 table! {
     target_release (generation) {
         generation -> Int8,
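The new `tuf_generation` table is the singleton that backs the Nexus-owned generation number. A sketch of how it might be bumped through Diesel when the set of TUF repos changes (illustrative only; `bump_tuf_generation` and the synchronous `PgConnection` are assumptions, not the actual Nexus datastore code):

    use diesel::prelude::*;

    /// Increment the singleton generation row. Per the commit description,
    /// Nexus does this whenever the set of TUF repos changes; this
    /// standalone helper is only a sketch of that operation.
    fn bump_tuf_generation(conn: &mut PgConnection) -> QueryResult<usize> {
        use nexus_db_model::schema::tuf_generation::dsl;

        diesel::update(dsl::tuf_generation.filter(dsl::singleton.eq(true)))
            .set(dsl::generation.eq(dsl::generation + 1_i64))
            .execute(conn)
    }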

nexus/db-model/src/schema_versions.rs (+2 -1)

@@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
 ///
 /// This must be updated when you change the database schema. Refer to
 /// schema/crdb/README.adoc in the root of this repository for details.
-pub const SCHEMA_VERSION: Version = Version::new(130, 0, 0);
+pub const SCHEMA_VERSION: Version = Version::new(131, 0, 0);

 /// List of all past database schema versions, in *reverse* order
 ///
@@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
     // | leaving the first copy as an example for the next person.
     // v
     // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
+    KnownVersion::new(131, "tuf-generation"),
     KnownVersion::new(130, "bp-sled-agent-generation"),
     KnownVersion::new(129, "create-target-release"),
     KnownVersion::new(128, "sled-resource-for-vmm"),
