Skip to content

Commit cb8fb15

Browse files
authored
Add explicit Stopping state (oxidecomputer#1570)
There's an unfortunate ambiguity in certain states (e.g. `DsState::Faulted`), which represent two different things:

- We've stopped the IO task due to a fault, and are waiting for it to restart
- The IO task has restarted, and we're doing negotiation from a faulted state (i.e. will do live-repair)

This PR adds a new `DsState::Stopping(ClientStopReason)` state, which represents the former. The previous states (e.g. `DsState::Faulted`) now _only_ mean that we're doing negotiation.

The new state subsumes `DsState::Deactivated`, `DsState::Replacing`, and `DsState::Disabled`, which were specialized states that waited for the IO task to exit. Each of those states is now `DsState::Stopping(..)` with an appropriate `ClientStopReason`.

The vast majority of this PR is automatic OpenAPI changes. I don't think anyone is relying on the specific shape of `DsState` (which is only used in `UpstairsInfo` / the `info` endpoint), but please let me know if I'm wrong!
1 parent 01ab087 commit cb8fb15

File tree

8 files changed

+611
-280
lines changed

8 files changed

+611
-280
lines changed

cmon/src/main.rs

+11-6
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use strum::IntoEnumIterator;
88
use strum_macros::EnumIter;
99
use tokio::time::{sleep, Duration};
1010

11-
use crucible::{Arg, DsState};
11+
use crucible::{Arg, ClientStopReason, DsState};
1212

1313
/// Connect to crucible control server
1414
#[derive(Parser, Debug)]
@@ -87,18 +87,23 @@ enum Action {
8787
// Translate a DsState into a three letter string for printing.
8888
fn short_state(dss: DsState) -> String {
8989
match dss {
90-
DsState::New => "NEW".to_string(),
90+
DsState::New
91+
| DsState::Stopping(ClientStopReason::NegotiationFailed(..)) => {
92+
"NEW".to_string()
93+
}
9194
DsState::WaitActive => "WAC".to_string(),
9295
DsState::WaitQuorum => "WAQ".to_string(),
9396
DsState::Reconcile => "REC".to_string(),
9497
DsState::Active => "ACT".to_string(),
95-
DsState::Faulted => "FLT".to_string(),
98+
DsState::Faulted | DsState::Stopping(ClientStopReason::Fault(..)) => {
99+
"FLT".to_string()
100+
}
96101
DsState::LiveRepairReady => "LRR".to_string(),
97102
DsState::LiveRepair => "LR".to_string(),
98103
DsState::Offline => "OFF".to_string(),
99-
DsState::Deactivated => "DAV".to_string(),
100-
DsState::Disabled => "DIS".to_string(),
101-
DsState::Replacing => "RPC".to_string(),
104+
DsState::Stopping(ClientStopReason::Deactivated) => "DAV".to_string(),
105+
DsState::Stopping(ClientStopReason::Disabled) => "DIS".to_string(),
106+
DsState::Stopping(ClientStopReason::Replacing) => "RPC".to_string(),
102107
DsState::Replaced => "RPD".to_string(),
103108
}
104109
}

openapi/crucible-control.json

+327-15
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,173 @@
6565
"acked"
6666
]
6767
},
68+
"ClientFaultReason": {
69+
"description": "Subset of [`ClientStopReason`] for faulting a client",
70+
"oneOf": [
71+
{
72+
"type": "string",
73+
"enum": [
74+
"requested_fault"
75+
]
76+
},
77+
{
78+
"description": "Received an error from some non-recoverable IO (write or flush)",
79+
"type": "string",
80+
"enum": [
81+
"i_o_error"
82+
]
83+
},
84+
{
85+
"description": "Live-repair failed",
86+
"type": "string",
87+
"enum": [
88+
"failed_live_repair"
89+
]
90+
},
91+
{
92+
"description": "Too many jobs in the queue",
93+
"type": "string",
94+
"enum": [
95+
"too_many_outstanding_jobs"
96+
]
97+
},
98+
{
99+
"description": "Too many bytes in the queue",
100+
"type": "string",
101+
"enum": [
102+
"too_many_outstanding_bytes"
103+
]
104+
},
105+
{
106+
"description": "The upstairs has requested that we deactivate when we were offline",
107+
"type": "string",
108+
"enum": [
109+
"offline_deactivated"
110+
]
111+
},
112+
{
113+
"description": "The Upstairs has dropped jobs that would be needed for replay",
114+
"type": "string",
115+
"enum": [
116+
"ineligible_for_replay"
117+
]
118+
}
119+
]
120+
},
121+
"ClientNegotiationFailed": {
122+
"description": "Subset of [`ClientStopReason`] for a failed negotiation",
123+
"oneOf": [
124+
{
125+
"description": "Reconcile failed and we're restarting",
126+
"type": "string",
127+
"enum": [
128+
"failed_reconcile"
129+
]
130+
},
131+
{
132+
"description": "Negotiation message received out of order",
133+
"type": "string",
134+
"enum": [
135+
"bad_negotiation_order"
136+
]
137+
},
138+
{
139+
"description": "Negotiation says that we are incompatible",
140+
"type": "string",
141+
"enum": [
142+
"incompatible"
143+
]
144+
}
145+
]
146+
},
147+
"ClientStopReason": {
148+
"description": "When the upstairs halts the IO client task, it must provide a reason",
149+
"oneOf": [
150+
{
151+
"description": "We are about to replace the client task",
152+
"type": "object",
153+
"properties": {
154+
"type": {
155+
"type": "string",
156+
"enum": [
157+
"replacing"
158+
]
159+
}
160+
},
161+
"required": [
162+
"type"
163+
]
164+
},
165+
{
166+
"description": "We have disabled the downstairs client for some reason\n\n(for example, we have received `Message::YouAreNoLongerActive`)",
167+
"type": "object",
168+
"properties": {
169+
"type": {
170+
"type": "string",
171+
"enum": [
172+
"disabled"
173+
]
174+
}
175+
},
176+
"required": [
177+
"type"
178+
]
179+
},
180+
{
181+
"description": "The upstairs has requested that we deactivate",
182+
"type": "object",
183+
"properties": {
184+
"type": {
185+
"type": "string",
186+
"enum": [
187+
"deactivated"
188+
]
189+
}
190+
},
191+
"required": [
192+
"type"
193+
]
194+
},
195+
{
196+
"description": "Something went wrong during negotiation",
197+
"type": "object",
198+
"properties": {
199+
"type": {
200+
"type": "string",
201+
"enum": [
202+
"negotiation_failed"
203+
]
204+
},
205+
"value": {
206+
"$ref": "#/components/schemas/ClientNegotiationFailed"
207+
}
208+
},
209+
"required": [
210+
"type",
211+
"value"
212+
]
213+
},
214+
{
215+
"description": "We have explicitly faulted the client",
216+
"type": "object",
217+
"properties": {
218+
"type": {
219+
"type": "string",
220+
"enum": [
221+
"fault"
222+
]
223+
},
224+
"value": {
225+
"$ref": "#/components/schemas/ClientFaultReason"
226+
}
227+
},
228+
"required": [
229+
"type",
230+
"value"
231+
]
232+
}
233+
]
234+
},
68235
"DownstairsWork": {
69236
"description": "`DownstairsWork` holds the information gathered from the downstairs",
70237
"type": "object",
@@ -88,21 +255,166 @@
88255
]
89256
},
90257
"DsState": {
91-
"type": "string",
92-
"enum": [
93-
"new",
94-
"wait_active",
95-
"wait_quorum",
96-
"reconcile",
97-
"active",
98-
"faulted",
99-
"live_repair_ready",
100-
"live_repair",
101-
"offline",
102-
"deactivated",
103-
"disabled",
104-
"replacing",
105-
"replaced"
258+
"oneOf": [
259+
{
260+
"type": "object",
261+
"properties": {
262+
"type": {
263+
"type": "string",
264+
"enum": [
265+
"new"
266+
]
267+
}
268+
},
269+
"required": [
270+
"type"
271+
]
272+
},
273+
{
274+
"type": "object",
275+
"properties": {
276+
"type": {
277+
"type": "string",
278+
"enum": [
279+
"wait_active"
280+
]
281+
}
282+
},
283+
"required": [
284+
"type"
285+
]
286+
},
287+
{
288+
"type": "object",
289+
"properties": {
290+
"type": {
291+
"type": "string",
292+
"enum": [
293+
"wait_quorum"
294+
]
295+
}
296+
},
297+
"required": [
298+
"type"
299+
]
300+
},
301+
{
302+
"type": "object",
303+
"properties": {
304+
"type": {
305+
"type": "string",
306+
"enum": [
307+
"reconcile"
308+
]
309+
}
310+
},
311+
"required": [
312+
"type"
313+
]
314+
},
315+
{
316+
"type": "object",
317+
"properties": {
318+
"type": {
319+
"type": "string",
320+
"enum": [
321+
"active"
322+
]
323+
}
324+
},
325+
"required": [
326+
"type"
327+
]
328+
},
329+
{
330+
"type": "object",
331+
"properties": {
332+
"type": {
333+
"type": "string",
334+
"enum": [
335+
"faulted"
336+
]
337+
}
338+
},
339+
"required": [
340+
"type"
341+
]
342+
},
343+
{
344+
"type": "object",
345+
"properties": {
346+
"type": {
347+
"type": "string",
348+
"enum": [
349+
"live_repair_ready"
350+
]
351+
}
352+
},
353+
"required": [
354+
"type"
355+
]
356+
},
357+
{
358+
"type": "object",
359+
"properties": {
360+
"type": {
361+
"type": "string",
362+
"enum": [
363+
"live_repair"
364+
]
365+
}
366+
},
367+
"required": [
368+
"type"
369+
]
370+
},
371+
{
372+
"type": "object",
373+
"properties": {
374+
"type": {
375+
"type": "string",
376+
"enum": [
377+
"offline"
378+
]
379+
}
380+
},
381+
"required": [
382+
"type"
383+
]
384+
},
385+
{
386+
"type": "object",
387+
"properties": {
388+
"type": {
389+
"type": "string",
390+
"enum": [
391+
"replaced"
392+
]
393+
}
394+
},
395+
"required": [
396+
"type"
397+
]
398+
},
399+
{
400+
"description": "The IO task for the client is being stopped",
401+
"type": "object",
402+
"properties": {
403+
"type": {
404+
"type": "string",
405+
"enum": [
406+
"stopping"
407+
]
408+
},
409+
"value": {
410+
"$ref": "#/components/schemas/ClientStopReason"
411+
}
412+
},
413+
"required": [
414+
"type",
415+
"value"
416+
]
417+
}
106418
]
107419
},
108420
"Error": {

0 commit comments

Comments
 (0)