Skip to content

Commit 14b0892

Browse files
authoredApr 10, 2024··
server: return better HTTP errors when not ensured (#649)
Currently, if the `propolis-server` APIs that require a running instance are called when the server has not received an `InstanceEnsure` request, we return a 404. Sled-agent and Nexus handle this status code by assuming *they* have done something wrong, such as attempting to call an API route that the server doesn't support. This results in issues where sled-agent assumes that it's the one that made a mistake when it tries to call APIs on a `propolis-server` that has lost its VM instance due to being restarted after a panic. To make it easier for callers distinguish between errors that indicate "this server doesn't know about a running instance" and errors that indicate "you attempted to call an API route that I don't know about", I've changed the status code returned by `propolis-server`s that haven't created an instance to HTTP [424 Failed Dependency]. The best thing about returning this status code is that it's "not 404", but it does arguably have the most correct semantics for this specific error: > The HTTP 424 Failed Dependency client error response code indicates > that the method could not be performed on the resource because the > requested action depended on another action, and that action failed. On the other hand, this is kind of a weird status code that's mostly used by WebDAV servers, so I'm open to using something else. We could alternatively return 410 Gone in these cases, but that status is already returned by servers whose VM instance has been deleted. I thought it might be nice to be able to distinguish between "not ensured" and "ensured, and then deleted"...but we could switch to 410 Gone here, too. In addition, I've added an `enum` in `propolis-api-types` that's used to generate Dropshot "error code" strings in HTTP error responses. This can be used by clients to programmatically inspect the error's cause. [424 Failed Dependency]: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/424
1 parent 8ff3ab6 commit 14b0892

File tree

3 files changed

+78
-36
lines changed

3 files changed

+78
-36
lines changed
 

‎bin/propolis-server/src/lib/server.rs

+30-32
Original file line numberDiff line numberDiff line change
@@ -296,12 +296,7 @@ impl DropshotEndpointContext {
296296
self.services.vm.lock().await,
297297
VmControllerState::as_controller,
298298
)
299-
.map_err(|_| {
300-
HttpError::for_not_found(
301-
None,
302-
"Server not initialized (no instance)".to_string(),
303-
)
304-
})
299+
.map_err(|_| not_created_error())
305300
}
306301
}
307302

@@ -576,19 +571,21 @@ async fn instance_ensure_common(
576571
{
577572
let existing_properties = existing.properties();
578573
if existing_properties.id != properties.id {
579-
return Err(HttpError::for_status(
580-
Some(format!(
574+
return Err(HttpError::for_client_error(
575+
Some(api::ErrorCode::AlreadyInitialized.to_string()),
576+
http::status::StatusCode::CONFLICT,
577+
format!(
581578
"Server already initialized with ID {}",
582579
existing_properties.id
583-
)),
584-
http::status::StatusCode::CONFLICT,
580+
),
585581
));
586582
}
587583

588584
if *existing_properties != properties {
589-
return Err(HttpError::for_status(
590-
Some("Cannot update running server".to_string()),
585+
return Err(HttpError::for_client_error(
586+
Some(api::ErrorCode::AlreadyRunning.to_string()),
591587
http::status::StatusCode::CONFLICT,
588+
"Cannot update running server".to_string(),
592589
));
593590
}
594591

@@ -658,10 +655,11 @@ async fn instance_ensure_common(
658655
vm_hdl.await.unwrap()
659656
}
660657
.map_err(|e| {
661-
HttpError::for_internal_error(format!(
662-
"failed to create instance: {}",
663-
e
664-
))
658+
HttpError::for_client_error(
659+
Some(api::ErrorCode::CreateFailed.to_string()),
660+
http::status::StatusCode::INTERNAL_SERVER_ERROR,
661+
format!("failed to create instance: {e}"),
662+
)
665663
})?;
666664

667665
if let Some(ramfb) = vm.framebuffer() {
@@ -785,10 +783,7 @@ async fn instance_get_common(
785783
) -> Result<(api::Instance, VersionedInstanceSpec), HttpError> {
786784
let ctx = rqctx.context();
787785
match &*ctx.services.vm.lock().await {
788-
VmControllerState::NotCreated => Err(HttpError::for_not_found(
789-
None,
790-
"Server not initialized (no instance)".to_string(),
791-
)),
786+
VmControllerState::NotCreated => Err(not_created_error()),
792787
VmControllerState::Created(vm) => {
793788
Ok((
794789
api::Instance {
@@ -862,10 +857,7 @@ async fn instance_state_monitor(
862857
let vm_state = ctx.services.vm.lock().await;
863858
match &*vm_state {
864859
VmControllerState::NotCreated => {
865-
return Err(HttpError::for_not_found(
866-
None,
867-
"Server not initialized (no instance)".to_string(),
868-
));
860+
return Err(not_created_error());
869861
}
870862
VmControllerState::Created(vm) => vm.state_watcher().clone(),
871863
VmControllerState::Destroyed { state_watcher, .. } => {
@@ -885,12 +877,13 @@ async fn instance_state_monitor(
885877
// means it will never reach the number the client wants it to reach.
886878
// Inform the client of this condition so it doesn't wait forever.
887879
state_watcher.changed().await.map_err(|_| {
888-
HttpError::for_status(
889-
Some(format!(
880+
HttpError::for_client_error(
881+
Some(api::ErrorCode::NoInstance.to_string()),
882+
http::status::StatusCode::GONE,
883+
format!(
890884
"No instance present; will never reach generation {}",
891885
gen
892-
)),
893-
http::status::StatusCode::GONE,
886+
),
894887
)
895888
})?;
896889
}
@@ -1046,10 +1039,7 @@ async fn instance_migrate_status(
10461039
let migration_id = path_params.into_inner().migration_id;
10471040
let ctx = rqctx.context();
10481041
match &*ctx.services.vm.lock().await {
1049-
VmControllerState::NotCreated => Err(HttpError::for_not_found(
1050-
None,
1051-
"Server not initialized (no instance)".to_string(),
1052-
)),
1042+
VmControllerState::NotCreated => Err(not_created_error()),
10531043
VmControllerState::Created(vm) => {
10541044
vm.migrate_status(migration_id).map_err(Into::into).map(|state| {
10551045
HttpResponseOk(api::InstanceMigrateStatusResponse {
@@ -1206,6 +1196,14 @@ pub fn api() -> ApiDescription<Arc<DropshotEndpointContext>> {
12061196
api
12071197
}
12081198

1199+
fn not_created_error() -> HttpError {
1200+
HttpError::for_client_error(
1201+
Some(api::ErrorCode::NoInstance.to_string()),
1202+
http::StatusCode::FAILED_DEPENDENCY,
1203+
"Server not initialized (no instance)".to_string(),
1204+
)
1205+
}
1206+
12091207
#[cfg(test)]
12101208
mod tests {
12111209
#[test]

‎crates/propolis-api-types/src/lib.rs

+45-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
//! Definitions for types exposed by the propolis-server API
66
7-
use std::net::SocketAddr;
7+
use std::{fmt, net::SocketAddr};
88

99
use schemars::JsonSchema;
1010
use serde::{Deserialize, Serialize};
@@ -395,3 +395,47 @@ pub struct SnapshotRequestPathParams {
395395
pub struct VCRRequestPathParams {
396396
pub id: Uuid,
397397
}
398+
399+
/// Error codes used to populate the `error_code` field of Dropshot API responses.
400+
#[derive(
401+
Clone, Copy, Debug, Deserialize, PartialEq, Eq, Serialize, JsonSchema,
402+
)]
403+
pub enum ErrorCode {
404+
/// This `propolis-server` process has not received an `InstanceEnsure`
405+
/// request yet.
406+
NoInstance,
407+
/// This `propolis-server` process has already received an `InstanceEnsure`
408+
/// request with a different ID.
409+
AlreadyInitialized,
410+
/// Cannot update a running server.
411+
AlreadyRunning,
412+
/// Instance creation failed
413+
CreateFailed,
414+
}
415+
416+
impl fmt::Display for ErrorCode {
417+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
418+
fmt::Debug::fmt(self, f)
419+
}
420+
}
421+
422+
impl std::str::FromStr for ErrorCode {
423+
type Err = &'static str;
424+
fn from_str(s: &str) -> Result<Self, Self::Err> {
425+
match s.trim() {
426+
s if s.eq_ignore_ascii_case("NoInstance") => Ok(Self::NoInstance),
427+
s if s.eq_ignore_ascii_case("AlreadyInitialized") => {
428+
Ok(ErrorCode::AlreadyInitialized)
429+
}
430+
s if s.eq_ignore_ascii_case("AlreadyRunning") => {
431+
Ok(ErrorCode::AlreadyRunning)
432+
}
433+
s if s.eq_ignore_ascii_case("CreateFailed") => {
434+
Ok(ErrorCode::CreateFailed)
435+
}
436+
_ => Err("unknown error code, expected one of: \
437+
'NoInstance', 'AlreadyInitialized', 'AlreadyRunning', \
438+
'CreateFailed'"),
439+
}
440+
}
441+
}

‎phd-tests/tests/src/server_state_machine.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -37,15 +37,15 @@ async fn instance_stop_causes_destroy_test(ctx: &Framework) {
3737

3838
assert!(matches!(
3939
vm.run().await.unwrap_err().status().unwrap(),
40-
http::status::StatusCode::NOT_FOUND
40+
http::status::StatusCode::FAILED_DEPENDENCY
4141
));
4242
assert!(matches!(
4343
vm.stop().await.unwrap_err().status().unwrap(),
44-
http::status::StatusCode::NOT_FOUND
44+
http::status::StatusCode::FAILED_DEPENDENCY
4545
));
4646
assert!(matches!(
4747
vm.reset().await.unwrap_err().status().unwrap(),
48-
http::status::StatusCode::NOT_FOUND
48+
http::status::StatusCode::FAILED_DEPENDENCY
4949
));
5050
}
5151

0 commit comments

Comments
 (0)
Please sign in to comment.