Skip to content

Commit ae8630f

Browse files
leftwoAlan Hanson
and
Alan Hanson
authored
Verify extent under repair is valid after copying files (#1159)
Verify extent under repair is valid after copying files Added a call to the downstairs repair API that will verify that an extent is closed (or the region is read only), meaning the extent is okay to copy. This prevents a rare but possible condition where an extent is closed when a remote side contacts it for repair, but is re-opened before the extent file copy starts and the resulting copy is invalid. Keep a mutex for the repair task --------- Co-authored-by: Alan Hanson <alan@oxide.computer>
1 parent 06c595c commit ae8630f

File tree

8 files changed

+168
-6
lines changed

8 files changed

+168
-6
lines changed

Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

downstairs/src/lib.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ mod stats;
4747
mod extent_inner_raw;
4848
mod extent_inner_sqlite;
4949

50+
use extent::ExtentState;
5051
use region::Region;
5152

5253
pub use admin::run_dropshot;
@@ -3211,7 +3212,7 @@ pub async fn start_downstairs(
32113212
let repair_log = d.lock().await.log.new(o!("task" => "repair".to_string()));
32123213

32133214
let repair_listener =
3214-
match repair::repair_main(&dss, repair_address, &repair_log).await {
3215+
match repair::repair_main(dss, repair_address, &repair_log).await {
32153216
Err(e) => {
32163217
// TODO tear down other things if repair server can't be
32173218
// started?

downstairs/src/region.rs

+26-1
Original file line numberDiff line numberDiff line change
@@ -507,7 +507,9 @@ impl Region {
507507
*
508508
* Let us assume we are repairing extent 012
509509
* 1. Make new 012.copy dir (extent name plus: .copy)
510-
* 2. Get all extent files from source side, put in 012.copy directory
510+
* 2. Get all extent files from source side, put in 012.copy directory.
511+
* Verify after the copy completes that the remote side still has
512+
* the extent closed (or the region is read only).
511513
* 3. fsync files we just downloaded
512514
* 4. Rename 012.copy dir to 012.replace dir
513515
* 5. fsync extent directory ( 00/000/ where the extent files live)
@@ -670,6 +672,29 @@ impl Region {
670672
count += 1;
671673
}
672674

675+
// We have copied over the extent. Verify that the remote side
676+
// still believes that it is valid for repair so we know we have
677+
// received a valid copy.
678+
info!(self.log, "Verify extent {eid} still ready for copy");
679+
let rc = match repair_server.extent_repair_ready(eid as u32).await {
680+
Ok(rc) => rc.into_inner(),
681+
Err(e) => {
682+
crucible_bail!(
683+
RepairRequestError,
684+
"Failed to verify extent is still valid for repair {:?}",
685+
e,
686+
);
687+
}
688+
};
689+
690+
if !rc {
691+
warn!(self.log, "The repair of {eid} is being aborted.");
692+
crucible_bail!(
693+
RepairRequestError,
694+
"Extent {eid} on remote repair server is in incorrect state",
695+
);
696+
}
697+
673698
// After we have all files: move the repair dir.
674699
info!(
675700
self.log,

downstairs/src/repair.rs

+30-2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ pub struct FileServerContext {
2525
region_dir: PathBuf,
2626
read_only: bool,
2727
region_definition: RegionDefinition,
28+
downstairs: Arc<Mutex<Downstairs>>,
2829
}
2930

3031
pub fn write_openapi<W: Write>(f: &mut W) -> Result<()> {
@@ -39,13 +40,14 @@ fn build_api() -> ApiDescription<Arc<FileServerContext>> {
3940
api.register(get_files_for_extent).unwrap();
4041
api.register(get_region_info).unwrap();
4142
api.register(get_region_mode).unwrap();
43+
api.register(extent_repair_ready).unwrap();
4244

4345
api
4446
}
4547

4648
/// Returns Ok(listen address) if everything launched ok, Err otherwise
4749
pub async fn repair_main(
48-
ds: &Mutex<Downstairs>,
50+
downstairs: Arc<Mutex<Downstairs>>,
4951
addr: SocketAddr,
5052
log: &Logger,
5153
) -> Result<SocketAddr, String> {
@@ -67,7 +69,7 @@ pub async fn repair_main(
6769
* Record the region directory where all the extents and metadata
6870
* files live.
6971
*/
70-
let ds = ds.lock().await;
72+
let ds = downstairs.lock().await;
7173
let region_dir = ds.region.dir.clone();
7274
let read_only = ds.read_only;
7375
let region_definition = ds.region.def();
@@ -78,6 +80,7 @@ pub async fn repair_main(
7880
region_dir,
7981
read_only,
8082
region_definition,
83+
downstairs,
8184
};
8285

8386
/*
@@ -186,6 +189,31 @@ async fn get_a_file(path: PathBuf) -> Result<Response<Body>, HttpError> {
186189
}
187190
}
188191

192+
/// Return true if the provided extent is closed or the region is read only
193+
#[endpoint {
194+
method = GET,
195+
path = "/extent/{eid}/repair-ready",
196+
}]
197+
async fn extent_repair_ready(
198+
rqctx: RequestContext<Arc<FileServerContext>>,
199+
path: Path<Eid>,
200+
) -> Result<HttpResponseOk<bool>, HttpError> {
201+
let eid: usize = path.into_inner().eid as usize;
202+
let downstairs = &rqctx.context().downstairs;
203+
204+
// If the region is read only, the extent is always ready.
205+
if rqctx.context().read_only {
206+
return Ok(HttpResponseOk(true));
207+
}
208+
209+
let res = {
210+
let ds = downstairs.lock().await;
211+
matches!(ds.region.extents[eid], ExtentState::Closed)
212+
};
213+
214+
Ok(HttpResponseOk(res))
215+
}
216+
189217
/**
190218
* Get the list of files related to an extent.
191219
*

integration_tests/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ futures.workspace = true
2626
hex.workspace = true
2727
httptest.workspace = true
2828
rand.workspace = true
29+
repair-client.workspace = true
2930
reqwest.workspace = true
3031
serde.workspace = true
3132
serde_json.workspace = true

integration_tests/src/lib.rs

+69
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ mod test {
1616
use futures::lock::Mutex;
1717
use httptest::{matchers::*, responders::*, Expectation, Server};
1818
use rand::Rng;
19+
use repair_client::Client;
1920
use sha2::Digest;
2021
use slog::{info, o, warn, Drain, Logger};
2122
use tempfile::*;
@@ -2818,6 +2819,74 @@ mod test {
28182819
Ok(())
28192820
}
28202821

2822+
#[tokio::test]
2823+
async fn integration_test_repair_ready_not_closed() -> Result<()> {
2824+
// Create a new downstairs.
2825+
// Verify repair ready returns false when an extent is open
2826+
2827+
// Create a downstairs
2828+
let new_ds = TestDownstairs::new(
2829+
"127.0.0.1".parse().unwrap(),
2830+
true, // encrypted
2831+
false, // read only
2832+
5,
2833+
2,
2834+
false,
2835+
Backend::RawFile,
2836+
)
2837+
.await
2838+
.unwrap();
2839+
2840+
let repair_addr = new_ds.repair_address().await;
2841+
2842+
let url = format!("http://{:?}", repair_addr);
2843+
let repair_server = Client::new(&url);
2844+
2845+
// Extent are open, so the repair ready request should return false.
2846+
let rc = repair_server.extent_repair_ready(0).await;
2847+
assert!(!rc.unwrap().into_inner());
2848+
2849+
let rc = repair_server.extent_repair_ready(1).await;
2850+
assert!(!rc.unwrap().into_inner());
2851+
2852+
Ok(())
2853+
}
2854+
2855+
#[tokio::test]
2856+
async fn integration_test_repair_ready_read_only() -> Result<()> {
2857+
// Create a new downstairs.
2858+
// Verify repair ready returns true for read only downstairs,
2859+
// even when extents are open.
2860+
2861+
// Create a downstairs
2862+
let new_ds = TestDownstairs::new(
2863+
"127.0.0.1".parse().unwrap(),
2864+
true, // encrypted
2865+
true, // read only
2866+
5,
2867+
2,
2868+
false,
2869+
Backend::RawFile,
2870+
)
2871+
.await
2872+
.unwrap();
2873+
2874+
let repair_addr = new_ds.repair_address().await;
2875+
2876+
let url = format!("http://{:?}", repair_addr);
2877+
let repair_server = Client::new(&url);
2878+
2879+
// Extent are not open, but the region is read only, so the requests
2880+
// should all return true.
2881+
let rc = repair_server.extent_repair_ready(0).await;
2882+
assert!(rc.unwrap().into_inner());
2883+
2884+
let rc = repair_server.extent_repair_ready(1).await;
2885+
assert!(rc.unwrap().into_inner());
2886+
2887+
Ok(())
2888+
}
2889+
28212890
#[tokio::test]
28222891
async fn integration_test_clone_diff_ec() -> Result<()> {
28232892
// Test downstairs region clone.

openapi/downstairs-repair.json

+37
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,43 @@
4646
}
4747
}
4848
},
49+
"/extent/{eid}/repair-ready": {
50+
"get": {
51+
"summary": "Return true if the provided extent is closed or the region is read only",
52+
"operationId": "extent_repair_ready",
53+
"parameters": [
54+
{
55+
"in": "path",
56+
"name": "eid",
57+
"required": true,
58+
"schema": {
59+
"type": "integer",
60+
"format": "uint32",
61+
"minimum": 0
62+
}
63+
}
64+
],
65+
"responses": {
66+
"200": {
67+
"description": "successful operation",
68+
"content": {
69+
"application/json": {
70+
"schema": {
71+
"title": "Boolean",
72+
"type": "boolean"
73+
}
74+
}
75+
}
76+
},
77+
"4XX": {
78+
"$ref": "#/components/responses/Error"
79+
},
80+
"5XX": {
81+
"$ref": "#/components/responses/Error"
82+
}
83+
}
84+
}
85+
},
4986
"/newextent/{eid}/{file_type}": {
5087
"get": {
5188
"operationId": "get_extent_file",

upstairs/src/client.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -1271,8 +1271,8 @@ impl DownstairsClient {
12711271
if old_state != IOState::InProgress {
12721272
// This job is in an unexpected state.
12731273
panic!(
1274-
"[{}] Job {} completed while not InProgress: {:?}",
1275-
self.client_id, ds_id, old_state
1274+
"[{}] Job {} completed while not InProgress: {:?} {:?}",
1275+
self.client_id, ds_id, old_state, job
12761276
);
12771277
}
12781278

0 commit comments

Comments
 (0)