From 3f486dcdc80c738d2f5124a4a8c437e3e9400f67 Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Sat, 27 May 2023 06:36:59 +0000 Subject: [PATCH 1/7] downstairs sqlite: use unix-excl VFS, removing SHM file --- downstairs/src/region.rs | 127 ++++++++++++--------------------- downstairs/src/repair.rs | 31 +++----- openapi/downstairs-repair.json | 1 - 3 files changed, 56 insertions(+), 103 deletions(-) diff --git a/downstairs/src/region.rs b/downstairs/src/region.rs index 844c7d1d2..2ba3a3a78 100644 --- a/downstairs/src/region.rs +++ b/downstairs/src/region.rs @@ -11,7 +11,7 @@ use tokio::sync::{Mutex, MutexGuard}; use anyhow::{anyhow, bail, Result}; use futures::TryStreamExt; use nix::unistd::{sysconf, SysconfVar}; -use rusqlite::{params, Connection}; +use rusqlite::{params, Connection, OpenFlags}; use serde::{Deserialize, Serialize}; use tracing::instrument; @@ -324,7 +324,6 @@ impl Default for ExtentMeta { pub enum ExtentType { Data, Db, - DbShm, DbWal, } @@ -333,7 +332,6 @@ impl fmt::Display for ExtentType { match self { ExtentType::Data => Ok(()), ExtentType::Db => write!(f, "db"), - ExtentType::DbShm => write!(f, "db-shm"), ExtentType::DbWal => write!(f, "db-wal"), } } @@ -348,7 +346,6 @@ impl ExtentType { match self { ExtentType::Data => FileType::Data, ExtentType::Db => FileType::Db, - ExtentType::DbShm => FileType::DbShm, ExtentType::DbWal => FileType::DbWal, } } @@ -362,7 +359,7 @@ pub fn extent_file_name(number: u32, extent_type: ExtentType) -> String { ExtentType::Data => { format!("{:03X}", number & 0xFFF) } - ExtentType::Db | ExtentType::DbShm | ExtentType::DbWal => { + ExtentType::Db | ExtentType::DbWal => { format!("{:03X}.{}", number & 0xFFF, extent_type) } } @@ -456,7 +453,7 @@ pub fn remove_copy_cleanup_dir>(dir: P, eid: u32) -> Result<()> { /** * Validate a list of sorted repair files. - * There are either two or four files we expect to find, any more or less + * There are either two or three files we expect to find, any more or less * and we have a bad list. No duplicates. */ pub fn validate_repair_files(eid: usize, files: &[String]) -> bool { @@ -468,10 +465,7 @@ pub fn validate_repair_files(eid: usize, files: &[String]) -> bool { ]; let mut all = some.clone(); - all.extend(vec![ - extent_file_name(eid, ExtentType::DbShm), - extent_file_name(eid, ExtentType::DbWal), - ]); + all.extend(vec![extent_file_name(eid, ExtentType::DbWal)]); // Either we have some or all. files == some || files == all @@ -480,7 +474,25 @@ pub fn validate_repair_files(eid: usize, files: &[String]) -> bool { /// Always open sqlite with journaling, and synchronous. /// Note: these pragma_updates are not durable fn open_sqlite_connection>(path: &P) -> Result { - let metadb = Connection::open(path)?; + // By default, sqlite uses these shm files for synchronization to support + // DB access from multiple processes. We don't need that at all in crucible; + // we are only going to access the DB from the single process. So we change + // it to unix-excl which prevents other processes from accessing the DB, and + // keeps everything in the heap, but still allows multiple connections from + // in-process. If we ultimately decide we don't want to hold multiple + // connections per DB (and thus, we don't want any parallelism of SQLite + // operations within an extent), we could change this to unix-none to forego + // locking altogether. + // + // See https://www.sqlite.org/vfs.html#standard_unix_vfses for more info. + // metadb.pragma_update(None, "", "")?; + let metadb: Connection = Connection::open_with_flags_and_vfs( + path, + OpenFlags::default(), + "unix-excl", + )?; + + // let metadb = Connection::open(path)?; assert!(metadb.is_autocommit()); metadb.pragma_update(None, "journal_mode", "WAL")?; @@ -1696,9 +1708,9 @@ impl Region { * from 012.replace dir * 7. Remove any files in extent dir that don't exist in replacing dir * For example, if the replacement extent has 012 and 012.db, but - * the current (bad) extent has 012 012.db 012.db-shm - * and 012.db-wal, we want to remove the 012.db-shm and 012.db-wal - * files when we replace 012 and 012.db with the new versions. + * the current (bad) extent has 012 012.db + * and 012.db-wal, we want to remove the 012.db-wal + * file when we replace 012 and 012.db with the new versions. * 8. fsync files after copying them (new location). * 9. fsync containing extent dir * 10. Rename 012.replace dir to 012.completed dir. @@ -1790,7 +1802,7 @@ impl Region { // The repair file list should always contain the extent data // file itself, and the .db file (metadata) for that extent. // Missing these means the repair will not succeed. - // Optionally, there could be both .db-shm and .db-wal. + // Optionally, there could be .db-wal. if !validate_repair_files(eid, &repair_files) { crucible_bail!( RepairFilesInvalid, @@ -1837,7 +1849,8 @@ impl Region { save_stream_to_file(extent_db, repair_stream.into_inner()).await?; // These next two are optional. - for opt_file in &[ExtentType::DbShm, ExtentType::DbWal] { + // YYY + for opt_file in &[ExtentType::DbWal] { let filename = extent_file_name(eid as u32, opt_file.clone()); if repair_files.contains(&filename) { @@ -2291,31 +2304,9 @@ pub fn move_replacement_extent>( } sync_path(&original_file, log)?; - // The .db-shm and .db-wal files may or may not exist. If they don't - // exist on the source side, then be sure to remove them locally to - // avoid database corruption from a mismatch between old and new. - new_file.set_extension("db-shm"); - original_file.set_extension("db-shm"); - if new_file.exists() { - if let Err(e) = std::fs::copy(new_file.clone(), original_file.clone()) { - crucible_bail!( - IoError, - "copy {:?} to {:?} got: {:?}", - new_file, - original_file, - e - ); - } - sync_path(&original_file, log)?; - } else if original_file.exists() { - info!( - log, - "Remove old file {:?} as there is no replacement", - original_file.clone() - ); - std::fs::remove_file(&original_file)?; - } - + // The .db-wal file may or may not exist. If they don't exist on the source + // side, then be sure to remove them locally to avoid database corruption + // from a mismatch between old and new. new_file.set_extension("db-wal"); original_file.set_extension("db-wal"); if new_file.exists() { @@ -2731,10 +2722,6 @@ mod test { dest_path.set_extension("db"); std::fs::copy(source_path.clone(), dest_path.clone())?; - source_path.set_extension("db-shm"); - dest_path.set_extension("db-shm"); - std::fs::copy(source_path.clone(), dest_path.clone())?; - source_path.set_extension("db-wal"); dest_path.set_extension("db-wal"); std::fs::copy(source_path.clone(), dest_path.clone())?; @@ -2765,11 +2752,11 @@ mod test { #[tokio::test] async fn reopen_extent_cleanup_replay_short() -> Result<()> { - // test move_replacement_extent(), create a copy dir, populate it - // and let the reopen do the work. This time we make sure our - // copy dir only has extent data and .db files, and not .db-shm - // nor .db-wal. Verify these files are delete from the original - // extent after the reopen has cleaned them up. + // test move_replacement_extent(), create a copy dir, populate it and + // let the reopen do the work. This time we make sure our copy dir only + // has extent data and .db files, and not .db-wal. + // Verify these files are delete from the original extent after the + // reopen has cleaned them up. // Create the region, make three extents let dir = tempdir()?; let mut region = @@ -2801,16 +2788,11 @@ mod test { let rd = replace_dir(&dir, 1); rename(cp.clone(), rd.clone())?; - // The close may remove the db-shm and db-wal files, manually - // create them here, just to verify they are removed after the - // reopen as they are not included in the files to be recovered - // and this test exists to verify they will be deleted. + // The close may remove the db-wal file, manually create them here, just + // to verify they are removed after the reopen as they are not included + // in the files to be recovered and this test exists to verify they will + // be deleted. let mut invalid_db = extent_path(&dir, 1); - invalid_db.set_extension("db-shm"); - println!("Recreate {:?}", invalid_db); - std::fs::copy(source_path.clone(), invalid_db.clone())?; - assert!(Path::new(&invalid_db).exists()); - invalid_db.set_extension("db-wal"); println!("Recreate {:?}", invalid_db); std::fs::copy(source_path.clone(), invalid_db.clone())?; @@ -2823,9 +2805,7 @@ mod test { // Reopen extent 1 region.reopen_extent(1).await?; - // Make sure there is no longer a db-shm and db-wal - dest_path.set_extension("db-shm"); - assert!(!Path::new(&dest_path).exists()); + // Make sure there is no longer a db-wal dest_path.set_extension("db-wal"); assert!(!Path::new(&dest_path).exists()); @@ -2906,7 +2886,6 @@ mod test { let good_files: Vec = vec![ "001".to_string(), "001.db".to_string(), - "001.db-shm".to_string(), "001.db-wal".to_string(), ]; @@ -2943,6 +2922,10 @@ mod test { #[test] fn validate_repair_files_quad_duplicate() { + // XXX I don't know why this test exists. we definitely shouldnt be + // involving shm in anything anymore. but what was this supposed to even + // be doing? idk. + // This is an expected list of files for an extent let good_files: Vec = vec![ "001".to_string(), @@ -2959,7 +2942,6 @@ mod test { let good_files: Vec = vec![ "001".to_string(), "001.db".to_string(), - "001.db-shm".to_string(), "001.db-wal".to_string(), ]; @@ -2971,18 +2953,6 @@ mod test { // Duplicate file in list let good_files: Vec = vec![ "001".to_string(), - "001".to_string(), - "001.db".to_string(), - "001.db-shm".to_string(), - "001.db-wal".to_string(), - ]; - assert!(!validate_repair_files(1, &good_files)); - } - - #[test] - fn validate_repair_files_not_good_enough() { - // We require 2 or 4 files, not 3 - let good_files: Vec = vec![ "001".to_string(), "001.db".to_string(), "001.db-wal".to_string(), @@ -3126,11 +3096,6 @@ mod test { assert_eq!(extent_file_name(4, ExtentType::Db), "004.db"); } - #[test] - fn extent_name_basic_ext_shm() { - assert_eq!(extent_file_name(4, ExtentType::DbShm), "004.db-shm"); - } - #[test] fn extent_name_basic_ext_wal() { assert_eq!(extent_file_name(4, ExtentType::DbWal), "004.db-wal"); diff --git a/downstairs/src/repair.rs b/downstairs/src/repair.rs index d0743f9db..05dc0dd2a 100644 --- a/downstairs/src/repair.rs +++ b/downstairs/src/repair.rs @@ -103,8 +103,6 @@ pub enum FileType { Data, #[serde(rename = "db")] Database, - #[serde(rename = "db_shm")] - DatabaseSharedMemory, #[serde(rename = "db_wal")] DatabaseLog, } @@ -131,11 +129,8 @@ async fn get_extent_file( FileType::Database => { extent_path.set_extension("db"); } - FileType::DatabaseSharedMemory => { - extent_path.set_extension("db-wal"); - } FileType::DatabaseLog => { - extent_path.set_extension("db-shm"); + extent_path.set_extension("db-wal"); } FileType::Data => (), }; @@ -235,7 +230,6 @@ async fn extent_file_list( let possible_files = vec![ (extent_file_name(eid, ExtentType::Data), true), (extent_file_name(eid, ExtentType::Db), true), - (extent_file_name(eid, ExtentType::DbShm), false), (extent_file_name(eid, ExtentType::DbWal), false), ]; @@ -289,7 +283,7 @@ mod test { let ed = extent_dir(&dir, 1); let mut ex_files = extent_file_list(ed, 1).await.unwrap(); ex_files.sort(); - let expected = vec!["001", "001.db", "001.db-shm", "001.db-wal"]; + let expected = vec!["001", "001.db", "001.db-wal"]; println!("files: {:?}", ex_files); assert_eq!(ex_files, expected); @@ -300,7 +294,7 @@ mod test { async fn extent_expected_files_short() -> Result<()> { // Verify that the list of files returned for an extent matches // what we expect. In this case we expect the extent data file and - // the .db file, but not the .db-shm or .db-wal database files. + // the .db file, but not the .db-wal database files. let dir = tempdir()?; let mut region = Region::create(&dir, new_region_options(), csl()).await?; @@ -309,13 +303,11 @@ mod test { // Determine the directory and name for expected extent files. let extent_dir = extent_dir(&dir, 1); - // Delete db-wal and db-shm + // Delete db-wal let mut rm_file = extent_dir.clone(); rm_file.push(extent_file_name(1, ExtentType::Data)); rm_file.set_extension("db-wal"); std::fs::remove_file(&rm_file).unwrap(); - rm_file.set_extension("db-shm"); - std::fs::remove_file(rm_file).unwrap(); let mut ex_files = extent_file_list(extent_dir, 1).await.unwrap(); ex_files.sort(); @@ -328,11 +320,10 @@ mod test { #[tokio::test] async fn extent_expected_files_short_with_close() -> Result<()> { - // Verify that the list of files returned for an extent matches - // what we expect. In this case we expect the extent data file and - // the .db file, but not the .db-shm or .db-wal database files. - // We close the extent here first, and on illumos that behaves - // a little different than elsewhere. + // Verify that the list of files returned for an extent matches what we + // expect. In this case we expect the extent data file and the .db file, + // but not the or .db-wal database files. We close the extent here + // first, and on illumos that behaves a little different than elsewhere. let dir = tempdir()?; let mut region = Region::create(&dir, new_region_options(), csl()).await?; @@ -344,14 +335,12 @@ mod test { // Determine the directory and name for expected extent files. let extent_dir = extent_dir(&dir, 1); - // Delete db-wal and db-shm. On illumos the close of the extent - // may remove these for us, so we ignore errors on the removal. + // Delete db-wal. On illumos the close of the extent may remove these + // for us, so we ignore errors on the removal. let mut rm_file = extent_dir.clone(); rm_file.push(extent_file_name(1, ExtentType::Data)); rm_file.set_extension("db-wal"); let _ = std::fs::remove_file(&rm_file); - rm_file.set_extension("db-shm"); - let _ = std::fs::remove_file(rm_file); let mut ex_files = extent_file_list(extent_dir, 1).await.unwrap(); ex_files.sort(); diff --git a/openapi/downstairs-repair.json b/openapi/downstairs-repair.json index a229d6077..d289d6934 100644 --- a/openapi/downstairs-repair.json +++ b/openapi/downstairs-repair.json @@ -120,7 +120,6 @@ "enum": [ "data", "db", - "db_shm", "db_wal" ] } From 301c37d597bd80db5833a7129f325ebf5de2984f Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Sun, 28 May 2023 07:07:30 +0000 Subject: [PATCH 2/7] meant to get this cleanup in there --- downstairs/src/region.rs | 51 ++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/downstairs/src/region.rs b/downstairs/src/region.rs index 2ba3a3a78..1afc06bec 100644 --- a/downstairs/src/region.rs +++ b/downstairs/src/region.rs @@ -1848,34 +1848,29 @@ impl Region { }; save_stream_to_file(extent_db, repair_stream.into_inner()).await?; - // These next two are optional. - // YYY - for opt_file in &[ExtentType::DbWal] { - let filename = extent_file_name(eid as u32, opt_file.clone()); - - if repair_files.contains(&filename) { - let extent_shm = extent.create_copy_file( - copy_dir.clone(), - Some(opt_file.clone()), - )?; - let repair_stream = match repair_server - .get_extent_file(eid as u32, opt_file.to_file_type()) - .await - { - Ok(rs) => rs, - Err(e) => { - crucible_bail!( - RepairRequestError, - "Failed to get extent {} {} file: {:?}", - eid, - opt_file, - e, - ); - } - }; - save_stream_to_file(extent_shm, repair_stream.into_inner()) - .await?; - } + // the WAL log isn't always there + let opt_file = ExtentType::DbWal; + let wal_filename = extent_file_name(eid as u32, opt_file.clone()); + + if repair_files.contains(&wal_filename) { + let extent_wal = extent + .create_copy_file(copy_dir.clone(), Some(opt_file.clone()))?; + let repair_stream = match repair_server + .get_extent_file(eid as u32, opt_file.to_file_type()) + .await + { + Ok(rs) => rs, + Err(e) => { + crucible_bail!( + RepairRequestError, + "Failed to get extent {} {} file: {:?}", + eid, + opt_file, + e, + ); + } + }; + save_stream_to_file(extent_wal, repair_stream.into_inner()).await?; } // After we have all files: move the repair dir. From ad6d00c5c448f9f905f15cf6d5a939b9fc114e86 Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Fri, 2 Jun 2023 09:38:13 +0000 Subject: [PATCH 3/7] fix up test_repair.sh --- tools/test_repair.sh | 53 ++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/tools/test_repair.sh b/tools/test_repair.sh index 8d67ba321..1079e19a6 100755 --- a/tools/test_repair.sh +++ b/tools/test_repair.sh @@ -140,17 +140,14 @@ for (( i = 0; i < 100; i += 1 )); do (( generation += 1)) echo "" - # Stop --lossy downstairs so it can't complete all its IOs - if [[ $choice -eq 0 ]]; then - kill "$ds0_pid" - wait "$ds0_pid" || true - elif [[ $choice -eq 1 ]]; then - kill "$ds1_pid" - wait "$ds1_pid" || true - else - kill "$ds2_pid" - wait "$ds2_pid" || true - fi + + # Stop everything before dump so dump can access the extent metadata + kill "$ds0_pid" + wait "$ds0_pid" || true + kill "$ds1_pid" + wait "$ds1_pid" || true + kill "$ds2_pid" + wait "$ds2_pid" || true # Did we get any mismatches? # We || true because dump will return non-zero when it finds @@ -160,17 +157,13 @@ for (( i = 0; i < 100; i += 1 )); do echo "On loop $i" echo "" - # Start downstairs without lossy - if [[ $choice -eq 0 ]]; then - ${cds} run -d "${testdir}/8810" -p 8810 &> "$ds_log_prefix"8810.txt & - ds0_pid=$! - elif [[ $choice -eq 1 ]]; then - ${cds} run -d "${testdir}/8820" -p 8820 &> "$ds_log_prefix"8820.txt & - ds1_pid=$! - else - ${cds} run -d "${testdir}/8830" -p 8830 &> "$ds_log_prefix"8830.txt & - ds2_pid=$! - fi + # Start everything back up, no lossy + ${cds} run -d "${testdir}/8810" -p 8810 &> "$ds_log_prefix"8810.txt & + ds0_pid=$! + ${cds} run -d "${testdir}/8820" -p 8820 &> "$ds_log_prefix"8820.txt & + ds1_pid=$! + ${cds} run -d "${testdir}/8830" -p 8830 &> "$ds_log_prefix"8830.txt & + ds2_pid=$! cp "$verify_file" ${verify_file}.last echo "Verifying data now" @@ -194,9 +187,25 @@ for (( i = 0; i < 100; i += 1 )); do set -o errexit (( generation += 1)) + # Need to shut down the downstairs before dumping due to the use of + # unix-excl sqlite VFS- only one process can access the metadata at a time. + kill "$ds0_pid" + wait "$ds0_pid" || true + kill "$ds1_pid" + wait "$ds1_pid" || true + kill "$ds2_pid" + wait "$ds2_pid" || true + echo "Loop: $i Downstairs dump after verify (and repair):" ${cds} dump ${dump_args[@]} + # Start everything back up + ${cds} run -d "${testdir}/8810" -p 8810 &> "$ds_log_prefix"8810.txt & + ds0_pid=$! + ${cds} run -d "${testdir}/8820" -p 8820 &> "$ds_log_prefix"8820.txt & + ds1_pid=$! + ${cds} run -d "${testdir}/8830" -p 8830 &> "$ds_log_prefix"8830.txt & + ds2_pid=$! done duration=$SECONDS From b70215e0253ef7957c96d16e5a134bfa85cfe0b5 Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Fri, 2 Jun 2023 10:13:55 +0000 Subject: [PATCH 4/7] fix test_up.sh --- tools/test_up.sh | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/tools/test_up.sh b/tools/test_up.sh index aa5b3ff4f..7ecba18ec 100755 --- a/tools/test_up.sh +++ b/tools/test_up.sh @@ -245,19 +245,21 @@ echo mv "${testdir}/previous" "${testdir}/${port}" rm -rf "${testdir:?}/${port:?}" mv "${testdir}/previous" "${testdir}/${port}" -echo "Restart downstairs with old directory" -if ! "$dsc" cmd start -c 2; then +# Put a dump test in the middle of the repair test, so we +# can see both a mismatch and that dump works. +# The dump args look different than other downstairs commands + +# shut down all downstairs so we can dump +echo "Shutting down all downstairs so we can dump" +if ! "$dsc" cmd stop-all; then (( res += 1 )) echo "" - echo "Failed repair test part 1, starting downstairs 2" - echo "Failed repair test part 1, starting downstairs 2" >> "$fail_log" + echo "Failed dsc stop-all" + echo "Failed dsc stop-all" >> "$fail_log" echo fi -# Put a dump test in the middle of the repair test, so we -# can see both a mismatch and that dump works. -# The dump args look different than other downstairs commands for (( i = 0; i < 30; i += 10 )); do (( port = port_base + i )) dir="${testdir}/$port" @@ -275,6 +277,16 @@ else echo "dump test found error as expected" fi + +echo "Restart downstairs with old directory" +if ! "$dsc" cmd start-all; then + (( res += 1 )) + echo "" + echo "Failed repair test part 1, starting downstairs" + echo "Failed repair test part 1, starting downstairs" >> "$fail_log" + echo +fi + echo "" echo "" echo "$ct" "$tt" -g "$gen" -q "${args[@]}" @@ -289,6 +301,16 @@ else fi (( gen += 1 )) +# shut down all downstairs so we can dump +echo "Shutting down downstairs for the last time" +if ! "$dsc" cmd shutdown; then + (( res += 1 )) + echo "" + echo "Failed dsc shutdown" + echo "Failed dsc shutdown" >> "$fail_log" + echo +fi + # Now that repair has finished, make sure the dump command does not detect # any errors echo "$cds" dump "${dump_args[@]}" @@ -325,16 +347,6 @@ else echo "dump block test passed" fi -# Tests done, shut down the downstairs. -echo "Upstairs tests have completed, stopping all downstairs" -if ! "$dsc" cmd shutdown; then - (( res += 1 )) - echo "" - echo "Failed dsc shutdown" - echo "Failed dsc shutdown" >> "$fail_log" - echo -fi - echo "" if [[ $res != 0 ]]; then echo "$res Tests have failed" From 876c1ae03bfcd25bfd12ff5dec16a180fd30461a Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Fri, 2 Jun 2023 10:17:24 +0000 Subject: [PATCH 5/7] more cleanup --- downstairs/src/region.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/downstairs/src/region.rs b/downstairs/src/region.rs index 1afc06bec..c55dc3685 100644 --- a/downstairs/src/region.rs +++ b/downstairs/src/region.rs @@ -485,15 +485,12 @@ fn open_sqlite_connection>(path: &P) -> Result { // locking altogether. // // See https://www.sqlite.org/vfs.html#standard_unix_vfses for more info. - // metadb.pragma_update(None, "", "")?; let metadb: Connection = Connection::open_with_flags_and_vfs( path, OpenFlags::default(), "unix-excl", )?; - // let metadb = Connection::open(path)?; - assert!(metadb.is_autocommit()); metadb.pragma_update(None, "journal_mode", "WAL")?; metadb.pragma_update(None, "synchronous", "FULL")?; From 57787bd5da21d54b5c9286d554a2791d56c1d3bc Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Fri, 2 Jun 2023 10:26:23 +0000 Subject: [PATCH 6/7] correct this comment --- downstairs/src/region.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/downstairs/src/region.rs b/downstairs/src/region.rs index c55dc3685..a0cc49014 100644 --- a/downstairs/src/region.rs +++ b/downstairs/src/region.rs @@ -474,15 +474,15 @@ pub fn validate_repair_files(eid: usize, files: &[String]) -> bool { /// Always open sqlite with journaling, and synchronous. /// Note: these pragma_updates are not durable fn open_sqlite_connection>(path: &P) -> Result { - // By default, sqlite uses these shm files for synchronization to support - // DB access from multiple processes. We don't need that at all in crucible; - // we are only going to access the DB from the single process. So we change - // it to unix-excl which prevents other processes from accessing the DB, and + // By default, sqlite uses these shm files for synchronization to support DB + // access from multiple processes. We don't need that at all in crucible; we + // are only going to access the DB from the single process. So we change it + // to unix-excl which prevents other processes from accessing the DB, and // keeps everything in the heap, but still allows multiple connections from // in-process. If we ultimately decide we don't want to hold multiple // connections per DB (and thus, we don't want any parallelism of SQLite - // operations within an extent), we could change this to unix-none to forego - // locking altogether. + // operations within an extent), we could additionally set the + // `locking_mode` pragma to EXCLUSIVE. // // See https://www.sqlite.org/vfs.html#standard_unix_vfses for more info. let metadb: Connection = Connection::open_with_flags_and_vfs( From 41c1efacc2a0da34ab8c7ce9644f2e025d035da9 Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Wed, 7 Jun 2023 01:29:54 +0000 Subject: [PATCH 7/7] open DB in immutable mode during read-only access --- downstairs/src/region.rs | 67 ++++++++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/downstairs/src/region.rs b/downstairs/src/region.rs index a0cc49014..1168f4d74 100644 --- a/downstairs/src/region.rs +++ b/downstairs/src/region.rs @@ -1,6 +1,7 @@ // Copyright 2023 Oxide Computer Company use std::collections::{HashMap, HashSet}; use std::convert::TryInto; +use std::ffi::OsString; use std::fmt; use std::fs::{rename, File, OpenOptions}; use std::io::{BufReader, IoSlice, IoSliceMut, Read, Seek, SeekFrom, Write}; @@ -473,7 +474,55 @@ pub fn validate_repair_files(eid: usize, files: &[String]) -> bool { /// Always open sqlite with journaling, and synchronous. /// Note: these pragma_updates are not durable -fn open_sqlite_connection>(path: &P) -> Result { +fn open_sqlite_connection>( + path: &P, + read_only: bool, +) -> Result { + // If we're opening a read-only extent, we need to + // Per http://www.sqlite.org/draft/wal.html#readonly , opening a DB on a + // read-only filesystem only works under one of three conditions: + // - The -shm and -wal files already exists and are readable + // - There is write permission on the directory containing the database so + // that the -shm and -wal files can be created. + // - The database connection is opened using the immutable query parameter. + // + // When we use the unix-excl VFS, there are no -shm files, so condition one + // will never be met. Condition two will never be met either if the FS is + // read-only. Frankly, I don't understand why condition two is enforced + // for the unix-excl mode, since it doesn't use shm files, and uses an OS + // file lock instead (not a write operation). But that's the reality we live + // in right now. + // + // Anyways, that leaves condition three, setting the immutable parameter. + // + // Per http://www.sqlite.org/draft/uri.html#uriimmutable, + // + // > The immutable query parameter is a boolean that signals to SQLite that + // > the underlying database file is held on read-only media and cannot be + // > modified, even by another process with elevated privileges. SQLite + // > always opens immutable database files read-only and it skips all file + // > locking and change detection on immutable database files. If this query + // > parameter (or the SQLITE_IOCAP_IMMUTABLE bit in xDeviceCharacteristics) + // > asserts that a database file is immutable and that file changes anyhow, + // > then SQLite might return incorrect query results and/or SQLITE_CORRUPT + // > errors. + // + // Something not great here is that it *skips all file locking*. So if we + // were to open an extent in immutable mode on a RW filesystem, another + // instance of crucible could then open the extent in RW mode. This would + // not result in data corruption, but could result in the read-only instance + // getting an incorrect view of the data as the RW-instance changes things + // out from under it. This should never happen! But maybe we should build + // in additional region-level locking to make sure this never happens. TODO + let path = if read_only { + let mut path_str = OsString::from("file:"); + path_str.push(path.as_ref().as_os_str()); + path_str.push("?immutable=1"); + PathBuf::from(path_str) + } else { + path.as_ref().to_owned() + }; + // By default, sqlite uses these shm files for synchronization to support DB // access from multiple processes. We don't need that at all in crucible; we // are only going to access the DB from the single process. So we change it @@ -485,11 +534,9 @@ fn open_sqlite_connection>(path: &P) -> Result { // `locking_mode` pragma to EXCLUSIVE. // // See https://www.sqlite.org/vfs.html#standard_unix_vfses for more info. - let metadb: Connection = Connection::open_with_flags_and_vfs( - path, - OpenFlags::default(), - "unix-excl", - )?; + let flags = OpenFlags::default(); + let metadb = + Connection::open_with_flags_and_vfs(&path, flags, "unix-excl")?; assert!(metadb.is_autocommit()); metadb.pragma_update(None, "journal_mode", "WAL")?; @@ -609,7 +656,7 @@ impl Extent { */ path.set_extension("db"); let metadb = - match open_sqlite_connection(&path) { + match open_sqlite_connection(&path, read_only) { Err(e) => { error!( log, @@ -712,12 +759,12 @@ impl Extent { let metadb = if Path::new(&seed).exists() { std::fs::copy(&seed, &path)?; - open_sqlite_connection(&path)? + open_sqlite_connection(&path, false)? } else { /* * Create the metadata db */ - let metadb = open_sqlite_connection(&path)?; + let metadb = open_sqlite_connection(&path, false)?; /* * Create tables and insert base data @@ -794,7 +841,7 @@ impl Extent { // Save it as DB seed std::fs::copy(&path, &seed)?; - open_sqlite_connection(&path)? + open_sqlite_connection(&path, false)? }; /*