diff --git a/downstairs/src/region.rs b/downstairs/src/region.rs index 844c7d1d2..1168f4d74 100644 --- a/downstairs/src/region.rs +++ b/downstairs/src/region.rs @@ -1,6 +1,7 @@ // Copyright 2023 Oxide Computer Company use std::collections::{HashMap, HashSet}; use std::convert::TryInto; +use std::ffi::OsString; use std::fmt; use std::fs::{rename, File, OpenOptions}; use std::io::{BufReader, IoSlice, IoSliceMut, Read, Seek, SeekFrom, Write}; @@ -11,7 +12,7 @@ use tokio::sync::{Mutex, MutexGuard}; use anyhow::{anyhow, bail, Result}; use futures::TryStreamExt; use nix::unistd::{sysconf, SysconfVar}; -use rusqlite::{params, Connection}; +use rusqlite::{params, Connection, OpenFlags}; use serde::{Deserialize, Serialize}; use tracing::instrument; @@ -324,7 +325,6 @@ impl Default for ExtentMeta { pub enum ExtentType { Data, Db, - DbShm, DbWal, } @@ -333,7 +333,6 @@ impl fmt::Display for ExtentType { match self { ExtentType::Data => Ok(()), ExtentType::Db => write!(f, "db"), - ExtentType::DbShm => write!(f, "db-shm"), ExtentType::DbWal => write!(f, "db-wal"), } } @@ -348,7 +347,6 @@ impl ExtentType { match self { ExtentType::Data => FileType::Data, ExtentType::Db => FileType::Db, - ExtentType::DbShm => FileType::DbShm, ExtentType::DbWal => FileType::DbWal, } } @@ -362,7 +360,7 @@ pub fn extent_file_name(number: u32, extent_type: ExtentType) -> String { ExtentType::Data => { format!("{:03X}", number & 0xFFF) } - ExtentType::Db | ExtentType::DbShm | ExtentType::DbWal => { + ExtentType::Db | ExtentType::DbWal => { format!("{:03X}.{}", number & 0xFFF, extent_type) } } @@ -456,7 +454,7 @@ pub fn remove_copy_cleanup_dir>(dir: P, eid: u32) -> Result<()> { /** * Validate a list of sorted repair files. - * There are either two or four files we expect to find, any more or less + * There are either two or three files we expect to find, any more or less * and we have a bad list. No duplicates. */ pub fn validate_repair_files(eid: usize, files: &[String]) -> bool { @@ -468,10 +466,7 @@ pub fn validate_repair_files(eid: usize, files: &[String]) -> bool { ]; let mut all = some.clone(); - all.extend(vec![ - extent_file_name(eid, ExtentType::DbShm), - extent_file_name(eid, ExtentType::DbWal), - ]); + all.extend(vec![extent_file_name(eid, ExtentType::DbWal)]); // Either we have some or all. files == some || files == all @@ -479,8 +474,69 @@ pub fn validate_repair_files(eid: usize, files: &[String]) -> bool { /// Always open sqlite with journaling, and synchronous. /// Note: these pragma_updates are not durable -fn open_sqlite_connection>(path: &P) -> Result { - let metadb = Connection::open(path)?; +fn open_sqlite_connection>( + path: &P, + read_only: bool, +) -> Result { + // If we're opening a read-only extent, we need to + // Per http://www.sqlite.org/draft/wal.html#readonly , opening a DB on a + // read-only filesystem only works under one of three conditions: + // - The -shm and -wal files already exists and are readable + // - There is write permission on the directory containing the database so + // that the -shm and -wal files can be created. + // - The database connection is opened using the immutable query parameter. + // + // When we use the unix-excl VFS, there are no -shm files, so condition one + // will never be met. Condition two will never be met either if the FS is + // read-only. Frankly, I don't understand why condition two is enforced + // for the unix-excl mode, since it doesn't use shm files, and uses an OS + // file lock instead (not a write operation). But that's the reality we live + // in right now. + // + // Anyways, that leaves condition three, setting the immutable parameter. + // + // Per http://www.sqlite.org/draft/uri.html#uriimmutable, + // + // > The immutable query parameter is a boolean that signals to SQLite that + // > the underlying database file is held on read-only media and cannot be + // > modified, even by another process with elevated privileges. SQLite + // > always opens immutable database files read-only and it skips all file + // > locking and change detection on immutable database files. If this query + // > parameter (or the SQLITE_IOCAP_IMMUTABLE bit in xDeviceCharacteristics) + // > asserts that a database file is immutable and that file changes anyhow, + // > then SQLite might return incorrect query results and/or SQLITE_CORRUPT + // > errors. + // + // Something not great here is that it *skips all file locking*. So if we + // were to open an extent in immutable mode on a RW filesystem, another + // instance of crucible could then open the extent in RW mode. This would + // not result in data corruption, but could result in the read-only instance + // getting an incorrect view of the data as the RW-instance changes things + // out from under it. This should never happen! But maybe we should build + // in additional region-level locking to make sure this never happens. TODO + let path = if read_only { + let mut path_str = OsString::from("file:"); + path_str.push(path.as_ref().as_os_str()); + path_str.push("?immutable=1"); + PathBuf::from(path_str) + } else { + path.as_ref().to_owned() + }; + + // By default, sqlite uses these shm files for synchronization to support DB + // access from multiple processes. We don't need that at all in crucible; we + // are only going to access the DB from the single process. So we change it + // to unix-excl which prevents other processes from accessing the DB, and + // keeps everything in the heap, but still allows multiple connections from + // in-process. If we ultimately decide we don't want to hold multiple + // connections per DB (and thus, we don't want any parallelism of SQLite + // operations within an extent), we could additionally set the + // `locking_mode` pragma to EXCLUSIVE. + // + // See https://www.sqlite.org/vfs.html#standard_unix_vfses for more info. + let flags = OpenFlags::default(); + let metadb = + Connection::open_with_flags_and_vfs(&path, flags, "unix-excl")?; assert!(metadb.is_autocommit()); metadb.pragma_update(None, "journal_mode", "WAL")?; @@ -600,7 +656,7 @@ impl Extent { */ path.set_extension("db"); let metadb = - match open_sqlite_connection(&path) { + match open_sqlite_connection(&path, read_only) { Err(e) => { error!( log, @@ -703,12 +759,12 @@ impl Extent { let metadb = if Path::new(&seed).exists() { std::fs::copy(&seed, &path)?; - open_sqlite_connection(&path)? + open_sqlite_connection(&path, false)? } else { /* * Create the metadata db */ - let metadb = open_sqlite_connection(&path)?; + let metadb = open_sqlite_connection(&path, false)?; /* * Create tables and insert base data @@ -785,7 +841,7 @@ impl Extent { // Save it as DB seed std::fs::copy(&path, &seed)?; - open_sqlite_connection(&path)? + open_sqlite_connection(&path, false)? }; /* @@ -1696,9 +1752,9 @@ impl Region { * from 012.replace dir * 7. Remove any files in extent dir that don't exist in replacing dir * For example, if the replacement extent has 012 and 012.db, but - * the current (bad) extent has 012 012.db 012.db-shm - * and 012.db-wal, we want to remove the 012.db-shm and 012.db-wal - * files when we replace 012 and 012.db with the new versions. + * the current (bad) extent has 012 012.db + * and 012.db-wal, we want to remove the 012.db-wal + * file when we replace 012 and 012.db with the new versions. * 8. fsync files after copying them (new location). * 9. fsync containing extent dir * 10. Rename 012.replace dir to 012.completed dir. @@ -1790,7 +1846,7 @@ impl Region { // The repair file list should always contain the extent data // file itself, and the .db file (metadata) for that extent. // Missing these means the repair will not succeed. - // Optionally, there could be both .db-shm and .db-wal. + // Optionally, there could be .db-wal. if !validate_repair_files(eid, &repair_files) { crucible_bail!( RepairFilesInvalid, @@ -1836,33 +1892,29 @@ impl Region { }; save_stream_to_file(extent_db, repair_stream.into_inner()).await?; - // These next two are optional. - for opt_file in &[ExtentType::DbShm, ExtentType::DbWal] { - let filename = extent_file_name(eid as u32, opt_file.clone()); + // the WAL log isn't always there + let opt_file = ExtentType::DbWal; + let wal_filename = extent_file_name(eid as u32, opt_file.clone()); - if repair_files.contains(&filename) { - let extent_shm = extent.create_copy_file( - copy_dir.clone(), - Some(opt_file.clone()), - )?; - let repair_stream = match repair_server - .get_extent_file(eid as u32, opt_file.to_file_type()) - .await - { - Ok(rs) => rs, - Err(e) => { - crucible_bail!( - RepairRequestError, - "Failed to get extent {} {} file: {:?}", - eid, - opt_file, - e, - ); - } - }; - save_stream_to_file(extent_shm, repair_stream.into_inner()) - .await?; - } + if repair_files.contains(&wal_filename) { + let extent_wal = extent + .create_copy_file(copy_dir.clone(), Some(opt_file.clone()))?; + let repair_stream = match repair_server + .get_extent_file(eid as u32, opt_file.to_file_type()) + .await + { + Ok(rs) => rs, + Err(e) => { + crucible_bail!( + RepairRequestError, + "Failed to get extent {} {} file: {:?}", + eid, + opt_file, + e, + ); + } + }; + save_stream_to_file(extent_wal, repair_stream.into_inner()).await?; } // After we have all files: move the repair dir. @@ -2291,31 +2343,9 @@ pub fn move_replacement_extent>( } sync_path(&original_file, log)?; - // The .db-shm and .db-wal files may or may not exist. If they don't - // exist on the source side, then be sure to remove them locally to - // avoid database corruption from a mismatch between old and new. - new_file.set_extension("db-shm"); - original_file.set_extension("db-shm"); - if new_file.exists() { - if let Err(e) = std::fs::copy(new_file.clone(), original_file.clone()) { - crucible_bail!( - IoError, - "copy {:?} to {:?} got: {:?}", - new_file, - original_file, - e - ); - } - sync_path(&original_file, log)?; - } else if original_file.exists() { - info!( - log, - "Remove old file {:?} as there is no replacement", - original_file.clone() - ); - std::fs::remove_file(&original_file)?; - } - + // The .db-wal file may or may not exist. If they don't exist on the source + // side, then be sure to remove them locally to avoid database corruption + // from a mismatch between old and new. new_file.set_extension("db-wal"); original_file.set_extension("db-wal"); if new_file.exists() { @@ -2731,10 +2761,6 @@ mod test { dest_path.set_extension("db"); std::fs::copy(source_path.clone(), dest_path.clone())?; - source_path.set_extension("db-shm"); - dest_path.set_extension("db-shm"); - std::fs::copy(source_path.clone(), dest_path.clone())?; - source_path.set_extension("db-wal"); dest_path.set_extension("db-wal"); std::fs::copy(source_path.clone(), dest_path.clone())?; @@ -2765,11 +2791,11 @@ mod test { #[tokio::test] async fn reopen_extent_cleanup_replay_short() -> Result<()> { - // test move_replacement_extent(), create a copy dir, populate it - // and let the reopen do the work. This time we make sure our - // copy dir only has extent data and .db files, and not .db-shm - // nor .db-wal. Verify these files are delete from the original - // extent after the reopen has cleaned them up. + // test move_replacement_extent(), create a copy dir, populate it and + // let the reopen do the work. This time we make sure our copy dir only + // has extent data and .db files, and not .db-wal. + // Verify these files are delete from the original extent after the + // reopen has cleaned them up. // Create the region, make three extents let dir = tempdir()?; let mut region = @@ -2801,16 +2827,11 @@ mod test { let rd = replace_dir(&dir, 1); rename(cp.clone(), rd.clone())?; - // The close may remove the db-shm and db-wal files, manually - // create them here, just to verify they are removed after the - // reopen as they are not included in the files to be recovered - // and this test exists to verify they will be deleted. + // The close may remove the db-wal file, manually create them here, just + // to verify they are removed after the reopen as they are not included + // in the files to be recovered and this test exists to verify they will + // be deleted. let mut invalid_db = extent_path(&dir, 1); - invalid_db.set_extension("db-shm"); - println!("Recreate {:?}", invalid_db); - std::fs::copy(source_path.clone(), invalid_db.clone())?; - assert!(Path::new(&invalid_db).exists()); - invalid_db.set_extension("db-wal"); println!("Recreate {:?}", invalid_db); std::fs::copy(source_path.clone(), invalid_db.clone())?; @@ -2823,9 +2844,7 @@ mod test { // Reopen extent 1 region.reopen_extent(1).await?; - // Make sure there is no longer a db-shm and db-wal - dest_path.set_extension("db-shm"); - assert!(!Path::new(&dest_path).exists()); + // Make sure there is no longer a db-wal dest_path.set_extension("db-wal"); assert!(!Path::new(&dest_path).exists()); @@ -2906,7 +2925,6 @@ mod test { let good_files: Vec = vec![ "001".to_string(), "001.db".to_string(), - "001.db-shm".to_string(), "001.db-wal".to_string(), ]; @@ -2943,6 +2961,10 @@ mod test { #[test] fn validate_repair_files_quad_duplicate() { + // XXX I don't know why this test exists. we definitely shouldnt be + // involving shm in anything anymore. but what was this supposed to even + // be doing? idk. + // This is an expected list of files for an extent let good_files: Vec = vec![ "001".to_string(), @@ -2959,7 +2981,6 @@ mod test { let good_files: Vec = vec![ "001".to_string(), "001.db".to_string(), - "001.db-shm".to_string(), "001.db-wal".to_string(), ]; @@ -2971,18 +2992,6 @@ mod test { // Duplicate file in list let good_files: Vec = vec![ "001".to_string(), - "001".to_string(), - "001.db".to_string(), - "001.db-shm".to_string(), - "001.db-wal".to_string(), - ]; - assert!(!validate_repair_files(1, &good_files)); - } - - #[test] - fn validate_repair_files_not_good_enough() { - // We require 2 or 4 files, not 3 - let good_files: Vec = vec![ "001".to_string(), "001.db".to_string(), "001.db-wal".to_string(), @@ -3126,11 +3135,6 @@ mod test { assert_eq!(extent_file_name(4, ExtentType::Db), "004.db"); } - #[test] - fn extent_name_basic_ext_shm() { - assert_eq!(extent_file_name(4, ExtentType::DbShm), "004.db-shm"); - } - #[test] fn extent_name_basic_ext_wal() { assert_eq!(extent_file_name(4, ExtentType::DbWal), "004.db-wal"); diff --git a/downstairs/src/repair.rs b/downstairs/src/repair.rs index d0743f9db..05dc0dd2a 100644 --- a/downstairs/src/repair.rs +++ b/downstairs/src/repair.rs @@ -103,8 +103,6 @@ pub enum FileType { Data, #[serde(rename = "db")] Database, - #[serde(rename = "db_shm")] - DatabaseSharedMemory, #[serde(rename = "db_wal")] DatabaseLog, } @@ -131,11 +129,8 @@ async fn get_extent_file( FileType::Database => { extent_path.set_extension("db"); } - FileType::DatabaseSharedMemory => { - extent_path.set_extension("db-wal"); - } FileType::DatabaseLog => { - extent_path.set_extension("db-shm"); + extent_path.set_extension("db-wal"); } FileType::Data => (), }; @@ -235,7 +230,6 @@ async fn extent_file_list( let possible_files = vec![ (extent_file_name(eid, ExtentType::Data), true), (extent_file_name(eid, ExtentType::Db), true), - (extent_file_name(eid, ExtentType::DbShm), false), (extent_file_name(eid, ExtentType::DbWal), false), ]; @@ -289,7 +283,7 @@ mod test { let ed = extent_dir(&dir, 1); let mut ex_files = extent_file_list(ed, 1).await.unwrap(); ex_files.sort(); - let expected = vec!["001", "001.db", "001.db-shm", "001.db-wal"]; + let expected = vec!["001", "001.db", "001.db-wal"]; println!("files: {:?}", ex_files); assert_eq!(ex_files, expected); @@ -300,7 +294,7 @@ mod test { async fn extent_expected_files_short() -> Result<()> { // Verify that the list of files returned for an extent matches // what we expect. In this case we expect the extent data file and - // the .db file, but not the .db-shm or .db-wal database files. + // the .db file, but not the .db-wal database files. let dir = tempdir()?; let mut region = Region::create(&dir, new_region_options(), csl()).await?; @@ -309,13 +303,11 @@ mod test { // Determine the directory and name for expected extent files. let extent_dir = extent_dir(&dir, 1); - // Delete db-wal and db-shm + // Delete db-wal let mut rm_file = extent_dir.clone(); rm_file.push(extent_file_name(1, ExtentType::Data)); rm_file.set_extension("db-wal"); std::fs::remove_file(&rm_file).unwrap(); - rm_file.set_extension("db-shm"); - std::fs::remove_file(rm_file).unwrap(); let mut ex_files = extent_file_list(extent_dir, 1).await.unwrap(); ex_files.sort(); @@ -328,11 +320,10 @@ mod test { #[tokio::test] async fn extent_expected_files_short_with_close() -> Result<()> { - // Verify that the list of files returned for an extent matches - // what we expect. In this case we expect the extent data file and - // the .db file, but not the .db-shm or .db-wal database files. - // We close the extent here first, and on illumos that behaves - // a little different than elsewhere. + // Verify that the list of files returned for an extent matches what we + // expect. In this case we expect the extent data file and the .db file, + // but not the or .db-wal database files. We close the extent here + // first, and on illumos that behaves a little different than elsewhere. let dir = tempdir()?; let mut region = Region::create(&dir, new_region_options(), csl()).await?; @@ -344,14 +335,12 @@ mod test { // Determine the directory and name for expected extent files. let extent_dir = extent_dir(&dir, 1); - // Delete db-wal and db-shm. On illumos the close of the extent - // may remove these for us, so we ignore errors on the removal. + // Delete db-wal. On illumos the close of the extent may remove these + // for us, so we ignore errors on the removal. let mut rm_file = extent_dir.clone(); rm_file.push(extent_file_name(1, ExtentType::Data)); rm_file.set_extension("db-wal"); let _ = std::fs::remove_file(&rm_file); - rm_file.set_extension("db-shm"); - let _ = std::fs::remove_file(rm_file); let mut ex_files = extent_file_list(extent_dir, 1).await.unwrap(); ex_files.sort(); diff --git a/openapi/downstairs-repair.json b/openapi/downstairs-repair.json index a229d6077..d289d6934 100644 --- a/openapi/downstairs-repair.json +++ b/openapi/downstairs-repair.json @@ -120,7 +120,6 @@ "enum": [ "data", "db", - "db_shm", "db_wal" ] } diff --git a/tools/test_repair.sh b/tools/test_repair.sh index 8d67ba321..1079e19a6 100755 --- a/tools/test_repair.sh +++ b/tools/test_repair.sh @@ -140,17 +140,14 @@ for (( i = 0; i < 100; i += 1 )); do (( generation += 1)) echo "" - # Stop --lossy downstairs so it can't complete all its IOs - if [[ $choice -eq 0 ]]; then - kill "$ds0_pid" - wait "$ds0_pid" || true - elif [[ $choice -eq 1 ]]; then - kill "$ds1_pid" - wait "$ds1_pid" || true - else - kill "$ds2_pid" - wait "$ds2_pid" || true - fi + + # Stop everything before dump so dump can access the extent metadata + kill "$ds0_pid" + wait "$ds0_pid" || true + kill "$ds1_pid" + wait "$ds1_pid" || true + kill "$ds2_pid" + wait "$ds2_pid" || true # Did we get any mismatches? # We || true because dump will return non-zero when it finds @@ -160,17 +157,13 @@ for (( i = 0; i < 100; i += 1 )); do echo "On loop $i" echo "" - # Start downstairs without lossy - if [[ $choice -eq 0 ]]; then - ${cds} run -d "${testdir}/8810" -p 8810 &> "$ds_log_prefix"8810.txt & - ds0_pid=$! - elif [[ $choice -eq 1 ]]; then - ${cds} run -d "${testdir}/8820" -p 8820 &> "$ds_log_prefix"8820.txt & - ds1_pid=$! - else - ${cds} run -d "${testdir}/8830" -p 8830 &> "$ds_log_prefix"8830.txt & - ds2_pid=$! - fi + # Start everything back up, no lossy + ${cds} run -d "${testdir}/8810" -p 8810 &> "$ds_log_prefix"8810.txt & + ds0_pid=$! + ${cds} run -d "${testdir}/8820" -p 8820 &> "$ds_log_prefix"8820.txt & + ds1_pid=$! + ${cds} run -d "${testdir}/8830" -p 8830 &> "$ds_log_prefix"8830.txt & + ds2_pid=$! cp "$verify_file" ${verify_file}.last echo "Verifying data now" @@ -194,9 +187,25 @@ for (( i = 0; i < 100; i += 1 )); do set -o errexit (( generation += 1)) + # Need to shut down the downstairs before dumping due to the use of + # unix-excl sqlite VFS- only one process can access the metadata at a time. + kill "$ds0_pid" + wait "$ds0_pid" || true + kill "$ds1_pid" + wait "$ds1_pid" || true + kill "$ds2_pid" + wait "$ds2_pid" || true + echo "Loop: $i Downstairs dump after verify (and repair):" ${cds} dump ${dump_args[@]} + # Start everything back up + ${cds} run -d "${testdir}/8810" -p 8810 &> "$ds_log_prefix"8810.txt & + ds0_pid=$! + ${cds} run -d "${testdir}/8820" -p 8820 &> "$ds_log_prefix"8820.txt & + ds1_pid=$! + ${cds} run -d "${testdir}/8830" -p 8830 &> "$ds_log_prefix"8830.txt & + ds2_pid=$! done duration=$SECONDS diff --git a/tools/test_up.sh b/tools/test_up.sh index aa5b3ff4f..7ecba18ec 100755 --- a/tools/test_up.sh +++ b/tools/test_up.sh @@ -245,19 +245,21 @@ echo mv "${testdir}/previous" "${testdir}/${port}" rm -rf "${testdir:?}/${port:?}" mv "${testdir}/previous" "${testdir}/${port}" -echo "Restart downstairs with old directory" -if ! "$dsc" cmd start -c 2; then +# Put a dump test in the middle of the repair test, so we +# can see both a mismatch and that dump works. +# The dump args look different than other downstairs commands + +# shut down all downstairs so we can dump +echo "Shutting down all downstairs so we can dump" +if ! "$dsc" cmd stop-all; then (( res += 1 )) echo "" - echo "Failed repair test part 1, starting downstairs 2" - echo "Failed repair test part 1, starting downstairs 2" >> "$fail_log" + echo "Failed dsc stop-all" + echo "Failed dsc stop-all" >> "$fail_log" echo fi -# Put a dump test in the middle of the repair test, so we -# can see both a mismatch and that dump works. -# The dump args look different than other downstairs commands for (( i = 0; i < 30; i += 10 )); do (( port = port_base + i )) dir="${testdir}/$port" @@ -275,6 +277,16 @@ else echo "dump test found error as expected" fi + +echo "Restart downstairs with old directory" +if ! "$dsc" cmd start-all; then + (( res += 1 )) + echo "" + echo "Failed repair test part 1, starting downstairs" + echo "Failed repair test part 1, starting downstairs" >> "$fail_log" + echo +fi + echo "" echo "" echo "$ct" "$tt" -g "$gen" -q "${args[@]}" @@ -289,6 +301,16 @@ else fi (( gen += 1 )) +# shut down all downstairs so we can dump +echo "Shutting down downstairs for the last time" +if ! "$dsc" cmd shutdown; then + (( res += 1 )) + echo "" + echo "Failed dsc shutdown" + echo "Failed dsc shutdown" >> "$fail_log" + echo +fi + # Now that repair has finished, make sure the dump command does not detect # any errors echo "$cds" dump "${dump_args[@]}" @@ -325,16 +347,6 @@ else echo "dump block test passed" fi -# Tests done, shut down the downstairs. -echo "Upstairs tests have completed, stopping all downstairs" -if ! "$dsc" cmd shutdown; then - (( res += 1 )) - echo "" - echo "Failed dsc shutdown" - echo "Failed dsc shutdown" >> "$fail_log" - echo -fi - echo "" if [[ $res != 0 ]]; then echo "$res Tests have failed"