From e117566eb7ff5218c7641658f27ce091ff8f1850 Mon Sep 17 00:00:00 2001
From: Guramrit Singh
Date: Thu, 20 Feb 2025 13:48:49 -0800
Subject: [PATCH 01/22] Fixing emulated.rs after rebase

---
 vm/devices/user_driver/src/emulated.rs | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/vm/devices/user_driver/src/emulated.rs b/vm/devices/user_driver/src/emulated.rs
index e3dbcbbac3..75b8f54569 100644
--- a/vm/devices/user_driver/src/emulated.rs
+++ b/vm/devices/user_driver/src/emulated.rs
@@ -224,6 +224,21 @@ impl DeviceSharedMemory {
             state: self.state.clone(),
         })
     }
+
+    // TODO: [nvme-keepalive-testing]
+    // This is only a stop-gap until we can swap out the back end of nvme tests to use real memory
+    pub fn alloc_specific(&self, len: usize, base_pfn: u64) -> Option<DmaBuffer> {
+        assert!(len % PAGE_SIZE == 0);
+        let count = len / PAGE_SIZE;
+        let start_page = base_pfn as usize;
+
+        let pages = (start_page..start_page + count).map(|p| p as u64).collect();
+        Some(DmaBuffer {
+            mem: self.mem.clone(),
+            pfns: pages,
+            state: self.state.clone(),
+        })
+    }
 }
 
 pub struct DmaBuffer {
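A quick illustration of the intended use (a hypothetical test, not part of the series; it assumes the DeviceSharedMemory, DmaBuffer, and PAGE_SIZE items defined in this file): alloc_specific simply fabricates a DmaBuffer over the requested page range, so the returned PFNs are exactly base_pfn..base_pfn + len / PAGE_SIZE, with no bookkeeping yet of whether those pages are already in use (patch 19 below tightens this).

    // Hypothetical test; a child `mod` in the same file can read the
    // private `pfns` field because Rust privacy is module-scoped.
    #[cfg(test)]
    mod alloc_specific_sketch {
        use super::*;

        #[test]
        fn returns_the_requested_pfn_range() {
            let shared = DeviceSharedMemory::new(16 * PAGE_SIZE, 0);
            let buf = shared
                .alloc_specific(2 * PAGE_SIZE, 3)
                .expect("two pages starting at PFN 3");
            assert_eq!(buf.pfns, vec![3, 4]);
        }
    }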
From c640a002bdb6f6c9c1ce625be0f9bd76709da9cb Mon Sep 17 00:00:00 2001
From: Guramrit Singh
Date: Thu, 6 Feb 2025 14:41:16 -0800
Subject: [PATCH 02/22] Added skeleton for testing objects after restore

---
 .../disk_nvme/nvme_driver/src/driver.rs |  6 ++++
 .../disk_nvme/nvme_driver/src/tests.rs  | 30 +++++++++++++++----
 vm/devices/user_driver/src/emulated.rs  |  5 ++--
 3 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
index f912ff7960..193d36ba68 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
@@ -683,6 +683,12 @@ impl NvmeDriver {
     pub fn update_servicing_flags(&mut self, nvme_keepalive: bool) {
         self.nvme_keepalive = nvme_keepalive;
     }
+
+    /// Given an input of the saved state from which the driver was constructed and the underlying
+    /// memory, this validates the current driver.
+    pub(crate) async fn verify_restore(&mut self, saved_state: NvmeDriverSavedState, mem: MemoryBlock) -> anyhow::Result<()> {
+        Ok(())
+    }
 }
 
 async fn handle_asynchronous_events(
diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
index 0cdd4a30ce..5e3566bb65 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
@@ -233,6 +233,7 @@ async fn test_nvme_driver(driver: DefaultDriver, allow_dma: bool) {
 }
 
 async fn test_nvme_save_restore_inner(driver: DefaultDriver) {
+    // ===== SHARED RESOURCES =====
     const MSIX_COUNT: u16 = 2;
     const IO_QUEUE_COUNT: u16 = 64;
     const CPU_COUNT: u32 = 64;
@@ -261,12 +262,16 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) {
         .await
         .unwrap();
 
-    let device = EmulatedDevice::new(nvme_ctrl, msi_x, mem);
+    // ===== FIRST DRIVER INIT =====
+    let device = EmulatedDevice::new(nvme_ctrl, msi_x, mem.clone());
     let mut nvme_driver = NvmeDriver::new(&driver_source, CPU_COUNT, device)
         .await
        .unwrap();
     let _ns1 = nvme_driver.namespace(1).await.unwrap();
     let saved_state = nvme_driver.save().await.unwrap();
+
+    // Tear down the original driver to kill the underlying tasks.
+    nvme_driver.shutdown().await;
     // As of today we do not save namespace data to avoid possible conflict
     // when namespace has changed during servicing.
     // TODO: Review and re-enable in future.
@@ -299,11 +304,24 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) {
     // Wait for CSTS.RDY to set.
     backoff.back_off().await;
 
-    let _new_device = EmulatedDevice::new(new_nvme_ctrl, new_msi_x, new_emu_mem);
-    // TODO: Memory restore is disabled for emulated DMA, uncomment once fixed.
-    // let _new_nvme_driver = NvmeDriver::restore(&driver_source, CPU_COUNT, new_device, &saved_state)
-    //     .await
-    //     .unwrap();
+    // ===== SECOND DRIVER INIT =====
+    let _new_device = EmulatedDevice::new(new_nvme_ctrl, new_msi_x, mem.clone());
+    let mut new_nvme_driver = NvmeDriver::restore(&driver_source, CPU_COUNT, _new_device, &saved_state)
+        .await
+        .unwrap();
+
+
+    // ===== VERIFY RESTORE =====
+    // TODO: [nvme-keepalive-testing] We should be byte-by-byte copying this memory before
+    // passing it in. Reserving this for when we swap out the back end to use actual memory instead
+    // of emulated memory.
+    // TODO: [nvme-keepalive-testing] Do not need to use "allocate dma buffer" once we have actual
+    // memory backing
+    let host_allocator = EmulatedDmaAllocator::new(mem.clone());
+    let verify_mem = DmaClient::attach_dma_buffer(&host_allocator, base_len, 0).unwrap();
+
+    let verify = new_nvme_driver.verify_restore(saved_state, verify_mem).await;
+    assert_eq!(verify.unwrap(), ());
 }
 
 #[derive(Inspect)]
diff --git a/vm/devices/user_driver/src/emulated.rs b/vm/devices/user_driver/src/emulated.rs
index 75b8f54569..6a592e2f6f 100644
--- a/vm/devices/user_driver/src/emulated.rs
+++ b/vm/devices/user_driver/src/emulated.rs
@@ -292,8 +292,9 @@ impl DmaClient for EmulatedDmaAllocator {
         Ok(memory)
     }
 
-    fn attach_dma_buffer(&self, _len: usize, _base_pfn: u64) -> anyhow::Result<MemoryBlock> {
-        anyhow::bail!("restore is not supported for emulated DMA")
+    fn attach_dma_buffer(&self, len: usize, base_pfn: u64) -> anyhow::Result<MemoryBlock> {
+        let memory = MemoryBlock::new(self.shared_mem.alloc_specific(len, base_pfn).context("could not alloc specific. out of memory")?);
+        Ok(memory)
     }
 }

From d7ab79a1c09494da270ee87cce5c69a7966efa0e Mon Sep 17 00:00:00 2001
From: Guramrit Singh
Date: Thu, 6 Feb 2025 16:05:57 -0800
Subject: [PATCH 03/22] Now checking the device_id and the nvme_keepalive values upon restore

---
 .../disk_nvme/nvme_driver/src/driver.rs | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
index 193d36ba68..7decee31a5 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
@@ -686,7 +686,29 @@ impl NvmeDriver {
 
     /// Given an input of the saved state from which the driver was constructed and the underlying
     /// memory, this validates the current driver.
+    #[cfg(test)]
     pub(crate) async fn verify_restore(&mut self, saved_state: NvmeDriverSavedState, mem: MemoryBlock) -> anyhow::Result<()> {
+        // Going in order of variables defined in the NvmeDriver struct
+        if saved_state.device_id != self.device_id {
+            anyhow::bail!(format!("incorrect device_id after restore. Expected: {} Actual: {}", saved_state.device_id, self.device_id));
+        }
+
+        // TODO: [expand-verify-restore-functionality]
+        // saved_state.identify_ctrl.verify_restore(self.identify)?;
+
+        // TODO: [expand-verify-restore-functionality] Namespace save is currently not supported.
+        // if saved_state.namespaces.len() != self.namespaces.len() {
+        //     return Err(format!("number of namespaces after restore is incorrect. Expected: {} Actual: {}", saved_state.namespaces.len(), self.namespaces.len()));
+        // }
+
+        // for i in 0..saved_state.namespaces.len() {
+        //     saved_state.namespaces[i].verify_restore(self.namespaces[i])?;
+        // }
+
+        if !self.nvme_keepalive {
+            anyhow::bail!(format!("incorrect nvme_keepalive value after restore. Expected: {} Actual: {}", true, self.nvme_keepalive));
+        }
+
         Ok(())
     }
 }
From 5c16819413a5bc41f21e403de33483996167b9ff Mon Sep 17 00:00:00 2001
From: Guramrit Singh
Date: Thu, 6 Feb 2025 18:23:59 -0800
Subject: [PATCH 04/22] Added verify_restore for queue pair. Incomplete

---
 .../disk_nvme/nvme_driver/src/driver.rs     | 31 ++++++++++++--
 .../disk_nvme/nvme_driver/src/queue_pair.rs | 41 +++++++++++++++++++
 2 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
index 7decee31a5..986d3eaf4a 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
@@ -688,13 +688,38 @@ impl NvmeDriver {
     #[cfg(test)]
     pub(crate) async fn verify_restore(&mut self, saved_state: NvmeDriverSavedState, mem: MemoryBlock) -> anyhow::Result<()> {
-        // Going in order of variables defined in the NvmeDriver struct
+        if let Some(task) = self.task.as_mut() {
+            task.stop().await;
+            let worker = task.task();
+
+            // Verify Admin Queue
+            // TODO: [expand-verify-restore-functionality] Currently providing base_pfn value in u64, this might panic
+            let admin_queue_match: Result<(), anyhow::Error> = match (saved_state.worker_data.admin, &worker.admin) {
+                (None, None) => Ok(()),
+                (Some(admin_saved_state), Some(admin)) => {
+                    let admin_saved_mem = mem.subblock(admin_saved_state.base_pfn.try_into().unwrap(), admin_saved_state.mem_len);
+                    return admin.verify_restore(admin_saved_state, admin_saved_mem).await;
+                },
+                _ => anyhow::bail!("admin queue states do not match"),
+            };
+
+            if let Err(_) = admin_queue_match {
+                task.start();
+                admin_queue_match?;
+            }
+
+            // TODO: Verify I/O Queues with strict ordering
+            task.start();
+        } else {
+            anyhow::bail!("task cannot be None() after restore");
+        }
+
         if saved_state.device_id != self.device_id {
             anyhow::bail!(format!("incorrect device_id after restore. Expected: {} Actual: {}", saved_state.device_id, self.device_id));
         }
 
         // TODO: [expand-verify-restore-functionality]
-        // saved_state.identify_ctrl.verify_restore(self.identify)?;
+        // self.identify.verify_restore(saved_state.identify_ctrl)?;
 
         // TODO: [expand-verify-restore-functionality] Namespace save is currently not supported.
         // if saved_state.namespaces.len() != self.namespaces.len() {
@@ -702,7 +727,7 @@ impl NvmeDriver {
         // }
 
         // for i in 0..saved_state.namespaces.len() {
-        //     saved_state.namespaces[i].verify_restore(self.namespaces[i])?;
+        //     self.namespaces[i].verify_restore(saved_state.namespaces[i])?;
         // }
 
         if !self.nvme_keepalive {
diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
index 5f44159d4f..8903edcd75 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
@@ -324,6 +324,47 @@ impl QueuePair {
             Some(handler_data),
         )
     }
+
+    /// Given the saved state of a queue pair, this verifies the constructed queue pair.
+    /// Input memory block should already be constructed from the offsets.
+    #[cfg(test)]
+    pub(crate) async fn verify_restore(&self, saved_state: QueuePairSavedState, saved_mem: MemoryBlock) -> anyhow::Result<()> {
+        // TODO: Verify restore for the QueueHandler
+        // TODO: Do we need to verify_restore for cancel?
+        // TODO: Do we need to verify_restore for issuers?
+        // Verify bytes from memory
+        let mut saved_mem_data: [u8; PAGE_SIZE] = [0; PAGE_SIZE];
+        let mut self_mem_data: [u8; PAGE_SIZE] = [0; PAGE_SIZE];
+
+        if saved_mem.len() != self.mem.len() {
+            anyhow::bail!(format!("mem length mismatch. Expected: {}, Actual: {}", saved_mem.len(), self.mem.len()));
+        }
+
+        for pfn in 0..(saved_mem.len() / PAGE_SIZE) {
+            saved_mem.read_at(pfn * PAGE_SIZE, &mut saved_mem_data);
+            self.mem.read_at(pfn * PAGE_SIZE, &mut self_mem_data);
+
+            for i in 0..PAGE_SIZE {
+                if saved_mem_data[i] != self_mem_data[i] {
+                    anyhow::bail!(format!("memory mismatched at offset {}. Expected: {}, Actual: {}", pfn * PAGE_SIZE + i, saved_mem_data[i], self_mem_data[i]));
+                }
+            }
+        }
+
+        if saved_state.qid != self.qid {
+            anyhow::bail!(format!("qid did not match. Expected: {}, Actual: {}", saved_state.qid, self.qid));
+        }
+
+        if saved_state.sq_entries != self.sq_entries {
+            anyhow::bail!(format!("sq_entries did not match. Expected: {}, Actual: {}", saved_state.sq_entries, self.sq_entries));
+        }
+
+        if saved_state.cq_entries != self.cq_entries {
+            anyhow::bail!(format!("cq_entries did not match. Expected: {}, Actual: {}", saved_state.cq_entries, self.cq_entries));
+        }
+
+        Ok(())
+    }
 }
 
 /// An error issuing an NVMe request.

From 63589d6258d743c1a02830f3d3a94fee90f0ab8d Mon Sep 17 00:00:00 2001
From: Guramrit Singh
Date: Thu, 6 Feb 2025 20:14:36 -0800
Subject: [PATCH 05/22] Checking the state of the QueueHandler after reset, though the check currently returns a "not implemented" error

---
 .../disk_nvme/nvme_driver/src/queue_pair.rs | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
index 8903edcd75..50744a9364 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
@@ -329,7 +329,10 @@ impl QueuePair {
     /// Input memory block should already be constructed from the offsets.
     #[cfg(test)]
     pub(crate) async fn verify_restore(&self, saved_state: QueuePairSavedState, saved_mem: MemoryBlock) -> anyhow::Result<()> {
-        // TODO: Verify restore for the QueueHandler
+        // Entire memory region is checked below. No need for the handler to check it again.
+        // Send an RPC request to QueueHandler thread to verify the restore status.
+        self.issuer.send.call(Req::Verify, saved_state.handler_data).await??;
+
         // TODO: Do we need to verify_restore for cancel?
         // TODO: Do we need to verify_restore for issuers?
         // Verify bytes from memory
@@ -624,6 +627,8 @@ enum Req {
     Command(Rpc<nvme_spec::Command, nvme_spec::Completion>),
     Inspect(inspect::Deferred),
     Save(Rpc<(), Result<QueueHandlerSavedState, anyhow::Error>>),
+    #[cfg(test)]
+    Verify(Rpc<QueueHandlerSavedState, anyhow::Result<()>>),
 }
 
 #[derive(Inspect)]
@@ -709,6 +714,10 @@ impl QueueHandler {
                         // Do not allow any more processing after save completed.
                         break;
                     }
+                    #[cfg(test)]
+                    Req::Verify(verify_state) => {
+                        verify_state.complete(Err(anyhow::Error::msg("not implemented")));
+                    }
                 },
                 Event::Completion(completion) => {
                     assert_eq!(completion.sqid, self.sq.id());
@@ -757,6 +766,12 @@ impl QueueHandler {
             drain_after_restore: sq_state.sqid != 0 && !pending_cmds.commands.is_empty(),
         })
     }
+
+    /// Given the QueueHandlerSavedState, it verifies the constructed handler.
+    #[cfg(test)]
+    pub(crate) fn verify_restore(saved_state: QueueHandlerSavedState) -> anyhow::Result<()> {
+        anyhow::bail!("verify_restore not yet implemented for the QueueHandler");
+    }
 }
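The Req::Verify variant added above follows the same request/response shape as the existing Save request: the caller sends an input plus an implicit completion channel, and the QueueHandler task completes it with a verdict. A rough std-only analogue of that flow, with mpsc channels standing in for mesh_channel's Rpc (every name in this sketch is invented):

    use std::sync::mpsc;
    use std::thread;

    enum Req {
        Verify {
            state: u32,
            respond: mpsc::Sender<Result<(), String>>,
        },
    }

    fn main() {
        let (send, recv) = mpsc::channel();

        let handler = thread::spawn(move || {
            // The handler task owns the queue state, so verification runs
            // here and only the verdict crosses the channel.
            while let Ok(Req::Verify { state, respond }) = recv.recv() {
                let verdict = if state == 42 { Ok(()) } else { Err("mismatch".to_string()) };
                let _ = respond.send(verdict);
            }
        });

        let (reply_send, reply_recv) = mpsc::channel();
        send.send(Req::Verify { state: 42, respond: reply_send }).unwrap();
        assert_eq!(reply_recv.recv().unwrap(), Ok(()));

        drop(send); // closing the request channel ends the handler loop
        handler.join().unwrap();
    }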
From d803a559b62ff2d88148be7b8f088eb7c804253a Mon Sep 17 00:00:00 2001
From: Guramrit Singh
Date: Thu, 6 Feb 2025 21:32:46 -0800
Subject: [PATCH 06/22] Added verify restore functions for submission queue, completion queue, and pending commands. These are now invoked via an RPC call

---
 .../disk_nvme/nvme_driver/src/queue_pair.rs | 22 ++++++++++++++++++-
 .../disk_nvme/nvme_driver/src/queues.rs     | 13 +++++++++++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
index 50744a9364..9323a70dcd 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
@@ -163,6 +163,12 @@ impl PendingCommands {
             next_cid_high_bits: Wrapping(*next_cid_high_bits),
         })
     }
+
+    /// Given the saved state, verifies the state of the PendingCommands to match the saved state
+    #[cfg(test)]
+    pub fn verify_restore(&self, saved_state: PendingCommandsSavedState) -> anyhow::Result<()> {
+        anyhow::bail!("verify restore for PendingCommands not implemented");
+    }
 }
 
 impl QueuePair {
@@ -716,7 +722,21 @@ impl QueueHandler {
                     }
                     #[cfg(test)]
                     Req::Verify(verify_state) => {
-                        verify_state.complete(Err(anyhow::Error::msg("not implemented")));
+                        let saved_state = verify_state.input();
+
+                        let sq_verify = self.sq.verify_restore(saved_state.sq_state.clone());
+                        let cq_verify = self.cq.verify_restore(saved_state.cq_state.clone());
+                        let pending_cmds_verify = self.commands.verify_restore(saved_state.pending_cmds.clone());
+
+                        if let Err(_) = sq_verify {
+                            verify_state.complete(sq_verify);
+                        } else if let Err(_) = cq_verify {
+                            verify_state.complete(cq_verify);
+                        } else if let Err(_) = pending_cmds_verify {
+                            verify_state.complete(pending_cmds_verify);
+                        } else {
+                            verify_state.complete(Ok(()));
+                        }
                     }
                 },
                 Event::Completion(completion) => {
diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs
index 049aa7bba2..84fba347aa 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs
@@ -106,6 +106,13 @@ impl SubmissionQueue {
             mem,
         })
     }
+
+    /// Given the saved state, checks the state of the submission queue. Does not verify memory.
+    // TODO: Can this be an associated function instead?
+    #[cfg(test)]
+    pub fn verify_restore(&self, saved_state: SubmissionQueueSavedState) -> anyhow::Result<()> {
+        anyhow::bail!("verify restore for Submission Queue not implemented yet");
+    }
 }
 
 #[derive(Inspect)]
@@ -201,6 +208,12 @@ impl CompletionQueue {
             mem,
         })
     }
+
+    /// Given the saved state, checks the state of the completion queue. Does not verify memory.
+    #[cfg(test)]
+    pub fn verify_restore(&self, saved_state: CompletionQueueSavedState) -> anyhow::Result<()> {
+        anyhow::bail!("verify restore for CompletionQueue not implemented yet");
+    }
 }
 
 fn advance(n: u32, l: u32) -> u32 {
From c8b6b3d63f2055026a1fc4bee9ed8c1470aebc74 Mon Sep 17 00:00:00 2001
From: Guramrit Singh
Date: Fri, 7 Feb 2025 11:05:09 -0800
Subject: [PATCH 07/22] Added verify restore for Submission and Completion queues

---
 .../disk_nvme/nvme_driver/src/queues.rs | 44 ++++++++++++++++++-
 1 file changed, 42 insertions(+), 2 deletions(-)

diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs
index 84fba347aa..52b84fbf90 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs
@@ -111,7 +111,27 @@ impl SubmissionQueue {
     // TODO: Can this be an associated function instead?
     #[cfg(test)]
     pub fn verify_restore(&self, saved_state: SubmissionQueueSavedState) -> anyhow::Result<()> {
-        anyhow::bail!("verify restore for Submission Queue not implemented yet");
+        if saved_state.sqid != self.sqid {
+            anyhow::bail!(format!("sqid for the submission queue did not match. Expected: {}, Actual: {}", saved_state.sqid, self.sqid));
+        }
+
+        if saved_state.head != self.head {
+            anyhow::bail!(format!("head for the submission queue did not match. Expected: {}, Actual: {}", saved_state.head, self.head));
+        }
+
+        if saved_state.tail != self.tail {
+            anyhow::bail!(format!("tail for the submission queue did not match. Expected: {}, Actual: {}", saved_state.tail, self.tail));
+        }
+
+        if saved_state.committed_tail != self.committed_tail {
+            anyhow::bail!(format!("committed_tail for the submission queue did not match. Expected: {}, Actual: {}", saved_state.committed_tail, self.committed_tail));
+        }
+
+        if saved_state.len != self.len {
+            anyhow::bail!(format!("len for the submission queue did not match. Expected: {}, Actual: {}", saved_state.len, self.len));
+        }
+
+        Ok(())
     }
 }
 
@@ -212,7 +232,27 @@ impl CompletionQueue {
     /// Given the saved state, checks the state of the completion queue. Does not verify memory.
     #[cfg(test)]
     pub fn verify_restore(&self, saved_state: CompletionQueueSavedState) -> anyhow::Result<()> {
-        anyhow::bail!("verify restore for CompletionQueue not implemented yet");
+        if saved_state.cqid != self.cqid {
+            anyhow::bail!(format!("cqid for the completion queue did not match. Expected: {}, Actual: {}", saved_state.cqid, self.cqid));
+        }
+
+        if saved_state.head != self.head {
+            anyhow::bail!(format!("head for the completion queue did not match. Expected: {}, Actual: {}", saved_state.head, self.head));
+        }
+
+        if saved_state.committed_head != self.committed_head {
+            anyhow::bail!(format!("committed_head for the completion queue did not match. Expected: {}, Actual: {}", saved_state.committed_head, self.committed_head));
+        }
+
+        if saved_state.len != self.len {
+            anyhow::bail!(format!("len for the completion queue did not match. Expected: {}, Actual: {}", saved_state.len, self.len));
+        }
+
+        if saved_state.phase != self.phase {
+            anyhow::bail!(format!("phase for the completion queue did not match. Expected: {}, Actual: {}", saved_state.phase, self.phase));
+        }
+
+        Ok(())
     }
 }
From cd9249591da7e990de4c083da8940e89a6f5596d Mon Sep 17 00:00:00 2001
From: Guramrit Singh
Date: Fri, 7 Feb 2025 12:16:39 -0800
Subject: [PATCH 08/22] Moved to using panics and asserts instead of the Result return types

---
 .../disk_nvme/nvme_driver/src/driver.rs     | 21 ++-----
 .../disk_nvme/nvme_driver/src/queue_pair.rs | 56 +++++--------------
 .../disk_nvme/nvme_driver/src/queues.rs     | 56 ++++---------------
 .../disk_nvme/nvme_driver/src/tests.rs      |  4 +-
 4 files changed, 32 insertions(+), 105 deletions(-)

diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
index 986d3eaf4a..c2163fdd77 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
@@ -687,7 +687,7 @@ impl NvmeDriver {
     /// Given an input of the saved state from which the driver was constructed and the underlying
     /// memory, this validates the current driver.
     #[cfg(test)]
-    pub(crate) async fn verify_restore(&mut self, saved_state: NvmeDriverSavedState, mem: MemoryBlock) -> anyhow::Result<()> {
+    pub(crate) async fn verify_restore(&mut self, saved_state: NvmeDriverSavedState, mem: MemoryBlock) {
         if let Some(task) = self.task.as_mut() {
             task.stop().await;
             let worker = task.task();
 
             // Verify Admin Queue
             // TODO: [expand-verify-restore-functionality] Currently providing base_pfn value in u64, this might panic
             let admin_queue_match: Result<(), anyhow::Error> = match (saved_state.worker_data.admin, &worker.admin) {
                 (None, None) => Ok(()),
                 (Some(admin_saved_state), Some(admin)) => {
                     let admin_saved_mem = mem.subblock(admin_saved_state.base_pfn.try_into().unwrap(), admin_saved_state.mem_len);
                     return admin.verify_restore(admin_saved_state, admin_saved_mem).await;
                 },
-                _ => anyhow::bail!("admin queue states do not match"),
+                _ => panic!("admin queue states do not match"),
             };
 
-            if let Err(_) = admin_queue_match {
-                task.start();
-                admin_queue_match?;
-            }
-
             // TODO: Verify I/O Queues with strict ordering
             task.start();
         } else {
-            anyhow::bail!("task cannot be None() after restore");
+            panic!("task cannot be None() after restore");
         }
 
-        if saved_state.device_id != self.device_id {
-            anyhow::bail!(format!("incorrect device_id after restore. Expected: {} Actual: {}", saved_state.device_id, self.device_id));
-        }
+        assert_eq!(saved_state.device_id, self.device_id);
 
         // TODO: [expand-verify-restore-functionality]
         // self.identify.verify_restore(saved_state.identify_ctrl)?;
@@ -702,7 +727,7 @@ impl NvmeDriver {
         //     self.namespaces[i].verify_restore(saved_state.namespaces[i])?;
         // }
 
-        if !self.nvme_keepalive {
-            anyhow::bail!(format!("incorrect nvme_keepalive value after restore. Expected: {} Actual: {}", true, self.nvme_keepalive));
-        }
-
-        Ok(())
+        assert!(self.nvme_keepalive);
     }
 }
diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
index 9323a70dcd..250b7d5176 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
@@ -166,8 +166,8 @@ impl PendingCommands {
     /// Given the saved state, verifies the state of the PendingCommands to match the saved state
     #[cfg(test)]
-    pub fn verify_restore(&self, saved_state: PendingCommandsSavedState) -> anyhow::Result<()> {
-        anyhow::bail!("verify restore for PendingCommands not implemented");
+    pub(crate) fn verify_restore(&self, saved_state: PendingCommandsSavedState) {
+        panic!("verify restore for PendingCommands not implemented");
     }
 }
 
@@ -334,10 +334,10 @@ impl QueuePair {
     /// Given the saved state of a queue pair, this verifies the constructed queue pair.
     /// Input memory block should already be constructed from the offsets.
     #[cfg(test)]
-    pub(crate) async fn verify_restore(&self, saved_state: QueuePairSavedState, saved_mem: MemoryBlock) -> anyhow::Result<()> {
+    pub(crate) async fn verify_restore(&self, saved_state: QueuePairSavedState, saved_mem: MemoryBlock) {
         // Entire memory region is checked below. No need for the handler to check it again.
         // Send an RPC request to QueueHandler thread to verify the restore status.
-        self.issuer.send.call(Req::Verify, saved_state.handler_data).await??;
+        self.issuer.send.call(Req::Verify, saved_state.handler_data).await;
 
         // TODO: Do we need to verify_restore for cancel?
         // TODO: Do we need to verify_restore for issuers?
         // Verify bytes from memory
         let mut saved_mem_data: [u8; PAGE_SIZE] = [0; PAGE_SIZE];
         let mut self_mem_data: [u8; PAGE_SIZE] = [0; PAGE_SIZE];
 
-        if saved_mem.len() != self.mem.len() {
-            anyhow::bail!(format!("mem length mismatch. Expected: {}, Actual: {}", saved_mem.len(), self.mem.len()));
-        }
+        assert_eq!(saved_mem.len(), self.mem.len());
 
         for pfn in 0..(saved_mem.len()/PAGE_SIZE) {
             saved_mem.read_at(pfn * PAGE_SIZE, &mut saved_mem_data);
             self.mem.read_at(pfn * PAGE_SIZE, &mut self_mem_data);
 
             for i in 0..PAGE_SIZE {
-                if saved_mem_data[i] != self_mem_data[i] {
-                    anyhow::bail!(format!("memory mismatched at offset {}. Expected: {}, Actual: {}", pfn * PAGE_SIZE + i, saved_mem_data[i], self_mem_data[i]));
-                }
+                assert_eq!(saved_mem_data[i], self_mem_data[i]);
             }
         }
 
-        if saved_state.qid != self.qid {
-            anyhow::bail!(format!("qid did not match. Expected: {}, Actual: {}", saved_state.qid, self.qid));
-        }
-
-        if saved_state.sq_entries != self.sq_entries {
-            anyhow::bail!(format!("sq_entries did not match. Expected: {}, Actual: {}", saved_state.sq_entries, self.sq_entries));
-        }
-
-        if saved_state.cq_entries != self.cq_entries {
-            anyhow::bail!(format!("cq_entries did not match. Expected: {}, Actual: {}", saved_state.cq_entries, self.cq_entries));
-        }
-
-        Ok(())
+        assert_eq!(saved_state.qid, self.qid);
+        assert_eq!(saved_state.sq_entries, self.sq_entries);
+        assert_eq!(saved_state.cq_entries, self.cq_entries);
     }
 }
 
@@ -634,7 +620,7 @@ enum Req {
     Inspect(inspect::Deferred),
     Save(Rpc<(), Result<QueueHandlerSavedState, anyhow::Error>>),
     #[cfg(test)]
-    Verify(Rpc<QueueHandlerSavedState, anyhow::Result<()>>),
+    Verify(Rpc<QueueHandlerSavedState, ()>),
 }
 
 #[derive(Inspect)]
@@ -716,19 +710,9 @@ impl QueueHandler {
                     Req::Verify(verify_state) => {
                         let saved_state = verify_state.input();
 
-                        let sq_verify = self.sq.verify_restore(saved_state.sq_state.clone());
-                        let cq_verify = self.cq.verify_restore(saved_state.cq_state.clone());
-                        let pending_cmds_verify = self.commands.verify_restore(saved_state.pending_cmds.clone());
-
-                        if let Err(_) = sq_verify {
-                            verify_state.complete(sq_verify);
-                        } else if let Err(_) = cq_verify {
-                            verify_state.complete(cq_verify);
-                        } else if let Err(_) = pending_cmds_verify {
-                            verify_state.complete(pending_cmds_verify);
-                        } else {
-                            verify_state.complete(Ok(()));
-                        }
+                        self.sq.verify_restore(saved_state.sq_state.clone());
+                        self.cq.verify_restore(saved_state.cq_state.clone());
+                        self.commands.verify_restore(saved_state.pending_cmds.clone());
                     }
                 },
                 Event::Completion(completion) => {
@@ -757,12 +741,6 @@ impl QueueHandler {
             drain_after_restore: sq_state.sqid != 0 && !pending_cmds.commands.is_empty(),
         })
     }
-
-    /// Given the QueueHandlerSavedState, it verifies the constructed handler.
-    #[cfg(test)]
-    pub(crate) fn verify_restore(saved_state: QueueHandlerSavedState) -> anyhow::Result<()> {
-        anyhow::bail!("verify_restore not yet implemented for the QueueHandler");
-    }
 }
diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs
index 52b84fbf90..0d43a255c6 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs
@@ -110,28 +110,12 @@ impl SubmissionQueue {
     /// Given the saved state, checks the state of the submission queue. Does not verify memory.
     // TODO: Can this be an associated function instead?
     #[cfg(test)]
-    pub fn verify_restore(&self, saved_state: SubmissionQueueSavedState) -> anyhow::Result<()> {
-        if saved_state.sqid != self.sqid {
-            anyhow::bail!(format!("sqid for the submission queue did not match. Expected: {}, Actual: {}", saved_state.sqid, self.sqid));
-        }
-
-        if saved_state.head != self.head {
-            anyhow::bail!(format!("head for the submission queue did not match. Expected: {}, Actual: {}", saved_state.head, self.head));
-        }
-
-        if saved_state.tail != self.tail {
-            anyhow::bail!(format!("tail for the submission queue did not match. Expected: {}, Actual: {}", saved_state.tail, self.tail));
-        }
-
-        if saved_state.committed_tail != self.committed_tail {
-            anyhow::bail!(format!("committed_tail for the submission queue did not match. Expected: {}, Actual: {}", saved_state.committed_tail, self.committed_tail));
-        }
-
-        if saved_state.len != self.len {
-            anyhow::bail!(format!("len for the submission queue did not match. Expected: {}, Actual: {}", saved_state.len, self.len));
-        }
-
-        Ok(())
+    pub(crate) fn verify_restore(&self, saved_state: SubmissionQueueSavedState) {
+        assert_eq!(saved_state.sqid, self.sqid);
+        assert_eq!(saved_state.head, self.head);
+        assert_eq!(saved_state.tail, self.tail);
+        assert_eq!(saved_state.committed_tail, self.committed_tail);
+        assert_eq!(saved_state.len, self.len);
     }
 }
 
@@ -231,28 +215,12 @@ impl CompletionQueue {
     /// Given the saved state, checks the state of the completion queue. Does not verify memory.
     #[cfg(test)]
-    pub fn verify_restore(&self, saved_state: CompletionQueueSavedState) -> anyhow::Result<()> {
-        if saved_state.cqid != self.cqid {
-            anyhow::bail!(format!("cqid for the completion queue did not match. Expected: {}, Actual: {}", saved_state.cqid, self.cqid));
-        }
-
-        if saved_state.head != self.head {
-            anyhow::bail!(format!("head for the completion queue did not match. Expected: {}, Actual: {}", saved_state.head, self.head));
-        }
-
-        if saved_state.committed_head != self.committed_head {
-            anyhow::bail!(format!("committed_head for the completion queue did not match. Expected: {}, Actual: {}", saved_state.committed_head, self.committed_head));
-        }
-
-        if saved_state.len != self.len {
-            anyhow::bail!(format!("len for the completion queue did not match. Expected: {}, Actual: {}", saved_state.len, self.len));
-        }
-
-        if saved_state.phase != self.phase {
-            anyhow::bail!(format!("phase for the completion queue did not match. Expected: {}, Actual: {}", saved_state.phase, self.phase));
-        }
-
-        Ok(())
+    pub(crate) fn verify_restore(&self, saved_state: CompletionQueueSavedState) {
+        assert_eq!(saved_state.cqid, self.cqid);
+        assert_eq!(saved_state.head, self.head);
+        assert_eq!(saved_state.committed_head, self.committed_head);
+        assert_eq!(saved_state.len, self.len);
+        assert_eq!(saved_state.phase, self.phase);
     }
 }
diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
index 5e3566bb65..57a7d4d87e 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
@@ -320,8 +320,8 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) {
     let host_allocator = EmulatedDmaAllocator::new(mem.clone());
     let verify_mem = DmaClient::attach_dma_buffer(&host_allocator, base_len, 0).unwrap();
 
-    let verify = new_nvme_driver.verify_restore(saved_state, verify_mem).await;
-    assert_eq!(verify.unwrap(), ());
+    // Verify restore functions will panic if verification failed.
+    new_nvme_driver.verify_restore(saved_state, verify_mem).await;
 }
 
 #[derive(Inspect)]

From 2d616b5dbff756f072656a594dafe0b6363a7106 Mon Sep 17 00:00:00 2001
From: Guramrit Singh
Date: Fri, 7 Feb 2025 13:03:11 -0800
Subject: [PATCH 09/22] Added verify restore for PendingCommand too

---
 .../disk_nvme/nvme_driver/src/queue_pair.rs | 19 +++++++++++++++++--
 vm/devices/storage/nvme_spec/src/lib.rs     |  4 ++--
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
index 250b7d5176..098c733bbb 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
@@ -167,7 +167,14 @@ impl PendingCommands {
     /// Given the saved state, verifies the state of the PendingCommands to match the saved state
     #[cfg(test)]
     pub(crate) fn verify_restore(&self, saved_state: PendingCommandsSavedState) {
-        panic!("verify restore for PendingCommands not implemented");
+        // TODO: [expand-verify-restore-functionality] cid_key_bits are currently unused during restore.
+        assert_eq!(saved_state.commands.len(), self.commands.len());
+
+        for (index, command) in &self.commands {
+            command.verify_restore(&saved_state.commands[index]);
+        }
+
+        assert_eq!(saved_state.next_cid_high_bits, self.next_cid_high_bits.0);
     }
 }
 
@@ -337,7 +344,7 @@ impl QueuePair {
         // Entire memory region is checked below. No need for the handler to check it again.
         // Send an RPC request to QueueHandler thread to verify the restore status.
-        self.issuer.send.call(Req::Verify, saved_state.handler_data).await;
+        let _ = self.issuer.send.call(Req::Verify, saved_state.handler_data).await;
 
         // TODO: Do we need to verify_restore for cancel?
         // TODO: Do we need to verify_restore for issuers?
@@ -764,6 +771,14 @@ impl QueueHandler {
     }
 }
 
+#[cfg(test)]
+impl PendingCommand {
+    /// Verifies the value of the pending command from a saved state
+    pub(crate) fn verify_restore(&self, saved_state: &PendingCommandSavedState) {
+        assert_eq!(saved_state.command, self.command);
+    }
+}
+
 pub(crate) fn admin_cmd(opcode: spec::AdminOpcode) -> spec::Command {
     spec::Command {
         cdw0: spec::Cdw0::new().with_opcode(opcode.0),
diff --git a/vm/devices/storage/nvme_spec/src/lib.rs b/vm/devices/storage/nvme_spec/src/lib.rs
index 812a950225..a12232d1a7 100644
--- a/vm/devices/storage/nvme_spec/src/lib.rs
+++ b/vm/devices/storage/nvme_spec/src/lib.rs
@@ -127,7 +127,7 @@ pub struct Aqa {
 }
 
 #[repr(C)]
-#[derive(Copy, Clone, Debug, IntoBytes, Immutable, KnownLayout, FromBytes, Inspect)]
+#[derive(Copy, Clone, Debug, IntoBytes, Immutable, KnownLayout, FromBytes, Inspect, PartialEq)]
 pub struct Command {
     pub cdw0: Cdw0,
     pub nsid: u32,
@@ -146,7 +146,7 @@ pub struct Command {
 
 #[derive(Inspect)]
 #[bitfield(u32)]
-#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
+#[derive(IntoBytes, Immutable, KnownLayout, FromBytes, PartialEq)]
 pub struct Cdw0 {
     pub opcode: u8,
     #[bits(2)]

From 139681145e4787d286ad6e53623b458a5ae514f1 Mon Sep 17 00:00:00 2001
From: Guramrit Singh
Date: Fri, 7 Feb 2025 13:36:59 -0800
Subject: [PATCH 10/22] Added in the verify restore for I/O Queues

---
 .../disk_nvme/nvme_driver/src/driver.rs     | 25 ++++++++++++++++---
 .../disk_nvme/nvme_driver/src/queue_pair.rs |  4 +--
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
index c2163fdd77..4a14b0ab92 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
@@ -138,6 +138,14 @@ impl IoQueue {
             cpu: *cpu,
         })
     }
+
+    #[cfg(test)]
+    pub(crate) async fn verify_restore(&self, saved_state: IoQueueSavedState, mem: MemoryBlock) {
+        self.queue.verify_restore(saved_state.queue_data, mem);
+
+        // TODO: [expand-verify-restore-functionality] What is the iv in the IoQueue vs msix in the IoQueueSavedState
+        assert_eq!(saved_state.cpu, self.cpu);
+    }
 }
 
 #[derive(Debug, Inspect)]
@@ -686,6 +694,9 @@ impl NvmeDriver {
     /// Given an input of the saved state from which the driver was constructed and the underlying
     /// memory, this validates the current driver.
+    // TODO: [expand-verify-restore-functionality] move this function to take a reference to the
+    // SavedState for versatility of use (That way it doesn't take ownership of the saved state
+    // from the unit tests)
     #[cfg(test)]
     pub(crate) async fn verify_restore(&mut self, saved_state: NvmeDriverSavedState, mem: MemoryBlock) {
         if let Some(task) = self.task.as_mut() {
             task.stop().await;
             let worker = task.task();
 
             // Verify Admin Queue
-            // TODO: [expand-verify-restore-functionality] Currently providing base_pfn value in u64, this might panic
             match (saved_state.worker_data.admin, &worker.admin) {
                 (None, None) => (),
                 (Some(admin_saved_state), Some(admin)) => {
+                    // TODO: [expand-verify-restore-functionality] Currently providing base_pfn value in u64, this might panic
                     let admin_saved_mem = mem.subblock(admin_saved_state.base_pfn.try_into().unwrap(), admin_saved_state.mem_len);
                     admin.verify_restore(admin_saved_state, admin_saved_mem).await;
                 },
                 _ => panic!("admin queue states do not match"),
             };
 
-            // TODO: Verify I/O Queues with strict ordering
+            // Verify I/O queues in strict ordering
+            assert_eq!(saved_state.worker_data.io.len(), worker.io.len());
+            for index in 0..saved_state.worker_data.io.len() {
+                let io_saved_state = saved_state.worker_data.io[index].clone();
+                let io_saved_mem = mem.subblock(io_saved_state.queue_data.base_pfn.try_into().unwrap(), io_saved_state.queue_data.mem_len);
+                worker.io[index].verify_restore(io_saved_state, io_saved_mem);
+            }
             task.start();
         } else {
             panic!("task cannot be None() after restore");
         }
diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
index 098c733bbb..522438188e 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
@@ -346,8 +346,8 @@ impl QueuePair {
         // Send an RPC request to QueueHandler thread to verify the restore status.
         let _ = self.issuer.send.call(Req::Verify, saved_state.handler_data).await;
 
-        // TODO: Do we need to verify_restore for cancel?
-        // TODO: Do we need to verify_restore for issuers?
+        // TODO: [expand-verify-restore-functionality] Do we need to verify_restore for cancel?
+        // TODO: [expand-verify-restore-functionality] Do we need to verify_restore for issuers?
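The "providing base_pfn value in u64" TODO above is about units: base_pfn is a page frame number, but it is handed to MemoryBlock::subblock directly. A sketch of the conversion, under the assumption that subblock takes a byte offset; pfn_to_offset is an invented helper, and PAGE_SIZE matches the constant used elsewhere in this crate:

    const PAGE_SIZE: usize = 4096; // assumed, as in user_driver::memory

    /// Convert a saved page frame number into a byte offset, panicking on overflow.
    fn pfn_to_offset(base_pfn: u64) -> usize {
        usize::try_from(base_pfn).expect("pfn fits in usize") * PAGE_SIZE
    }

    fn main() {
        // PFN 2 is two pages in, i.e. byte offset 8192.
        assert_eq!(pfn_to_offset(2), 8192);
    }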
         // Verify bytes from memory
         let mut saved_mem_data: [u8; PAGE_SIZE] = [0; PAGE_SIZE];
         let mut self_mem_data: [u8; PAGE_SIZE] = [0; PAGE_SIZE];

From 1da4fbcc71915378d144cd102321cc1552b48591 Mon Sep 17 00:00:00 2001
From: Guramrit Singh
Date: Mon, 10 Feb 2025 17:08:17 -0800
Subject: [PATCH 11/22] Commented out code to use the TestMapper instead of using AlignedHeapMemory

---
 vm/devices/user_driver/src/emulated.rs | 24 ++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/vm/devices/user_driver/src/emulated.rs b/vm/devices/user_driver/src/emulated.rs
index 6a592e2f6f..fae1d7cead 100644
--- a/vm/devices/user_driver/src/emulated.rs
+++ b/vm/devices/user_driver/src/emulated.rs
@@ -25,6 +25,7 @@ use pci_core::chipset_device_ext::PciChipsetDeviceExt;
 use pci_core::msi::MsiControl;
 use pci_core::msi::MsiInterruptSet;
 use pci_core::msi::MsiInterruptTarget;
+use sparse_mmap::SparseMapping;
 use safeatomic::AtomicSliceOps;
 use std::ptr::NonNull;
 use std::sync::atomic::AtomicU8;
@@ -139,6 +140,25 @@ pub struct DeviceSharedMemory {
     state: Arc<Mutex<Vec<u64>>>,
 }
 
+// struct RealBacking {
+//     mem: Arc<TestMapper>,
+//     allow_dma: bool,
+// }
+
+// unsafe impl GuestMemoryAccess for RealBacking {
+//     fn mapping(&self) -> Option<NonNull<u8>> {
+//         NonNull::new(self.mem.as_ptr().cast())
+//     }
+//
+//     fn base_iova(&self) -> Option<u64> {
+//         self.allow_dma.then_some(0)
+//     }
+//
+//     fn max_address(&self) -> u64 {
+//         self.mem.len().try_into().unwrap()
+//     }
+// }
+
 struct Backing {
     mem: Arc<AlignedHeapMemory>,
     allow_dma: bool,
@@ -163,6 +183,10 @@ impl DeviceSharedMemory {
     pub fn new(size: usize, extra: usize) -> Self {
         assert_eq!(size % PAGE_SIZE, 0);
         assert_eq!(extra % PAGE_SIZE, 0);
+        // let _real_mem_backing = RealBacking {
+        //     mem: Arc::new(TestMapper::new(size + extra)),
+        //     allow_dma: false,
+        // };
         let mem_backing = Backing {
             mem: Arc::new(AlignedHeapMemory::new(size + extra)),
             allow_dma: false,

From ccb7c5fde9ac24bafe7981abb1839ef4aab8b3 Mon Sep 17 00:00:00 2001
From: Guramrit Singh
Date: Mon, 10 Feb 2025 17:15:54 -0800
Subject: [PATCH 12/22] Verified with Yuri that we don't need to check the values for cancel and issuers in the queue_pair

---
 vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
index 522438188e..819af21559 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
@@ -344,11 +344,9 @@ impl QueuePair {
     pub(crate) async fn verify_restore(&self, saved_state: QueuePairSavedState, saved_mem: MemoryBlock) {
         // Entire memory region is checked below. No need for the handler to check it again.
-        // Send an RPC request to QueueHandler thread to verify the restore status.
         let _ = self.issuer.send.call(Req::Verify, saved_state.handler_data).await;
 
-        // TODO: [expand-verify-restore-functionality] Do we need to verify_restore for cancel?
-        // TODO: [expand-verify-restore-functionality] Do we need to verify_restore for issuers?
-        // Verify bytes from memory
+        // cancel and issuers params are runtime parameters so we don't check underlying values.
         let mut saved_mem_data: [u8; PAGE_SIZE] = [0; PAGE_SIZE];
         let mut self_mem_data: [u8; PAGE_SIZE] = [0; PAGE_SIZE];

From 5e4c82c4cafedefd17cf442322efc2f0023bbc9c Mon Sep 17 00:00:00 2001
From: Guramrit Singh
Date: Mon, 10 Feb 2025 17:51:48 -0800
Subject: [PATCH 13/22] Now checking the self.identify value as well

---
 .../storage/disk_nvme/nvme_driver/src/driver.rs | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
index 4a14b0ab92..a1462a3063 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
@@ -144,6 +144,7 @@ impl IoQueue {
         self.queue.verify_restore(saved_state.queue_data, mem);
 
         // TODO: [expand-verify-restore-functionality] What is the iv in the IoQueue vs msix in the IoQueueSavedState
+        assert_eq!(saved_state.msix, self.iv as u32);
         assert_eq!(saved_state.cpu, self.cpu);
     }
 }
@@ -728,17 +729,13 @@ impl NvmeDriver {
         assert_eq!(saved_state.device_id, self.device_id);
 
-        // TODO: [expand-verify-restore-functionality]
-        // self.identify.verify_restore(saved_state.identify_ctrl)?;
+        if let Some(identify) = &self.identify {
+            assert_eq!(saved_state.identify_ctrl.as_bytes(), identify.as_bytes());
+        } else {
+            panic!("identify value cannot be None after restore");
+        }
 
         // TODO: [expand-verify-restore-functionality] Namespace save is currently not supported.
-        // if saved_state.namespaces.len() != self.namespaces.len() {
-        //     return Err(format!("number of namespaces after restore is incorrect. Expected: {} Actual: {}", saved_state.namespaces.len(), self.namespaces.len()));
-        // }
-
-        // for i in 0..saved_state.namespaces.len() {
-        //     self.namespaces[i].verify_restore(saved_state.namespaces[i])?;
-        // }
-
         assert!(self.nvme_keepalive);
     }
 }
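The identify check above compares the two structures as raw bytes via zerocopy's as_bytes. A minimal standalone illustration of the technique (zerocopy 0.8 derive names; the field layout here is a made-up stand-in, not the real NVMe Identify structure):

    use zerocopy::{Immutable, IntoBytes};

    // Stand-in POD struct; as_bytes() requires IntoBytes + Immutable,
    // and the derive rejects types with internal padding.
    #[derive(IntoBytes, Immutable)]
    #[repr(C)]
    struct IdentifyLike {
        vid: u16,
        ssvid: u16,
        mdts: u8,
        _rsvd: [u8; 3],
    }

    fn main() {
        let a = IdentifyLike { vid: 0x1414, ssvid: 0x1414, mdts: 5, _rsvd: [0; 3] };
        let b = IdentifyLike { vid: 0x1414, ssvid: 0x1414, mdts: 5, _rsvd: [0; 3] };
        // Byte-wise equality avoids needing PartialEq on the large spec struct.
        assert_eq!(a.as_bytes(), b.as_bytes());
    }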
From 58cdec0b4f03981d5057154e1bcaf1b6c4665c4b Mon Sep 17 00:00:00 2001
From: Guramrit Singh
Date: Mon, 10 Feb 2025 18:04:54 -0800
Subject: [PATCH 14/22] Using reference to saved state instead of passing in the object itself

---
 .../storage/disk_nvme/nvme_driver/src/driver.rs | 16 +++++++---------
 .../disk_nvme/nvme_driver/src/queue_pair.rs     | 12 ++++++------
 .../storage/disk_nvme/nvme_driver/src/queues.rs |  4 ++--
 .../storage/disk_nvme/nvme_driver/src/tests.rs  |  2 +-
 4 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
index a1462a3063..2ad81927a7 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
@@ -140,10 +140,9 @@ impl IoQueue {
     #[cfg(test)]
-    pub(crate) async fn verify_restore(&self, saved_state: IoQueueSavedState, mem: MemoryBlock) {
-        self.queue.verify_restore(saved_state.queue_data, mem);
+    pub(crate) async fn verify_restore(&self, saved_state: &IoQueueSavedState, mem: MemoryBlock) {
+        self.queue.verify_restore(&saved_state.queue_data, mem);
 
-        // TODO: [expand-verify-restore-functionality] What is the iv in the IoQueue vs msix in the IoQueueSavedState
         assert_eq!(saved_state.msix, self.iv as u32);
         assert_eq!(saved_state.cpu, self.cpu);
     }
@@ -699,18 +698,18 @@ impl NvmeDriver {
     #[cfg(test)]
-    pub(crate) async fn verify_restore(&mut self, saved_state: NvmeDriverSavedState, mem: MemoryBlock) {
+    pub(crate) async fn verify_restore(&mut self, saved_state: &NvmeDriverSavedState, mem: MemoryBlock) {
         if let Some(task) = self.task.as_mut() {
             task.stop().await;
             let worker = task.task();
 
             // Verify Admin Queue
-            match (saved_state.worker_data.admin, &worker.admin) {
+            match (&saved_state.worker_data.admin, &worker.admin) {
                 (None, None) => (),
                 (Some(admin_saved_state), Some(admin)) => {
                     // TODO: [expand-verify-restore-functionality] Currently providing base_pfn value in u64, this might panic
                     let admin_saved_mem = mem.subblock(admin_saved_state.base_pfn.try_into().unwrap(), admin_saved_state.mem_len);
-                    admin.verify_restore(admin_saved_state, admin_saved_mem).await;
+                    admin.verify_restore(&admin_saved_state, admin_saved_mem).await;
                 },
                 _ => panic!("admin queue states do not match"),
             };
@@ -720,7 +719,7 @@ impl NvmeDriver {
             for index in 0..saved_state.worker_data.io.len() {
                 let io_saved_state = saved_state.worker_data.io[index].clone();
                 let io_saved_mem = mem.subblock(io_saved_state.queue_data.base_pfn.try_into().unwrap(), io_saved_state.queue_data.mem_len);
-                worker.io[index].verify_restore(io_saved_state, io_saved_mem);
+                worker.io[index].verify_restore(&io_saved_state, io_saved_mem);
             }
             task.start();
         } else {
@@ -736,7 +735,6 @@ impl NvmeDriver {
         }
 
         // TODO: [expand-verify-restore-functionality] Namespace save is currently not supported.
-
         assert!(self.nvme_keepalive);
     }
 }
diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
index 819af21559..80c0379a74 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
@@ -166,7 +166,7 @@ impl PendingCommands {
     /// Given the saved state, verifies the state of the PendingCommands to match the saved state
     #[cfg(test)]
-    pub(crate) fn verify_restore(&self, saved_state: PendingCommandsSavedState) {
+    pub(crate) fn verify_restore(&self, saved_state: &PendingCommandsSavedState) {
         // TODO: [expand-verify-restore-functionality] cid_key_bits are currently unused during restore.
         assert_eq!(saved_state.commands.len(), self.commands.len());
 
@@ -341,10 +341,10 @@ impl QueuePair {
     /// Given the saved state of a queue pair, this verifies the constructed queue pair.
     /// Input memory block should already be constructed from the offsets.
     #[cfg(test)]
-    pub(crate) async fn verify_restore(&self, saved_state: QueuePairSavedState, saved_mem: MemoryBlock) {
+    pub(crate) async fn verify_restore(&self, saved_state: &QueuePairSavedState, saved_mem: MemoryBlock) {
         // Entire memory region is checked below. No need for the handler to check it again.
-        let _ = self.issuer.send.call(Req::Verify, saved_state.handler_data).await;
+        let _ = self.issuer.send.call(Req::Verify, saved_state.handler_data.clone()).await;
 
         // cancel and issuers params are runtime parameters so we don't check underlying values.
@@ -715,9 +715,9 @@ impl QueueHandler {
                     Req::Verify(verify_state) => {
                         let saved_state = verify_state.input();
 
-                        self.sq.verify_restore(saved_state.sq_state.clone());
-                        self.cq.verify_restore(saved_state.cq_state.clone());
-                        self.commands.verify_restore(saved_state.pending_cmds.clone());
+                        self.sq.verify_restore(&saved_state.sq_state);
+                        self.cq.verify_restore(&saved_state.cq_state);
+                        self.commands.verify_restore(&saved_state.pending_cmds);
                     }
                 },
                 Event::Completion(completion) => {
diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs
index 0d43a255c6..1958ea3943 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs
@@ -110,7 +110,7 @@ impl SubmissionQueue {
     /// Given the saved state, checks the state of the submission queue. Does not verify memory.
     // TODO: Can this be an associated function instead?
     #[cfg(test)]
-    pub(crate) fn verify_restore(&self, saved_state: SubmissionQueueSavedState) {
+    pub(crate) fn verify_restore(&self, saved_state: &SubmissionQueueSavedState) {
         assert_eq!(saved_state.sqid, self.sqid);
         assert_eq!(saved_state.head, self.head);
         assert_eq!(saved_state.tail, self.tail);
@@ -215,7 +215,7 @@ impl CompletionQueue {
     /// Given the saved state, checks the state of the completion queue. Does not verify memory.
     #[cfg(test)]
-    pub(crate) fn verify_restore(&self, saved_state: CompletionQueueSavedState) {
+    pub(crate) fn verify_restore(&self, saved_state: &CompletionQueueSavedState) {
         assert_eq!(saved_state.cqid, self.cqid);
         assert_eq!(saved_state.head, self.head);
         assert_eq!(saved_state.committed_head, self.committed_head);
diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
index 57a7d4d87e..866cfd9f91 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
@@ -321,7 +321,7 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) {
 
     // Verify restore functions will panic if verification failed.
-    new_nvme_driver.verify_restore(saved_state, verify_mem).await;
+    new_nvme_driver.verify_restore(&saved_state, verify_mem).await;
 }
 
 #[derive(Inspect)]

From c5f7d491481c280f51868918dec8d154fb27daf2 Mon Sep 17 00:00:00 2001
From: Guramrit Singh
Date: Mon, 10 Feb 2025 18:09:52 -0800
Subject: [PATCH 15/22] Comments cleanup

---
 vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs     | 3 ---
 vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs | 3 +--
 vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs      | 5 -----
 3 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
index 2ad81927a7..d2a78115f6 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
@@ -694,9 +694,6 @@ impl NvmeDriver {
     /// Given an input of the saved state from which the driver was constructed and the underlying
     /// memory, this validates the current driver.
-    // TODO: [expand-verify-restore-functionality] move this function to take a reference to the
-    // SavedState for versatility of use (That way it doesn't take ownership of the saved state
-    // from the unit tests)
     #[cfg(test)]
     pub(crate) async fn verify_restore(&mut self, saved_state: &NvmeDriverSavedState, mem: MemoryBlock) {
         if let Some(task) = self.task.as_mut() {
diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
index 80c0379a74..73bf66df8f 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
@@ -343,10 +343,9 @@ impl QueuePair {
     #[cfg(test)]
     pub(crate) async fn verify_restore(&self, saved_state: &QueuePairSavedState, saved_mem: MemoryBlock) {
         // Entire memory region is checked below. No need for the handler to check it again.
-        // Send an RPC request to QueueHandler thread to verify the restore status.
         let _ = self.issuer.send.call(Req::Verify, saved_state.handler_data.clone()).await;
 
-        // cancel and issuers params are runtime parameters so we don't check underlying values.
+        // `cancel` and `issuers` params are runtime parameters so we don't check underlying values.
         let mut saved_mem_data: [u8; PAGE_SIZE] = [0; PAGE_SIZE];
         let mut self_mem_data: [u8; PAGE_SIZE] = [0; PAGE_SIZE];
diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
index 866cfd9f91..92af859e5c 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
@@ -312,11 +312,6 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) {
 
     // ===== VERIFY RESTORE =====
-    // TODO: [nvme-keepalive-testing] We should be byte-by-byte copying this memory before
-    // passing it in. Reserving this for when we swap out the back end to use actual memory instead
-    // of emulated memory.
- // TODO: [nvme-keepalive-testing] Do not need to use "allocate dma buffer" once we have actual - // memory backing let host_allocator = EmulatedDmaAllocator::new(mem.clone()); let verify_mem = DmaClient::attach_dma_buffer(&host_allocator, base_len, 0).unwrap(); From cb72ac8eeee4ac24bafe7981abb1839ef4aab8b3 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Thu, 20 Feb 2025 13:43:15 -0800 Subject: [PATCH 16/22] Commiting before doing a rebase on main --- .../disk_nvme/nvme_driver/src/tests.rs | 7 ++++--- vm/devices/user_driver/src/emulated.rs | 20 ------------------- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs index 92af859e5c..123d15c5a0 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs @@ -305,14 +305,15 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) { backoff.back_off().await; // ====== SECOND DRIVER INIT ===== - let _new_device = EmulatedDevice::new(new_nvme_ctrl, new_msi_x, mem.clone()); - let mut new_nvme_driver = NvmeDriver::restore(&driver_source, CPU_COUNT, _new_device, &saved_state) + let mem_new = DeviceSharedMemory::new(base_len, payload_len); + let new_device = EmulatedDevice::new(new_nvme_ctrl, new_msi_x, mem_new.clone()); + let mut new_nvme_driver = NvmeDriver::restore(&driver_source, CPU_COUNT, new_device, &saved_state) .await .unwrap(); // ===== VERIFY RESTORE ===== - let host_allocator = EmulatedDmaAllocator::new(mem.clone()); + let host_allocator = EmulatedDmaAllocator::new(mem_new.clone()); let verify_mem = DmaClient::attach_dma_buffer(&host_allocator, base_len, 0).unwrap(); // Verify restore functions will panic if verification failed. 
diff --git a/vm/devices/user_driver/src/emulated.rs b/vm/devices/user_driver/src/emulated.rs index fae1d7cead..cd0052c68c 100644 --- a/vm/devices/user_driver/src/emulated.rs +++ b/vm/devices/user_driver/src/emulated.rs @@ -25,7 +25,6 @@ use pci_core::chipset_device_ext::PciChipsetDeviceExt; use pci_core::msi::MsiControl; use pci_core::msi::MsiInterruptSet; use pci_core::msi::MsiInterruptTarget; -use sparse_mmap::SparseMapping; use safeatomic::AtomicSliceOps; use std::ptr::NonNull; use std::sync::atomic::AtomicU8; @@ -140,25 +139,6 @@ pub struct DeviceSharedMemory { state: Arc>>, } -// struct RealBacking { -// mem: Arc, -// allow_dma: bool, -// } - -// unsafe impl GuestMemoryAccess for RealBacking { -// fn mapping(&self) -> Option> { -// NonNull::new(self.mem.as_ptr().cast()) -// } -// -// fn base_iova(&self) -> Option { -// self.allow_dma.then_some(0) -// } -// -// fn max_address(&self) -> u64 { -// self.mem.len().try_into().unwrap() -// } -// } - struct Backing { mem: Arc, allow_dma: bool, From 6e2bd10f073f6c812ab9c3bc4569ca724d6b709d Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Thu, 20 Feb 2025 14:09:02 -0800 Subject: [PATCH 17/22] verify restore functionality is working again --- vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs | 2 +- vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs | 1 + vm/devices/user_driver/src/emulated.rs | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs index d2a78115f6..c59512cbca 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs @@ -143,7 +143,7 @@ impl IoQueue { pub(crate) async fn verify_restore(&self, saved_state: &IoQueueSavedState, mem: MemoryBlock) { self.queue.verify_restore(&saved_state.queue_data, mem); - assert_eq!(saved_state.msix, self.iv as u32); + assert_eq!(saved_state.iv, self.iv as u32); assert_eq!(saved_state.cpu, self.cpu); } } diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs index 123d15c5a0..f0395082a9 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs @@ -20,6 +20,7 @@ use std::sync::Arc; use test_with_tracing::test; use user_driver::emulated::DeviceSharedMemory; use user_driver::emulated::EmulatedDevice; +use user_driver::emulated::EmulatedDmaAllocator; use user_driver::emulated::Mapping; use user_driver::interrupt::DeviceInterrupt; use user_driver::DeviceBacking; diff --git a/vm/devices/user_driver/src/emulated.rs b/vm/devices/user_driver/src/emulated.rs index cd0052c68c..bbd48d5488 100644 --- a/vm/devices/user_driver/src/emulated.rs +++ b/vm/devices/user_driver/src/emulated.rs @@ -289,6 +289,14 @@ pub struct EmulatedDmaAllocator { shared_mem: DeviceSharedMemory, } +impl EmulatedDmaAllocator { + pub fn new(shared_mem: DeviceSharedMemory) -> Self { + EmulatedDmaAllocator { + shared_mem, + } + } +} + impl DmaClient for EmulatedDmaAllocator { fn allocate_dma_buffer(&self, len: usize) -> anyhow::Result { let memory = MemoryBlock::new(self.shared_mem.alloc(len).context("out of memory")?); From 76f1a723e49fcf40f528dc95287d54cdcc1ceb4c Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Thu, 20 Feb 2025 14:14:11 -0800 Subject: [PATCH 18/22] Removing some ununsed comments --- vm/devices/user_driver/src/emulated.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git 
index bbd48d5488..25ca52484c 100644
--- a/vm/devices/user_driver/src/emulated.rs
+++ b/vm/devices/user_driver/src/emulated.rs
@@ -163,10 +163,6 @@ impl DeviceSharedMemory {
     pub fn new(size: usize, extra: usize) -> Self {
         assert_eq!(size % PAGE_SIZE, 0);
         assert_eq!(extra % PAGE_SIZE, 0);
-        // let _real_mem_backing = RealBacking {
-        //     mem: Arc::new(TestMapper::new(size + extra)),
-        //     allow_dma: false,
-        // };
         let mem_backing = Backing {
             mem: Arc::new(AlignedHeapMemory::new(size + extra)),
             allow_dma: false,

From 47079d6af7d72078f367ea621fd7563f2dd6997e Mon Sep 17 00:00:00 2001
From: Guramrit Singh
Date: Thu, 20 Feb 2025 20:34:23 -0800
Subject: [PATCH 19/22] Updated alloc_specific to only give out a page if the
 underlying page is free

---
 .../disk_nvme/nvme_driver/src/driver.rs     |  3 +-
 .../disk_nvme/nvme_driver/src/queue_pair.rs |  8 ++-
 .../disk_nvme/nvme_driver/src/tests.rs      | 53 +++++++++++++------
 vm/devices/user_driver/src/emulated.rs      | 31 ++++++++---
 4 files changed, 69 insertions(+), 26 deletions(-)

diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
index c59512cbca..42dd78bb14 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
@@ -720,7 +720,8 @@ impl NvmeDriver {
             }
             task.start();
         } else {
-            panic!("task cannot be None() after restore");
+            // The driver should be enabled during restore, so the task cannot be None.
+            panic!("task cannot be None after restore");
         }
 
         assert_eq!(saved_state.device_id, self.device_id);
diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
index 73bf66df8f..7625dd2c9b 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
@@ -169,7 +169,6 @@ impl PendingCommands {
     pub(crate) fn verify_restore(&self, saved_state: &PendingCommandsSavedState) {
         // TODO: [expand-verify-restore-functionality] cid_key_bits are currently unused during restore.
         assert_eq!(saved_state.commands.len(), self.commands.len());
-
         for (index, command) in &self.commands {
             command.verify_restore(&saved_state.commands[index]);
         }
@@ -351,12 +350,17 @@ impl QueuePair {
 
         assert_eq!(saved_mem.len(), self.mem.len());
 
+        // [expand-verify-restore-functionality] The base PFN value can't be checked after
+        // restore. This needs to be checked in a 'save' test instead.
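Aside on the alloc_specific change in the emulated.rs hunk below (illustration only, not part of the patch): page ownership is tracked in a bitmap of u64 words, one bit per PFN, so the free check and the claim are plain bit arithmetic. For a hypothetical PFN 70, the word index is 70 / 64 = 1 and the bit index is 70 % 64 = 6:

    // A page is free when its bit in the state bitmap is clear...
    let in_use = state[70 / 64] & (1 << (70 % 64)) != 0;
    // ...and claiming the page sets the bit.
    state[70 / 64] |= 1 << (70 % 64);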
        for pfn in 0..(saved_mem.len()/PAGE_SIZE) {
             saved_mem.read_at(pfn * PAGE_SIZE, &mut saved_mem_data);
             self.mem.read_at(pfn * PAGE_SIZE, &mut self_mem_data);
 
             for i in 0..PAGE_SIZE {
-                assert_eq!(saved_mem_data[i], self_mem_data[i]);
+                // assert_eq!(saved_mem_data[i], self_mem_data[i]);
+                if saved_mem_data[i] != self_mem_data[i] {
+                    println!("BYTES NOT THE SAME PFN={} ADDRESS={} LEFT={} RIGHT={}", pfn, i, saved_mem_data[i], self_mem_data[i]);
+                }
             }
         }
 
diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
index f0395082a9..bdff0a2777 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
@@ -23,6 +23,7 @@ use user_driver::emulated::EmulatedDevice;
 use user_driver::emulated::EmulatedDmaAllocator;
 use user_driver::emulated::Mapping;
 use user_driver::interrupt::DeviceInterrupt;
+use user_driver::memory::PAGE_SIZE;
 use user_driver::DeviceBacking;
 use user_driver::DeviceRegisterIo;
 use user_driver::DmaClient;
@@ -264,15 +265,20 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) {
         .unwrap();
 
     // ===== FIRST DRIVER INIT =====
-    let device = EmulatedDevice::new(nvme_ctrl, msi_x, mem.clone());
-    let mut nvme_driver = NvmeDriver::new(&driver_source, CPU_COUNT, device)
-        .await
-        .unwrap();
-    let _ns1 = nvme_driver.namespace(1).await.unwrap();
-    let saved_state = nvme_driver.save().await.unwrap();
+    let saved_state = {
+        let device = EmulatedDevice::new(nvme_ctrl, msi_x, mem.clone());
+        let mut nvme_driver = NvmeDriver::new(&driver_source, CPU_COUNT, device)
+            .await
+            .unwrap();
+        let _ns1 = nvme_driver.namespace(1).await.unwrap();
+        let saved = nvme_driver.save().await.unwrap();
+
+        // Tear down the original driver to kill the underlying tasks.
+        nvme_driver.shutdown().await;
+        saved
+    };
 
-    // Tear down the original driver to kill the underlying tasks.
-    nvme_driver.shutdown().await;
+
     // As of today we do not save namespace data to avoid possible conflict
     // when namespace has changed during servicing.
     // TODO: Review and re-enable in future.
@@ -305,20 +311,33 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) {
     // Wait for CSTS.RDY to set.
     backoff.back_off().await;
 
-    // ====== SECOND DRIVER INIT =====
-    let mem_new = DeviceSharedMemory::new(base_len, payload_len);
-    let new_device = EmulatedDevice::new(new_nvme_ctrl, new_msi_x, mem_new.clone());
+    // ===== COPY MEMORY =====
+    let mem_new = DeviceSharedMemory::new(base_len, payload_len);
+    let host_allocator = EmulatedDmaAllocator::new(mem.clone());
+    let mem_block = DmaClient::attach_dma_buffer(&host_allocator, base_len, 0).unwrap();
+    {
+        // Host allocator to new memory should be dropped so the memory can be consumed by the
+        // new driver.
+        let host_allocator_new_mem = EmulatedDmaAllocator::new(mem_new.clone());
+        let mem_block_new_mem = DmaClient::attach_dma_buffer(&host_allocator_new_mem, base_len, 0).unwrap();
+
+        let mut data_transfer_buffer: [u8; PAGE_SIZE] = [0; PAGE_SIZE];
+
+        let total_pages = base_len / PAGE_SIZE;
+        for pfn in 0..total_pages {
+            mem_block.read_at(pfn * PAGE_SIZE, &mut data_transfer_buffer);
+            mem_block_new_mem.write_at(pfn * PAGE_SIZE, &data_transfer_buffer);
+        }
+    }
+
+    // ===== SECOND DRIVER INIT & VERIFY =====
+    let new_device = NvmeTestEmulatedDevice::new(new_nvme_ctrl, new_msi_x, mem_new.clone());
     let mut new_nvme_driver = NvmeDriver::restore(&driver_source, CPU_COUNT, new_device, &saved_state)
         .await
         .unwrap();
-
-    // ===== VERIFY RESTORE =====
-    let host_allocator = EmulatedDmaAllocator::new(mem_new.clone());
-    let verify_mem = DmaClient::attach_dma_buffer(&host_allocator, base_len, 0).unwrap();
 
     // Verify restore functions will panic if verification failed.
-    new_nvme_driver.verify_restore(&saved_state, verify_mem).await;
+    new_nvme_driver.verify_restore(&saved_state, mem_block).await;
 }
 
 #[derive(Inspect)]
diff --git a/vm/devices/user_driver/src/emulated.rs b/vm/devices/user_driver/src/emulated.rs
index 25ca52484c..78fa8f0090 100644
--- a/vm/devices/user_driver/src/emulated.rs
+++ b/vm/devices/user_driver/src/emulated.rs
@@ -225,14 +225,33 @@ impl DeviceSharedMemory {
         })
     }
 
-    // TODO: [nvme-keepalive-testing]
-    // This is only a stop-gap until we can swap out the back end of nvme tests to use real memory
-    pub fn alloc_specific(&self, len: usize, base_pfn: u64) -> Option<DmaBuffer> {
+    // Allocates the specific pages requested by base_pfn and len. Fails if any of those
+    // pages have already been allocated.
+    pub fn alloc_specific(&self, len: usize, base_pfn: usize) -> Option<DmaBuffer> {
         assert!(len % PAGE_SIZE == 0);
+
         let count = len / PAGE_SIZE;
-        let start_page = base_pfn as usize;
+        if base_pfn + count > self.len {
+            return None; // Not enough space
+        }
 
-        let pages = (start_page..start_page + count).map(|p| p as u64).collect();
+        // Check each requested page and fail if any is already mapped.
+        {
+            let mut state = self.state.lock();
+            let mut i = base_pfn;
+            while i < base_pfn + count {
+                if state[i / 64] & (1 << (i % 64)) != 0 {
+                    return None;
+                }
+                i += 1;
+            }
+            // Set the bit for each requested PFN to mark it as used.
+            for j in base_pfn..i {
+                state[j / 64] |= 1 << (j % 64);
+            }
+        };
+
+        let pages = (base_pfn..base_pfn + count).map(|p| p as u64).collect();
         Some(DmaBuffer {
             mem: self.mem.clone(),
             pfns: pages,
@@ -301,7 +320,7 @@ impl DmaClient for EmulatedDmaAllocator {
     }
 
     fn attach_dma_buffer(&self, len: usize, base_pfn: u64) -> anyhow::Result<MemoryBlock> {
-        let memory = MemoryBlock::new( self.shared_mem.alloc_specific(len, base_pfn).context("could not alloc specific. out of memory")?);
+        let memory = MemoryBlock::new(self.shared_mem.alloc_specific(len, base_pfn.try_into().unwrap()).context("could not alloc specific. out of memory")?);
out of memory")?); Ok(memory) } } From 4c5974cd36b34b12fe698e1481b53d0ab4800b17 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Fri, 21 Feb 2025 10:54:12 -0800 Subject: [PATCH 20/22] Fixed the mismatched memory by moving the Verify RPC call to after the point we validate memory --- .../storage/disk_nvme/nvme_driver/src/queue_pair.rs | 10 +++------- vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs | 5 +++-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs index 7625dd2c9b..35f753df03 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs @@ -341,9 +341,6 @@ impl QueuePair { /// Input memory block should already be constructed from the offsets. #[cfg(test)] pub(crate) async fn verify_restore(&self, saved_state: &QueuePairSavedState, saved_mem: MemoryBlock) { - // Entire memory region is checked below. No need for the the handler to check it again. - let _ = self.issuer.send.call(Req::Verify, saved_state.handler_data.clone()).await; - // `cancel` and `issuers` params are runtime parameters so we don't check underlying values. let mut saved_mem_data: [u8; PAGE_SIZE] = [0; PAGE_SIZE]; let mut self_mem_data: [u8; PAGE_SIZE] = [0; PAGE_SIZE]; @@ -357,13 +354,12 @@ impl QueuePair { self.mem.read_at(pfn * PAGE_SIZE, &mut self_mem_data); for i in 0..PAGE_SIZE { - // assert_eq!(saved_mem_data[i], self_mem_data[i]); - if saved_mem_data[i] != self_mem_data[i] { - println!("BYTES NOT THE SAME PFN={} ADDRESS={} LEFT={} RIGHT={}", pfn, i, saved_mem_data[i], self_mem_data[i]); - } + assert_eq!(saved_mem_data[i], self_mem_data[i]); } } + // Entire memory region checked above. No need to send it along with the verify call + let _ = self.issuer.send.call(Req::Verify, saved_state.handler_data.clone()).await; assert_eq!(saved_state.qid, self.qid); assert_eq!(saved_state.sq_entries, self.sq_entries); assert_eq!(saved_state.cq_entries, self.cq_entries); diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs index bdff0a2777..701b4711da 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs @@ -331,13 +331,14 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) { } // ====== SECOND DRIVER INIT & VERIFY ===== - let new_device = NvmeTestEmulatedDevice::new(new_nvme_ctrl, new_msi_x, mem_new.clone()); + + let new_device = EmulatedDevice::new(new_nvme_ctrl, new_msi_x, mem_new.clone()); let mut new_nvme_driver = NvmeDriver::restore(&driver_source, CPU_COUNT, new_device, &saved_state) .await .unwrap(); // Verify restore functions will panic if verification failed. 
-    new_nvme_driver.verify_restore(&saved_state, mem_block).await;
+    new_nvme_driver.verify_restore(&saved_state, mem_block.clone()).await;
 }
 
 #[derive(Inspect)]

From 0d1423b37f06d2b25d249bda349cb2b68008d23f Mon Sep 17 00:00:00 2001
From: Guramrit Singh
Date: Fri, 21 Feb 2025 13:26:01 -0800
Subject: [PATCH 21/22] Prettified the tests

---
 .../disk_nvme/nvme_driver/src/queue_pair.rs |   1 +
 .../disk_nvme/nvme_driver/src/tests.rs      | 115 +++++++++---------
 2 files changed, 56 insertions(+), 60 deletions(-)

diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
index 35f753df03..7a18705cda 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
@@ -359,6 +359,7 @@ impl QueuePair {
         }
 
         // Entire memory region checked above. No need to send it along with the verify call.
+        // Issue the RPC call after checking memory because the call writes to memory.
         let _ = self.issuer.send.call(Req::Verify, saved_state.handler_data.clone()).await;
         assert_eq!(saved_state.qid, self.qid);
         assert_eq!(saved_state.sq_entries, self.sq_entries);
diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
index 701b4711da..230d159b84 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
@@ -8,6 +8,7 @@ use chipset_device::pci::PciConfigSpace;
 use guid::Guid;
 use inspect::Inspect;
 use inspect::InspectMut;
+use nvme::NvmeController;
 use nvme::NvmeControllerCaps;
 use nvme_spec::nvm::DsmRange;
 use nvme_spec::Cap;
@@ -23,6 +24,7 @@ use user_driver::emulated::EmulatedDevice;
 use user_driver::emulated::EmulatedDmaAllocator;
 use user_driver::emulated::Mapping;
 use user_driver::interrupt::DeviceInterrupt;
+use user_driver::memory::MemoryBlock;
 use user_driver::memory::PAGE_SIZE;
 use user_driver::DeviceBacking;
 use user_driver::DeviceRegisterIo;
@@ -58,7 +60,7 @@ async fn test_nvme_ioqueue_max_mqes(driver: DefaultDriver) {
     let driver_source = VmTaskDriverSource::new(SingleDriverBackend::new(driver));
     let mut msi_set = MsiInterruptSet::new();
-    let nvme = nvme::NvmeController::new(
+    let nvme = NvmeController::new(
         &driver_source,
         mem.guest_memory().clone(),
         &mut msi_set,
@@ -92,7 +94,7 @@ async fn test_nvme_ioqueue_invalid_mqes(driver: DefaultDriver) {
     let driver_source = VmTaskDriverSource::new(SingleDriverBackend::new(driver));
     let mut msi_set = MsiInterruptSet::new();
-    let nvme = nvme::NvmeController::new(
+    let nvme = NvmeController::new(
         &driver_source,
         mem.guest_memory().clone(),
         &mut msi_set,
@@ -137,7 +139,7 @@ async fn test_nvme_driver(driver: DefaultDriver, allow_dma: bool) {
     let driver_source = VmTaskDriverSource::new(SingleDriverBackend::new(driver));
     let mut msi_set = MsiInterruptSet::new();
-    let nvme = nvme::NvmeController::new(
+    let nvme = NvmeController::new(
         &driver_source,
         mem.guest_memory().clone(),
         &mut msi_set,
@@ -235,27 +237,14 @@ async fn test_nvme_driver(driver: DefaultDriver, allow_dma: bool) {
 }
 
 async fn test_nvme_save_restore_inner(driver: DefaultDriver) {
-    // ===== SHARED RESOURCES =====
-    const MSIX_COUNT: u16 = 2;
-    const IO_QUEUE_COUNT: u16 = 64;
+    // ===== SETUP =====
     const CPU_COUNT: u32 = 64;
 
     let base_len = 64 * 1024 * 1024;
-    let payload_len = 4 * 1024 * 1024;
-    let mem = DeviceSharedMemory::new(base_len, payload_len);
     let driver_source = VmTaskDriverSource::new(SingleDriverBackend::new(driver.clone()));
-    let mut msi_x = MsiInterruptSet::new();
-    let nvme_ctrl = nvme::NvmeController::new(
-        &driver_source,
-        mem.guest_memory().clone(),
-        &mut msi_x,
-        &mut ExternallyManagedMmioIntercepts,
-        NvmeControllerCaps {
-            msix_count: MSIX_COUNT,
-            max_io_queues: IO_QUEUE_COUNT,
-            subsystem_id: Guid::new_random(),
-        },
-    );
+
+    // ===== FIRST DRIVER INIT, SAVE, & TEARDOWN =====
+    let (shared_mem_original, msi_x, nvme_ctrl) = create_driver_resources(base_len, 0, &driver_source);
 
     // Add a namespace so Identify Namespace command will succeed later.
     nvme_ctrl
@@ -264,9 +253,8 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) {
         .await
         .unwrap();
 
-    // ===== FIRST DRIVER INIT =====
     let saved_state = {
-        let device = EmulatedDevice::new(nvme_ctrl, msi_x, mem.clone());
+        let device = EmulatedDevice::new(nvme_ctrl, msi_x, shared_mem_original.clone());
         let mut nvme_driver = NvmeDriver::new(&driver_source, CPU_COUNT, device)
             .await
             .unwrap();
@@ -277,28 +265,13 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) {
         nvme_driver.shutdown().await;
         saved
     };
-
     // As of today we do not save namespace data to avoid possible conflict
     // when namespace has changed during servicing.
-    // TODO: Review and re-enable in future.
     assert_eq!(saved_state.namespaces.len(), 0);
 
-    // Create a second set of devices since the ownership has been moved.
-    let new_emu_mem = DeviceSharedMemory::new(base_len, payload_len);
-    let mut new_msi_x = MsiInterruptSet::new();
-    let mut new_nvme_ctrl = nvme::NvmeController::new(
-        &driver_source,
-        new_emu_mem.guest_memory().clone(),
-        &mut new_msi_x,
-        &mut ExternallyManagedMmioIntercepts,
-        NvmeControllerCaps {
-            msix_count: MSIX_COUNT,
-            max_io_queues: IO_QUEUE_COUNT,
-            subsystem_id: Guid::new_random(),
-        },
-    );
-
+    // ===== SECOND DRIVER RESOURCES AND KEEPALIVE SETUP =====
+    let (shared_mem_copy, new_msi_x, mut new_nvme_ctrl) = create_driver_resources(base_len, 0, &driver_source);
     let mut backoff = user_driver::backoff::Backoff::new(&driver);
 
     // Enable the controller for keep-alive test.
@@ -312,33 +285,55 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) {
     backoff.back_off().await;
 
     // ===== COPY MEMORY =====
-    let mem_new = DeviceSharedMemory::new(base_len, payload_len);
-    let host_allocator = EmulatedDmaAllocator::new(mem.clone());
-    let mem_block = DmaClient::attach_dma_buffer(&host_allocator, base_len, 0).unwrap();
-    {
-        // Host allocator to new memory should be dropped so the memory can be consumed by the
-        // new driver.
-        let host_allocator_new_mem = EmulatedDmaAllocator::new(mem_new.clone());
-        let mem_block_new_mem = DmaClient::attach_dma_buffer(&host_allocator_new_mem, base_len, 0).unwrap();
-
-        let mut data_transfer_buffer: [u8; PAGE_SIZE] = [0; PAGE_SIZE];
-
-        let total_pages = base_len / PAGE_SIZE;
-        for pfn in 0..total_pages {
-            mem_block.read_at(pfn * PAGE_SIZE, &mut data_transfer_buffer);
-            mem_block_new_mem.write_at(pfn * PAGE_SIZE, &data_transfer_buffer);
-        }
-    }
-
-    // ===== SECOND DRIVER INIT & VERIFY =====
-
-    let new_device = EmulatedDevice::new(new_nvme_ctrl, new_msi_x, mem_new.clone());
+    let host_allocator_original = EmulatedDmaAllocator::new(shared_mem_original.clone());
+    let mem_original = DmaClient::attach_dma_buffer(&host_allocator_original, base_len, 0).unwrap();
+    copy_mem_block(&mem_original, shared_mem_copy.clone());
+
+    // ===== SECOND DRIVER RESTORE & VERIFY =====
+    let new_device = EmulatedDevice::new(new_nvme_ctrl, new_msi_x, shared_mem_copy);
     let mut new_nvme_driver = NvmeDriver::restore(&driver_source, CPU_COUNT, new_device, &saved_state)
         .await
         .unwrap();
 
     // Verify restore functions will panic if verification failed.
-    new_nvme_driver.verify_restore(&saved_state, mem_block.clone()).await;
+    new_nvme_driver.verify_restore(&saved_state, mem_original).await;
 }
 
+/// Creates the resources required for one driver instance.
+fn create_driver_resources(base_len: usize, payload_len: usize, driver_source: &VmTaskDriverSource) -> (DeviceSharedMemory, MsiInterruptSet, NvmeController) {
+    const MSIX_COUNT: u16 = 2;
+    const IO_QUEUE_COUNT: u16 = 64;
+
+    let mem = DeviceSharedMemory::new(base_len, payload_len);
+    let mut msi_x = MsiInterruptSet::new();
+    let nvme_ctrl = NvmeController::new(
+        driver_source,
+        mem.guest_memory().clone(),
+        &mut msi_x,
+        &mut ExternallyManagedMmioIntercepts,
+        NvmeControllerCaps {
+            msix_count: MSIX_COUNT,
+            max_io_queues: IO_QUEUE_COUNT,
+            subsystem_id: Guid::new_random(),
+        },
+    );
+
+    (mem, msi_x, nvme_ctrl)
+}
+
+/// Copies the contents of mem_original into shared_mem_copy starting at PFN 0.
+/// Caller must ensure that mem_original.len() <= shared_mem_copy.capacity().
+fn copy_mem_block(mem_original: &MemoryBlock, shared_mem_copy: DeviceSharedMemory) {
+    let host_allocator_copy = EmulatedDmaAllocator::new(shared_mem_copy);
+    let mem_copy = DmaClient::attach_dma_buffer(&host_allocator_copy, mem_original.len(), 0).unwrap();
+
+    let mut data_transfer_buffer: [u8; PAGE_SIZE] = [0; PAGE_SIZE];
+
+    let total_pages = mem_original.len() / PAGE_SIZE;
+    for pfn in 0..total_pages {
+        mem_original.read_at(pfn * PAGE_SIZE, &mut data_transfer_buffer);
+        mem_copy.write_at(pfn * PAGE_SIZE, &data_transfer_buffer);
+    }
+}
 
 #[derive(Inspect)]

From 536ceed328e535e56f9dadb1b0e458c8c59db1b4 Mon Sep 17 00:00:00 2001
From: Guramrit Singh
Date: Fri, 21 Feb 2025 13:30:31 -0800
Subject: [PATCH 22/22] More prettification

---
 .../storage/disk_nvme/nvme_driver/src/tests.rs | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
index 230d159b84..3a13665e52 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
@@ -239,7 +239,6 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) {
     // ===== SETUP =====
     const CPU_COUNT: u32 = 64;
-
     let base_len = 64 * 1024 * 1024;
     let driver_source = VmTaskDriverSource::new(SingleDriverBackend::new(driver.clone()));
 
@@ -261,28 +260,22 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) {
         let _ns1 = nvme_driver.namespace(1).await.unwrap();
         let saved = nvme_driver.save().await.unwrap();
 
-        // Tear down the original driver to kill the underlying tasks.
         nvme_driver.shutdown().await;
         saved
     };
-
-    // As of today we do not save namespace data to avoid possible conflict
-    // when namespace has changed during servicing.
+    // As of today we do not save namespace data to avoid possible conflict when namespace has changed during servicing.
     assert_eq!(saved_state.namespaces.len(), 0);
 
     // ===== SECOND DRIVER RESOURCES AND KEEPALIVE SETUP =====
     let (shared_mem_copy, new_msi_x, mut new_nvme_ctrl) = create_driver_resources(base_len, 0, &driver_source);
-    let mut backoff = user_driver::backoff::Backoff::new(&driver);
 
-    // Enable the controller for keep-alive test.
+    // Read Register::CC -> set CC.EN -> wait CSTS.RDY
     let mut dword = 0u32;
-    // Read Register::CC.
     new_nvme_ctrl.read_bar0(0x14, dword.as_mut_bytes()).unwrap();
-    // Set CC.EN.
     dword |= 1;
     new_nvme_ctrl.write_bar0(0x14, dword.as_bytes()).unwrap();
-    // Wait for CSTS.RDY to set.
-    backoff.back_off().await;
+    user_driver::backoff::Backoff::new(&driver).back_off().await;
 
     // ===== COPY MEMORY =====
     let host_allocator_original = EmulatedDmaAllocator::new(shared_mem_original.clone());
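Aside (illustration only, not part of the patch series): the condensed enable sequence above issues a single back_off() after setting CC.EN instead of polling CSTS.RDY, which is evidently sufficient for the emulated controller. An explicit wait, assuming the register offsets from the NVMe spec (CC at 0x14, CSTS at 0x1c) and the same read_bar0/as_mut_bytes calls the test already uses, would look like:

    let mut backoff = user_driver::backoff::Backoff::new(&driver);
    let mut csts = 0u32;
    loop {
        // CSTS.RDY is bit 0 of the controller status register.
        new_nvme_ctrl.read_bar0(0x1c, csts.as_mut_bytes()).unwrap();
        if csts & 1 != 0 {
            break;
        }
        backoff.back_off().await;
    }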