Rework VMM reservoir sizing to scale better with memory configurations #7837

Status: Open · wants to merge 2 commits into base: main · Showing changes from 1 commit
13 changes: 10 additions & 3 deletions sled-agent/src/config.rs
@@ -58,10 +58,17 @@ pub struct Config {
// TODO: Remove once this can be auto-detected.
pub sidecar_revision: SidecarRevision,
/// Optional percentage of DRAM to reserve for guest memory
pub vmm_reservoir_percentage: Option<u8>,
pub vmm_reservoir_percentage: Option<f32>,
/// Optional DRAM to reserve for guest memory in MiB (mutually exclusive
/// option with vmm_reservoir_percentage).
pub vmm_reservoir_size_mb: Option<u32>,
/// Amount of memory to set aside in anticipation of use for services that
/// will have roughly constant memory use. These are services that may have
/// zero to one instances on a given sled - internal DNS, MGS, Nexus,
/// ClickHouse, and so on. For a sled that happens to not run these kinds of
/// control plane services, this memory is "wasted", but ensures the sled
/// could run those services if reconfiguration desired it.
pub control_plane_memory_earmark_mb: Option<u32>,
/// Optional swap device size in GiB
pub swap_device_size_gb: Option<u32>,
/// Optional VLAN ID to be used for tagging guest VNICs.
@@ -181,8 +188,8 @@ mod test {
let entry = entry.unwrap();
if entry.file_name() == "config.toml" {
let path = entry.path();
Config::from_file(&path).unwrap_or_else(|_| {
panic!("Failed to parse config {path}")
Config::from_file(&path).unwrap_or_else(|e| {
panic!("Failed to parse config {path}: {e}")
});
configs_seen += 1;
}
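To make the relationship between these three config fields concrete, here is a minimal, hypothetical sketch (not the PR's exact code) of how `ReservoirMode::from_config` in sled-agent/src/vmm_reservoir.rs, further down in this diff, can treat the two mutually exclusive reservoir options, consistent with the doc comments above:

```rust
// Hypothetical sketch only; the real match arms are elided in the diff below.
#[derive(Debug, Clone, Copy)]
enum ReservoirMode {
    Size(u32),
    Percentage(f32),
}

fn from_config(percentage: Option<f32>, size_mb: Option<u32>) -> Option<ReservoirMode> {
    match (percentage, size_mb) {
        // Neither option set: no reservoir is configured.
        (None, None) => None,
        (Some(pct), None) => Some(ReservoirMode::Percentage(pct)),
        (None, Some(mb)) => Some(ReservoirMode::Size(mb)),
        // The options are mutually exclusive, so setting both is a
        // configuration error ("Panic upon invalid configuration").
        (Some(_), Some(_)) => panic!(
            "only one of vmm_reservoir_percentage or vmm_reservoir_size_mb may be set"
        ),
    }
}

fn main() {
    // e.g. the gimlet-standalone config in this PR sets only the percentage.
    assert!(matches!(from_config(Some(86.3), None), Some(ReservoirMode::Percentage(_))));
}
```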
18 changes: 12 additions & 6 deletions sled-agent/src/sled_agent.rs
@@ -69,7 +69,7 @@ use sled_agent_types::zone_bundle::{
PriorityOrder, StorageLimit, ZoneBundleMetadata,
};
use sled_diagnostics::{SledDiagnosticsCmdError, SledDiagnosticsCmdOutput};
use sled_hardware::{HardwareManager, underlay};
use sled_hardware::{HardwareManager, MemoryReservations, underlay};
use sled_hardware_types::Baseboard;
use sled_hardware_types::underlay::BootstrapInterface;
use sled_storage::manager::StorageHandle;
@@ -495,18 +495,24 @@ impl SledAgent {
*sled_address.ip(),
);

// The VMM reservoir is configured with respect to what's left after
// accounting for relatively fixed and predictable uses.
// We expect certain amounts of memory to be set aside for kernel,
// buffer, or control plane uses.
let memory_sizes = MemoryReservations::new(
long_running_task_handles.hardware_manager.clone(),
config.control_plane_memory_earmark_mb,
);

// Configure the VMM reservoir as either a percentage of DRAM or as an
// exact size in MiB.
let reservoir_mode = ReservoirMode::from_config(
config.vmm_reservoir_percentage,
config.vmm_reservoir_size_mb,
);

let vmm_reservoir_manager = VmmReservoirManager::spawn(
&log,
long_running_task_handles.hardware_manager.clone(),
reservoir_mode,
);
let vmm_reservoir_manager =
VmmReservoirManager::spawn(&log, memory_sizes, reservoir_mode);

let instances = InstanceManager::new(
parent_log.clone(),
56 changes: 24 additions & 32 deletions sled-agent/src/vmm_reservoir.rs
@@ -12,7 +12,7 @@ use std::sync::atomic::{AtomicU64, Ordering};
use std::thread;
use tokio::sync::{broadcast, oneshot};

use sled_hardware::HardwareManager;
use sled_hardware::MemoryReservations;

#[derive(thiserror::Error, Debug)]
pub enum Error {
@@ -35,7 +35,7 @@ pub enum Error {
#[derive(Debug, Clone, Copy)]
pub enum ReservoirMode {
Size(u32),
Percentage(u8),
Percentage(f32),
}

impl ReservoirMode {
@@ -44,7 +44,7 @@ impl ReservoirMode {
///
/// Panic upon invalid configuration
pub fn from_config(
percentage: Option<u8>,
percentage: Option<f32>,
size_mb: Option<u32>,
) -> Option<ReservoirMode> {
match (percentage, size_mb) {
@@ -135,6 +135,7 @@ impl VmmReservoirManagerHandle {

/// Manage the VMM reservoir in a background thread
pub struct VmmReservoirManager {
memory_reservations: MemoryReservations,
reservoir_size: Arc<AtomicU64>,
rx: flume::Receiver<ReservoirManagerMsg>,
size_updated_tx: broadcast::Sender<()>,
@@ -146,7 +147,7 @@ pub struct VmmReservoirManager {
impl VmmReservoirManager {
pub fn spawn(
log: &Logger,
hardware_manager: HardwareManager,
memory_reservations: sled_hardware::MemoryReservations,
reservoir_mode: Option<ReservoirMode>,
) -> VmmReservoirManagerHandle {
let log = log.new(o!("component" => "VmmReservoirManager"));
@@ -157,15 +158,15 @@
let (tx, rx) = flume::bounded(0);
let reservoir_size = Arc::new(AtomicU64::new(0));
let manager = VmmReservoirManager {
memory_reservations,
reservoir_size: reservoir_size.clone(),
size_updated_tx: size_updated_tx.clone(),
_size_updated_rx,
rx,
log,
};
let _manager_handle = Arc::new(thread::spawn(move || {
manager.run(hardware_manager, reservoir_mode)
}));
let _manager_handle =
Arc::new(thread::spawn(move || manager.run(reservoir_mode)));
VmmReservoirManagerHandle {
reservoir_size,
tx,
@@ -174,31 +175,26 @@
}
}

fn run(
self,
hardware_manager: HardwareManager,
reservoir_mode: Option<ReservoirMode>,
) {
fn run(self, reservoir_mode: Option<ReservoirMode>) {
match reservoir_mode {
None => warn!(self.log, "Not using VMM reservoir"),
Some(ReservoirMode::Size(0))
| Some(ReservoirMode::Percentage(0)) => {
| Some(ReservoirMode::Percentage(0.0)) => {
warn!(
self.log,
"Not using VMM reservoir (size 0 bytes requested)"
)
}
Some(mode) => {
if let Err(e) = self.set_reservoir_size(&hardware_manager, mode)
{
if let Err(e) = self.set_reservoir_size(mode) {
error!(self.log, "Failed to setup VMM reservoir: {e}");
}
}
}

while let Ok(msg) = self.rx.recv() {
let ReservoirManagerMsg::SetReservoirSize { mode, reply_tx } = msg;
match self.set_reservoir_size(&hardware_manager, mode) {
match self.set_reservoir_size(mode) {
Ok(()) => {
let _ = reply_tx.send(Ok(()));
}
@@ -213,33 +209,28 @@
/// Sets the VMM reservoir to the requested percentage of usable physical
/// RAM or to a size in MiB. Either mode will round down to the nearest
/// aligned size required by the control plane.
fn set_reservoir_size(
&self,
hardware: &sled_hardware::HardwareManager,
mode: ReservoirMode,
) -> Result<(), Error> {
let hardware_physical_ram_bytes = hardware.usable_physical_ram_bytes();
fn set_reservoir_size(&self, mode: ReservoirMode) -> Result<(), Error> {
let vmm_eligible_memory = self.memory_reservations.vmm_eligible();
let req_bytes = match mode {
ReservoirMode::Size(mb) => {
let bytes = ByteCount::from_mebibytes_u32(mb).to_bytes();
if bytes > hardware_physical_ram_bytes {
if bytes > vmm_eligible_memory {
return Err(Error::ReservoirConfig(format!(
"cannot specify a reservoir of {bytes} bytes when \
physical memory is {hardware_physical_ram_bytes} bytes",
"cannot specify a reservoir of {bytes} bytes when the \
maximum reservoir size is {vmm_eligible_memory} bytes",
)));
}
bytes
}
ReservoirMode::Percentage(percent) => {
if !matches!(percent, 1..=99) {
if !matches!(percent, 0.1..100.0) {
return Err(Error::ReservoirConfig(format!(
"VMM reservoir percentage of {} must be between 0 and \
100",
percent
)));
};
(hardware_physical_ram_bytes as f64
* (f64::from(percent) / 100.0))
(vmm_eligible_memory as f64 * (f64::from(percent) / 100.0))
.floor() as u64
}
};
@@ -258,15 +249,16 @@
}

// The max ByteCount value is i64::MAX, which is ~8 million TiB.
// As this value is either a percentage of DRAM or a size in MiB
// represented as a u32, constructing this should always work.
// As this value is either a percentage of otherwise-unbudgeted DRAM or
// a size in MiB represented as a u32, constructing this should always
// work.
let reservoir_size = ByteCount::try_from(req_bytes_aligned).unwrap();
if let ReservoirMode::Percentage(percent) = mode {
info!(
self.log,
"{}% of {} physical ram = {} bytes)",
"{}% of {} VMM eligible ram = {} bytes)",
percent,
hardware_physical_ram_bytes,
vmm_eligible_memory,
req_bytes,
);
}
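As a standalone illustration of the percentage-mode arithmetic in `set_reservoir_size` above, the following sketch shows how a percentage of the VMM-eligible memory becomes an aligned byte count; the 2 MiB alignment constant here is an assumption for illustration, not the PR's actual granularity:

```rust
/// Sketch of the percentage-mode sizing math, for illustration only.
/// `ALIGN` is an assumed 2 MiB granularity, not the constant the PR uses.
fn sketch_reservoir_bytes(vmm_eligible: u64, percent: f32) -> u64 {
    const ALIGN: u64 = 2 * 1024 * 1024;
    // Take the requested fraction of the VMM-eligible memory, rounded down.
    let requested = (vmm_eligible as f64 * (f64::from(percent) / 100.0)).floor() as u64;
    // Align down so the result never exceeds what was requested.
    requested - (requested % ALIGN)
}

fn main() {
    // Hypothetical sled with ~950 GiB of VMM-eligible memory at 86.3%.
    let eligible: u64 = 950 * 1024 * 1024 * 1024;
    println!("reservoir: {} bytes", sketch_reservoir_bytes(eligible, 86.3));
}
```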
7 changes: 7 additions & 0 deletions sled-hardware/src/illumos/mod.rs
@@ -205,6 +205,7 @@ struct HardwareView {
disks: HashMap<DiskIdentity, UnparsedDisk>,
baseboard: Option<Baseboard>,
online_processor_count: u32,
usable_physical_pages: u64,
usable_physical_ram_bytes: u64,
}

@@ -220,6 +221,7 @@ impl HardwareView {
disks: HashMap::new(),
baseboard: None,
online_processor_count: sysconf::online_processor_count()?,
usable_physical_pages: sysconf::usable_physical_pages()?,
usable_physical_ram_bytes: sysconf::usable_physical_ram_bytes()?,
})
}
@@ -230,6 +232,7 @@
disks: HashMap::new(),
baseboard: None,
online_processor_count: sysconf::online_processor_count()?,
usable_physical_pages: sysconf::usable_physical_pages()?,
usable_physical_ram_bytes: sysconf::usable_physical_ram_bytes()?,
})
}
@@ -798,6 +801,10 @@ impl HardwareManager {
self.inner.lock().unwrap().online_processor_count
}

pub fn usable_physical_pages(&self) -> u64 {
self.inner.lock().unwrap().usable_physical_pages
}

pub fn usable_physical_ram_bytes(&self) -> u64 {
self.inner.lock().unwrap().usable_physical_ram_bytes
}
15 changes: 11 additions & 4 deletions sled-hardware/src/illumos/sysconf.rs
@@ -25,14 +25,21 @@ pub fn online_processor_count() -> Result<u32, Error> {
Ok(u32::try_from(res)?)
}

/// Returns the amount of RAM on this sled, in bytes.
pub fn usable_physical_ram_bytes() -> Result<u64, Error> {
let phys_pages: u64 = illumos_utils::libc::sysconf(libc::_SC_PHYS_PAGES)
/// Returns the number of physical RAM pages on this sled.
pub fn usable_physical_pages() -> Result<u64, Error> {
let pages = illumos_utils::libc::sysconf(libc::_SC_PHYS_PAGES)
.map_err(|e| Error::Sysconf { arg: "physical pages", e })?
.try_into()?;
Ok(pages)
}

/// Returns the amount of RAM on this sled, in bytes.
pub fn usable_physical_ram_bytes() -> Result<u64, Error> {
let page_size: u64 = illumos_utils::libc::sysconf(libc::_SC_PAGESIZE)
.map_err(|e| Error::Sysconf { arg: "physical page size", e })?
.try_into()?;

Ok(phys_pages * page_size)
// XXX: if we eventually have pages with mixed sizes, this may be wrong!
// I'm not even sure how we'd calculate this in such a world!
Ok(usable_physical_pages()? * page_size)
}
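For intuition, the two sysconf values compose multiplicatively; a small sketch with hypothetical numbers (4 KiB pages assumed):

```rust
fn main() {
    // Hypothetical values: 268,435,456 usable pages of 4096 bytes each.
    let usable_physical_pages: u64 = 268_435_456; // _SC_PHYS_PAGES
    let page_size: u64 = 4096; // _SC_PAGESIZE
    let usable_physical_ram_bytes = usable_physical_pages * page_size;
    assert_eq!(usable_physical_ram_bytes, 1u64 << 40); // exactly 1 TiB
}
```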
71 changes: 71 additions & 0 deletions sled-hardware/src/lib.rs
@@ -75,3 +75,74 @@ pub enum SledMode {
/// Force sled to run as a Scrimlet
Scrimlet { asic: DendriteAsic },
}

/// Accounting for high watermark memory usage for various system purposes
#[derive(Copy, Clone, Debug)]
pub struct MemoryReservations {
/// Installed physical memory in this sled. Probably should hold a
/// [`HardwareManager`] and call `usable_physical_ram_bytes()` instead of
/// this.
hardware_physical_ram_bytes: u64,
/// The amount of memory expected to be used if "control plane" services were
/// all running on this sled. "control plane" here refers to services that have
/// roughly fixed memory use given differing sled hardware configurations.
/// DNS (internal, external), Nexus, Cockroach, or ClickHouse are all
/// examples of "control plane" here.
///
/// This is a pessimistic overestimate; it is unlikely
/// (and one might say undesirable) that all such services are colocated on
/// a sled, and (as described in RFD 413) the budgeting for each service's
/// RAM must include headroom for those services potentially forking and
/// bursting required swap or resident pages.
//
// XXX: This is really something we should be told by Nexus, perhaps after
// starting with this conservative estimate to get the sled started.
control_plane_earmark_bytes: u64,
/// The amount of memory used for `page_t` structures, assuming a distinct
/// `page_t` for each 4k page in the system. If we use larger pages, like
/// 2 MiB pages, this will potentially be a gross overestimate.
max_page_t_space: u64,
// XXX: Crucible involves some amount of memory in support of the volumes it
// manages. We should collect zpool size and estimate the memory that would
// be used if all available storage was dedicated to Crucible volumes. For
// now this is part of the control plane earmark.
}

impl MemoryReservations {
pub fn new(
hardware_manager: HardwareManager,
control_plane_earmark_mib: Option<u32>,
) -> MemoryReservations {
let hardware_physical_ram_bytes =
hardware_manager.usable_physical_ram_bytes();
// Don't like hardcoding a struct size from the host OS here like
// this; maybe we should shuffle some bits around before merging. On the
// other hand, the last time page_t changed was illumos-gate commit
// a5652762e5 from 2006.
const PAGE_T_SIZE: u64 = 120;
let max_page_t_space =
hardware_manager.usable_physical_pages() * PAGE_T_SIZE;

const MIB: u64 = 1024 * 1024;
let control_plane_earmark_bytes =
u64::from(control_plane_earmark_mib.unwrap_or(0)) * MIB;

Self {
hardware_physical_ram_bytes,
max_page_t_space,
control_plane_earmark_bytes,
}
}

/// Compute the amount of physical memory that could be set aside for the
/// VMM reservoir.
///
/// The actual VMM reservoir will be smaller than this amount, and is either
/// a fixed amount of memory specified by `ReservoirMode::Size` or
/// a percentage of this amount specified by `ReservoirMode::Percentage`.
pub fn vmm_eligible(&self) -> u64 {
self.hardware_physical_ram_bytes
- self.max_page_t_space
- self.control_plane_earmark_bytes
}
}
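A worked example of this accounting, using a hypothetical 1 TiB sled with 4 KiB pages and the 44 GiB earmark from the gimlet-standalone config below, shows why that config moves from 80% of all DRAM to 86.3% of the VMM-eligible remainder while landing on roughly the same reservoir size:

```rust
fn main() {
    const GIB: u64 = 1024 * 1024 * 1024;
    // Hypothetical sled: 1 TiB of usable physical RAM, 4 KiB pages.
    let hardware_physical_ram_bytes: u64 = 1024 * GIB;
    let usable_physical_pages = hardware_physical_ram_bytes / 4096;
    // One 120-byte page_t per page: exactly 30 GiB on this sled.
    let max_page_t_space = usable_physical_pages * 120;
    // 45056 MiB earmark from the config below, i.e. 44 GiB.
    let control_plane_earmark_bytes: u64 = 45056 * 1024 * 1024;
    let vmm_eligible = hardware_physical_ram_bytes
        - max_page_t_space
        - control_plane_earmark_bytes;
    // Old scheme: 80% of all DRAM ~= 819.2 GiB.
    // New scheme: 86.3% of the ~950 GiB eligible remainder ~= 819.9 GiB.
    println!("old: {:.1} GiB", hardware_physical_ram_bytes as f64 * 0.80 / GIB as f64);
    println!("new: {:.1} GiB", vmm_eligible as f64 * 0.863 / GIB as f64);
}
```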
4 changes: 4 additions & 0 deletions sled-hardware/src/non_illumos/mod.rs
@@ -45,6 +45,10 @@ impl HardwareManager {
unimplemented!("Accessing hardware unsupported on non-illumos");
}

pub fn usable_physical_pages(&self) -> u64 {
unimplemented!("Accessing hardware unsupported on non-illumos");
}

pub fn usable_physical_ram_bytes(&self) -> u64 {
unimplemented!("Accessing hardware unsupported on non-illumos");
}
14 changes: 13 additions & 1 deletion smf/sled-agent/gimlet-standalone/config.toml
@@ -20,7 +20,19 @@ skip_timesync = true

# Percentage of usable physical DRAM to use for the VMM reservoir, which
# guest memory is pulled from.
vmm_reservoir_percentage = 80
vmm_reservoir_percentage = 86.3
# The amount of memory held back for services of which zero or one instance
# may run on this Gimlet. This currently includes some additional terms
# reflecting OS memory use under load.
#
# As of writing, this is the sum of the following items from RFD 413:
# * Network buffer slush: 18 GiB
# * Other kernel heap: 20 GiB
# * ZFS ARC minimum: 5 GiB
# * Sled agent: 0.5 GiB
# * Maghemite: 0.25 GiB
# * NTP: 0.25 GiB
control_plane_memory_earmark_mb = 45056
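As a quick check on that figure, the items listed above sum to 44 GiB, and the setting is expressed in MiB; an illustrative one-liner (editorial sketch, not part of the PR):

```rust
fn main() {
    // RFD 413 items from the comment above, in GiB.
    let items_gib = [18.0, 20.0, 5.0, 0.5, 0.25, 0.25];
    let total_gib: f64 = items_gib.iter().sum();
    assert_eq!(total_gib, 44.0);
    assert_eq!((total_gib * 1024.0) as u64, 45056); // control_plane_memory_earmark_mb
}
```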
Comment on lines +23 to +35
@iximeow (Member, Author), Mar 19, 2025:

One obvious way this is not quite right: ClickHouse, Cockroach, DNS, and Oximeter are all missing here, as are Dendrite and Wicket, so this misses the premise of "budget enough memory that if we have to move a control plane service here, we don't have to evict a VM to do it". I think the "earmark" amount should be closer to 76 GiB given earlier measurements, and the VMM reservoir percentage updated to around 89%.

@iximeow (Member, Author), Mar 21, 2025:

From talking with @faithanalog earlier, it looks like Crucible's KiB-per-extent as I see in https://github.com/oxidecomputer/crucible/runs/39057809960 (~91 KiB/extent) is a lower bound, whereas she sees as much as 225 KiB/extent. That's around 58 GiB of variance all told.

So, trying to avoid swapping with everything running on a sled here would have us wanting as much as 139 GiB set aside for the control plane (95 GiB of Crucibles, 20 GiB of other kernel heap, 18 GiB for expected NIC buffers, the ARC minimum size, and the one-per-sled services), with another up-to-40 GiB of services that are only sometimes present, like databases and DNS. That in turn would have us sizing the VMM reservoir at around 95% of what's left to keep the actual reservoir size the same, which should be fine as long as no one is making hundreds of 512 MiB instances...

My inclination at this point is that we could really dial things in as they are today, but we'd end up more brittle if anything changes in the future. We'd be better off connecting the "expected fixed use" term to what the control plane knows a sled should be running.


# Swap device size for the system. The device is a sparsely allocated zvol on
# the internal zpool of the M.2 that we booted from.