
Commit a72dbc6

Rework VMM reservoir sizing to scale better with memory configurations
The core observation of this change is that some uses of memory are relatively fixed regardless of a sled's hardware configuration. By subtracting these more constrained uses of memory before calculating a VMM reservoir size, the remaining memory is used mostly for services that scale either with the amount of physical memory or the amount of storage installed.

The new `control_plane_memory_earmark_mb` setting for sled-agent describes the sum of this fixed allocation, and existing sled-agent config.toml files are updated so that actual VMM reservoir sizes for Gimlets with 1 TiB of installed memory are about the same:

Before: `1012 * 0.8 => 809.6 GiB` of VMM reservoir
After: `(1012 - 30 - 44) * 0.863 => 809.494 GiB` of VMM reservoir

A Gimlet with 2 TiB of DRAM sees a larger VMM reservoir:

Before: `2048 * 0.8 => 1638.4 GiB` of VMM reservoir
After: `(2048 - 60 - 44) * 0.863 => 1677.672 GiB` of VMM reservoir

A Gimlet with less than 1 TiB of DRAM would see a smaller VMM reservoir, but this is in some sense correct: we would otherwise "overprovision" the VMM reservoir and eat into what is currently effectively a slush fund of memory for the Oxide services supporting the rack's operation, risking overall system stability, judging from observation and testing on systems with 1 TiB Gimlets.

A useful additional step in the direction of "config that is workable across SKUs" would be to measure Crucible overhead as a function of the number of disks or total installed storage. Then we could calculate the VMM reservoir after subtracting the maximum memory expected to be used by Crucible if all storage were allocated, and have a presumably higher VMM reservoir percentage for the yet-smaller slice of system memory that is not otherwise accounted for.

Fixes #7448.
1 parent: 63d0dc7
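To make the before/after arithmetic above concrete, here is a small illustrative sketch (not code from this commit): the 30 GiB and 60 GiB terms approximate `page_t` overhead, the 44 GiB term is the control plane earmark, and all function names are made up for the example.

    /// Old sizing: a flat percentage of all usable physical DRAM.
    fn old_reservoir_gib(usable_ram_gib: f64) -> f64 {
        usable_ram_gib * 0.80
    }

    /// New sizing: subtract the roughly-fixed uses first, then take a larger
    /// percentage of what remains. `page_t_gib` scales with installed memory;
    /// `earmark_gib` is the fixed control plane earmark from config.
    fn new_reservoir_gib(usable_ram_gib: f64, page_t_gib: f64, earmark_gib: f64) -> f64 {
        (usable_ram_gib - page_t_gib - earmark_gib) * 0.863
    }

    fn main() {
        // 1 TiB sled: ~1012 GiB usable, ~30 GiB of page_t overhead, 44 GiB earmark.
        println!("{:.3}", old_reservoir_gib(1012.0)); // 809.600
        println!("{:.3}", new_reservoir_gib(1012.0, 30.0, 44.0)); // 809.494
        // 2 TiB sled: ~2048 GiB usable, ~60 GiB of page_t overhead, same earmark.
        println!("{:.3}", old_reservoir_gib(2048.0)); // 1638.400
        println!("{:.3}", new_reservoir_gib(2048.0, 60.0, 44.0)); // 1677.672
    }

Both approaches land within a fraction of a GiB of each other at 1 TiB, which is the point of retuning the percentage to 86.3 alongside the new earmark.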

10 files changed (+179, -41)


sled-agent/src/config.rs (+7)

@@ -62,6 +62,13 @@ pub struct Config {
     /// Optional DRAM to reserve for guest memory in MiB (mutually exclusive
     /// option with vmm_reservoir_percentage).
     pub vmm_reservoir_size_mb: Option<u32>,
+    /// Amount of memory to set aside in anticipation of use for services that
+    /// will have roughly constant memory use. These are services that may have
+    /// zero to one instances on a given sled - internal DNS, MGS, Nexus,
+    /// ClickHouse, and so on. For a sled that happens to not run these kinds of
+    /// control plane services, this memory is "wasted", but ensures the sled
+    /// could run those services if reconfiguration desired it.
+    pub control_plane_memory_earmark_mb: Option<u32>,
     /// Optional swap device size in GiB
     pub swap_device_size_gb: Option<u32>,
     /// Optional VLAN ID to be used for tagging guest VNICs.

sled-agent/src/sled_agent.rs (+12, -6)

@@ -69,7 +69,7 @@ use sled_agent_types::zone_bundle::{
     PriorityOrder, StorageLimit, ZoneBundleMetadata,
 };
 use sled_diagnostics::{SledDiagnosticsCmdError, SledDiagnosticsCmdOutput};
-use sled_hardware::{HardwareManager, underlay};
+use sled_hardware::{HardwareManager, MemoryReservations, underlay};
 use sled_hardware_types::Baseboard;
 use sled_hardware_types::underlay::BootstrapInterface;
 use sled_storage::manager::StorageHandle;
@@ -495,18 +495,24 @@ impl SledAgent {
             *sled_address.ip(),
         );
 
+        // The VMM reservoir is configured with respect to what's left after
+        // accounting for relatively fixed and predictable uses.
+        // We expect certain amounts of memory to be set aside for kernel,
+        // buffer, or control plane uses.
+        let memory_sizes = MemoryReservations::new(
+            long_running_task_handles.hardware_manager.clone(),
+            config.control_plane_memory_earmark_mb,
+        );
+
         // Configure the VMM reservoir as either a percentage of DRAM or as an
         // exact size in MiB.
         let reservoir_mode = ReservoirMode::from_config(
             config.vmm_reservoir_percentage,
             config.vmm_reservoir_size_mb,
         );
 
-        let vmm_reservoir_manager = VmmReservoirManager::spawn(
-            &log,
-            long_running_task_handles.hardware_manager.clone(),
-            reservoir_mode,
-        );
+        let vmm_reservoir_manager =
+            VmmReservoirManager::spawn(&log, memory_sizes, reservoir_mode);
 
         let instances = InstanceManager::new(
             parent_log.clone(),

sled-agent/src/vmm_reservoir.rs (+25, -28)

@@ -12,7 +12,7 @@ use std::sync::atomic::{AtomicU64, Ordering};
 use std::thread;
 use tokio::sync::{broadcast, oneshot};
 
-use sled_hardware::HardwareManager;
+use sled_hardware::MemoryReservations;
 
 #[derive(thiserror::Error, Debug)]
 pub enum Error {
@@ -135,6 +135,7 @@ impl VmmReservoirManagerHandle {
 
 /// Manage the VMM reservoir in a background thread
 pub struct VmmReservoirManager {
+    memory_reservations: MemoryReservations,
     reservoir_size: Arc<AtomicU64>,
     rx: flume::Receiver<ReservoirManagerMsg>,
     size_updated_tx: broadcast::Sender<()>,
@@ -146,7 +147,7 @@ pub struct VmmReservoirManager {
 impl VmmReservoirManager {
     pub fn spawn(
         log: &Logger,
-        hardware_manager: HardwareManager,
+        memory_reservations: sled_hardware::MemoryReservations,
         reservoir_mode: Option<ReservoirMode>,
     ) -> VmmReservoirManagerHandle {
         let log = log.new(o!("component" => "VmmReservoirManager"));
@@ -157,15 +158,15 @@ impl VmmReservoirManager {
         let (tx, rx) = flume::bounded(0);
         let reservoir_size = Arc::new(AtomicU64::new(0));
         let manager = VmmReservoirManager {
+            memory_reservations,
             reservoir_size: reservoir_size.clone(),
             size_updated_tx: size_updated_tx.clone(),
             _size_updated_rx,
             rx,
             log,
         };
-        let _manager_handle = Arc::new(thread::spawn(move || {
-            manager.run(hardware_manager, reservoir_mode)
-        }));
+        let _manager_handle =
+            Arc::new(thread::spawn(move || manager.run(reservoir_mode)));
         VmmReservoirManagerHandle {
             reservoir_size,
             tx,
@@ -174,11 +175,7 @@ impl VmmReservoirManager {
         }
     }
 
-    fn run(
-        self,
-        hardware_manager: HardwareManager,
-        reservoir_mode: Option<ReservoirMode>,
-    ) {
+    fn run(self, reservoir_mode: Option<ReservoirMode>) {
         match reservoir_mode {
             None => warn!(self.log, "Not using VMM reservoir"),
             Some(ReservoirMode::Size(0))
@@ -189,16 +186,15 @@ impl VmmReservoirManager {
                 )
             }
             Some(mode) => {
-                if let Err(e) = self.set_reservoir_size(&hardware_manager, mode)
-                {
+                if let Err(e) = self.set_reservoir_size(mode) {
                     error!(self.log, "Failed to setup VMM reservoir: {e}");
                 }
             }
         }
 
         while let Ok(msg) = self.rx.recv() {
             let ReservoirManagerMsg::SetReservoirSize { mode, reply_tx } = msg;
-            match self.set_reservoir_size(&hardware_manager, mode) {
+            match self.set_reservoir_size(mode) {
                 Ok(()) => {
                     let _ = reply_tx.send(Ok(()));
                 }
@@ -213,19 +209,20 @@ impl VmmReservoirManager {
     /// Sets the VMM reservoir to the requested percentage of usable physical
     /// RAM or to a size in MiB. Either mode will round down to the nearest
     /// aligned size required by the control plane.
-    fn set_reservoir_size(
-        &self,
-        hardware: &sled_hardware::HardwareManager,
-        mode: ReservoirMode,
-    ) -> Result<(), Error> {
-        let hardware_physical_ram_bytes = hardware.usable_physical_ram_bytes();
+    fn set_reservoir_size(&self, mode: ReservoirMode) -> Result<(), Error> {
+        let vmm_eligible_memory = self.memory_reservations.vmm_eligible();
+        /*
+        let control_plane_earmark = get that from somewhere;
+        let disks_earmark = get that from somewhere;
+        let vmm_eligible_memory = hardware_physical_ram_bytes - control_plane_earmark - disks_earmark;
+        */
         let req_bytes = match mode {
             ReservoirMode::Size(mb) => {
                 let bytes = ByteCount::from_mebibytes_u32(mb).to_bytes();
-                if bytes > hardware_physical_ram_bytes {
+                if bytes > vmm_eligible_memory {
                     return Err(Error::ReservoirConfig(format!(
-                        "cannot specify a reservoir of {bytes} bytes when \
-                        physical memory is {hardware_physical_ram_bytes} bytes",
+                        "cannot specify a reservoir of {bytes} bytes when the \
+                        maximum reservoir size is {vmm_eligible_memory} bytes",
                     )));
                 }
                 bytes
@@ -238,8 +235,7 @@ impl VmmReservoirManager {
                         percent
                     )));
                 };
-                (hardware_physical_ram_bytes as f64
-                    * (f64::from(percent) / 100.0))
+                (vmm_eligible_memory as f64 * (f64::from(percent) / 100.0))
                     .floor() as u64
             }
         };
@@ -258,15 +254,16 @@ impl VmmReservoirManager {
         }
 
         // The max ByteCount value is i64::MAX, which is ~8 million TiB.
-        // As this value is either a percentage of DRAM or a size in MiB
-        // represented as a u32, constructing this should always work.
+        // As this value is either a percentage of otherwise-unbudgeted DRAM or
+        // a size in MiB represented as a u32, constructing this should always
+        // work.
        let reservoir_size = ByteCount::try_from(req_bytes_aligned).unwrap();
         if let ReservoirMode::Percentage(percent) = mode {
             info!(
                 self.log,
-                "{}% of {} physical ram = {} bytes)",
+                "{}% of {} VMM eligible ram = {} bytes)",
                 percent,
-                hardware_physical_ram_bytes,
+                vmm_eligible_memory,
                 req_bytes,
             );
         }
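For readers tracing the percentage path above: the diff uses a `req_bytes_aligned` value whose computation is outside the shown hunks, so this standalone sketch assumes a hypothetical 2 MiB alignment purely for illustration; only the overall shape of the calculation mirrors `set_reservoir_size`.

    // A minimal sketch, not sled-agent itself: ALIGN stands in for whatever
    // alignment the control plane actually requires.
    const ALIGN: u64 = 2 * 1024 * 1024;

    fn reservoir_bytes_for_percentage(vmm_eligible_memory: u64, percent: f32) -> u64 {
        // Same shape as the percentage arm: a fraction of the VMM-eligible
        // figure, floored...
        let req_bytes =
            (vmm_eligible_memory as f64 * (f64::from(percent) / 100.0)).floor() as u64;
        // ...then rounded down to the assumed alignment.
        req_bytes - (req_bytes % ALIGN)
    }

    fn main() {
        // ~938 GiB eligible (a 1 TiB sled after subtracting page_t and earmark).
        let eligible: u64 = 938 * 1024 * 1024 * 1024;
        println!("{} bytes", reservoir_bytes_for_percentage(eligible, 86.3));
    }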

sled-hardware/src/illumos/mod.rs (+7)

@@ -205,6 +205,7 @@ struct HardwareView {
     disks: HashMap<DiskIdentity, UnparsedDisk>,
     baseboard: Option<Baseboard>,
     online_processor_count: u32,
+    usable_physical_pages: u64,
     usable_physical_ram_bytes: u64,
 }
 
@@ -220,6 +221,7 @@ impl HardwareView {
             disks: HashMap::new(),
             baseboard: None,
             online_processor_count: sysconf::online_processor_count()?,
+            usable_physical_pages: sysconf::usable_physical_pages()?,
             usable_physical_ram_bytes: sysconf::usable_physical_ram_bytes()?,
         })
     }
@@ -230,6 +232,7 @@ impl HardwareView {
             disks: HashMap::new(),
             baseboard: None,
             online_processor_count: sysconf::online_processor_count()?,
+            usable_physical_pages: sysconf::usable_physical_pages()?,
             usable_physical_ram_bytes: sysconf::usable_physical_ram_bytes()?,
         })
     }
@@ -798,6 +801,10 @@ impl HardwareManager {
         self.inner.lock().unwrap().online_processor_count
     }
 
+    pub fn usable_physical_pages(&self) -> u64 {
+        self.inner.lock().unwrap().usable_physical_pages
+    }
+
     pub fn usable_physical_ram_bytes(&self) -> u64 {
         self.inner.lock().unwrap().usable_physical_ram_bytes
     }

sled-hardware/src/illumos/sysconf.rs (+11, -4)

@@ -25,14 +25,21 @@ pub fn online_processor_count() -> Result<u32, Error> {
     Ok(u32::try_from(res)?)
 }
 
-/// Returns the amount of RAM on this sled, in bytes.
-pub fn usable_physical_ram_bytes() -> Result<u64, Error> {
-    let phys_pages: u64 = illumos_utils::libc::sysconf(libc::_SC_PHYS_PAGES)
+/// Returns the number of physical RAM pages on this sled.
+pub fn usable_physical_pages() -> Result<u64, Error> {
+    let pages = illumos_utils::libc::sysconf(libc::_SC_PHYS_PAGES)
         .map_err(|e| Error::Sysconf { arg: "physical pages", e })?
         .try_into()?;
+    Ok(pages)
+}
+
+/// Returns the amount of RAM on this sled, in bytes.
+pub fn usable_physical_ram_bytes() -> Result<u64, Error> {
     let page_size: u64 = illumos_utils::libc::sysconf(libc::_SC_PAGESIZE)
         .map_err(|e| Error::Sysconf { arg: "physical page size", e })?
         .try_into()?;
 
-    Ok(phys_pages * page_size)
+    // XXX: if we eventually have pages with mixed sizes, this may be wrong!
+    // I'm not even sure how we'd calculate this in such a world!
+    Ok(usable_physical_pages()? * page_size)
 }
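For context, the two sysconf values used above can be read directly with the libc crate on systems that expose `_SC_PHYS_PAGES` and `_SC_PAGESIZE`; this standalone sketch (not omicron code) shows the same pages-times-page-size arithmetic without illumos_utils' error types:

    // Requires the `libc` crate. sysconf returns -1 on error, which is
    // treated as None here.
    fn usable_physical_ram_bytes() -> Option<u64> {
        let pages = unsafe { libc::sysconf(libc::_SC_PHYS_PAGES) };
        let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) };
        if pages < 0 || page_size < 0 {
            return None;
        }
        Some(pages as u64 * page_size as u64)
    }

    fn main() {
        match usable_physical_ram_bytes() {
            Some(bytes) => println!("usable physical RAM: {bytes} bytes"),
            None => eprintln!("sysconf failed"),
        }
    }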

sled-hardware/src/lib.rs (+71)

@@ -75,3 +75,74 @@ pub enum SledMode {
     /// Force sled to run as a Scrimlet
     Scrimlet { asic: DendriteAsic },
 }
+
+/// Accounting for high watermark memory usage for various system purposes
+#[derive(Copy, Clone, Debug)]
+pub struct MemoryReservations {
+    /// Installed physical memory in this sled. Probably should hold a
+    /// [`HardwareManager`] and call `usable_physical_ram_bytes()` instead of
+    /// this.
+    hardware_physical_ram_bytes: u64,
+    /// The amount of memory expected to be used if "control plane" services
+    /// are all running on this sled. "Control plane" here refers to services
+    /// that have roughly fixed memory use given differing sled hardware
+    /// configurations. DNS (internal, external), Nexus, Cockroach, or
+    /// ClickHouse are all examples of "control plane" here.
+    ///
+    /// This is a pessimistic overestimate; it is unlikely
+    /// (and one might say undesirable) that all such services are colocated on
+    /// a sled, and (as described in RFD 413) the budgeting for each service's
+    /// RAM must include headroom for those services potentially forking and
+    /// bursting required swap or resident pages.
+    //
+    // XXX: This is really something we should be told by Nexus, perhaps after
+    // starting with this conservative estimate to get the sled started.
+    control_plane_earmark_bytes: u64,
+    /// The amount of memory used for `page_t` structures, assuming a distinct
+    /// `page_t` for each 4k page in the system. If we use larger pages, like
+    /// 2MiB pages, this will potentially be a gross overestimate.
+    max_page_t_space: u64,
+    // XXX: Crucible involves some amount of memory in support of the volumes it
+    // manages. We should collect zpool size and estimate the memory that would
+    // be used if all available storage was dedicated to Crucible volumes. For
+    // now this is part of the control plane earmark.
+}
+
+impl MemoryReservations {
+    pub fn new(
+        hardware_manager: HardwareManager,
+        control_plane_earmark_mib: Option<u32>,
+    ) -> MemoryReservations {
+        let hardware_physical_ram_bytes =
+            hardware_manager.usable_physical_ram_bytes();
+        // Don't like hardcoding a struct size from the host OS here like
+        // this, maybe we shuffle some bits around before merging.. On the
+        // other hand, the last time page_t changed was illumos-gate commit
+        // a5652762e5 from 2006.
+        const PAGE_T_SIZE: u64 = 120;
+        let max_page_t_space =
+            hardware_manager.usable_physical_pages() * PAGE_T_SIZE;
+
+        const MIB: u64 = 1024 * 1024;
+        let control_plane_earmark_bytes =
+            u64::from(control_plane_earmark_mib.unwrap_or(0)) * MIB;
+
+        Self {
+            hardware_physical_ram_bytes,
+            max_page_t_space,
+            control_plane_earmark_bytes,
+        }
+    }
+
+    /// Compute the amount of physical memory that could be set aside for the
+    /// VMM reservoir.
+    ///
+    /// The actual VMM reservoir will be smaller than this amount, and is either
+    /// a fixed amount of memory specified by `ReservoirMode::Size` or
+    /// a percentage of this amount specified by `ReservoirMode::Percentage`.
+    pub fn vmm_eligible(&self) -> u64 {
+        self.hardware_physical_ram_bytes
+            - self.max_page_t_space
+            - self.control_plane_earmark_bytes
+    }
+}
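As a worked example of this accounting (standalone arithmetic only, assuming 4 KiB pages and the 44 GiB earmark from the commit message; it does not call into sled-hardware):

    const PAGE_T_SIZE: u64 = 120; // bytes per page_t, per the struct above
    const PAGE_SIZE: u64 = 4096; // assumed 4 KiB pages
    const GIB: u64 = 1024 * 1024 * 1024;
    const MIB: u64 = 1024 * 1024;

    fn main() {
        // A sled reporting ~1012 GiB of usable physical RAM.
        let hardware_physical_ram_bytes = 1012 * GIB;
        let usable_physical_pages = hardware_physical_ram_bytes / PAGE_SIZE;
        let max_page_t_space = usable_physical_pages * PAGE_T_SIZE; // ~29.6 GiB
        let control_plane_earmark_bytes = 45056 * MIB; // 44 GiB
        let vmm_eligible = hardware_physical_ram_bytes
            - max_page_t_space
            - control_plane_earmark_bytes;
        println!("vmm-eligible: {} GiB", vmm_eligible / GIB); // ~938 GiB
    }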

sled-hardware/src/non_illumos/mod.rs (+4)

@@ -45,6 +45,10 @@ impl HardwareManager {
         unimplemented!("Accessing hardware unsupported on non-illumos");
     }
 
+    pub fn usable_physical_pages(&self) -> u64 {
+        unimplemented!("Accessing hardware unsupported on non-illumos");
+    }
+
     pub fn usable_physical_ram_bytes(&self) -> u64 {
         unimplemented!("Accessing hardware unsupported on non-illumos");
     }

smf/sled-agent/gimlet-standalone/config.toml (+13, -1)

@@ -20,7 +20,19 @@ skip_timesync = true
 
 # Percentage of usable physical DRAM to use for the VMM reservoir, which
 # guest memory is pulled from.
-vmm_reservoir_percentage = 80
+vmm_reservoir_percentage = 86.3
+# The amount of memory held back for services of which zero or one instance
+# may run on this Gimlet. This currently includes some additional terms
+# reflecting OS memory use under load.
+#
+# As of this writing, this is the sum of the following items from RFD 413:
+# * Network buffer slush: 18 GiB
+# * Other kernel heap: 20 GiB
+# * ZFS ARC minimum: 5 GiB
+# * Sled agent: 0.5 GiB
+# * Maghemite: 0.25 GiB
+# * NTP: 0.25 GiB
+control_plane_memory_earmark_mb = 45056
 
 # Swap device size for the system. The device is a sparsely allocated zvol on
 # the internal zpool of the M.2 that we booted from.
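For reference, the items in that comment sum to 18 + 20 + 5 + 0.5 + 0.25 + 0.25 = 44 GiB, and 44 GiB x 1024 MiB/GiB = 45056 MiB, matching both the configured earmark and the 44 GiB term in the commit message's reservoir arithmetic.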

smf/sled-agent/gimlet/config.toml (+13, -1)

@@ -23,7 +23,19 @@ data_link = "cxgbe0"
 
 # Percentage of usable physical DRAM to use for the VMM reservoir, which
 # guest memory is pulled from.
-vmm_reservoir_percentage = 80
+vmm_reservoir_percentage = 86.3
+# The amount of memory held back for services of which zero or one instance
+# may run on this Gimlet. This currently includes some additional terms
+# reflecting OS memory use under load.
+#
+# As of this writing, this is the sum of the following items from RFD 413:
+# * Network buffer slush: 18 GiB
+# * Other kernel heap: 20 GiB
+# * ZFS ARC minimum: 5 GiB
+# * Sled agent: 0.5 GiB
+# * Maghemite: 0.25 GiB
+# * NTP: 0.25 GiB
+control_plane_memory_earmark_mb = 45056
 
 # Swap device size for the system. The device is a sparsely allocated zvol on
 # the internal zpool of the M.2 that we booted from.
