From a4d3e32a27f6263dacb0a2673ebd49142c5676ce Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 28 Mar 2026 20:52:00 +0000 Subject: [PATCH 1/2] virtio: reduce guest memory accesses on queue hot path The virtio queue implementation was doing expensive guest memory writes (kick arming/suppression) on every call to is_available(), even when the device was just draining a batch and had no intention of sleeping. This added unnecessary overhead on the hot path, particularly for high-throughput devices like virtio-net. Restructure notification handling so that is_available() is a lightweight read-only check, kick arming is deferred to the actual sleep boundary (poll_kick), and suppression happens lazily when work is found while armed. The split queue caches avail_index to drain batches with a single read, and the packed queue reads only the 2-byte flags field instead of the full 16-byte descriptor for availability checks. Burette TCP tx throughput improves by 31% on nested KVM, UDP tx by 48%. TCP rx is unchanged. --- vm/devices/virtio/virtio/src/common.rs | 18 +++-- vm/devices/virtio/virtio/src/queue.rs | 53 ++++++++++++++ vm/devices/virtio/virtio/src/queue/packed.rs | 72 +++++++++++++------- vm/devices/virtio/virtio/src/queue/split.rs | 69 ++++++++++++++----- vm/devices/virtio/virtio_spec/src/lib.rs | 14 ++++ 5 files changed, 181 insertions(+), 45 deletions(-) diff --git a/vm/devices/virtio/virtio/src/common.rs b/vm/devices/virtio/virtio/src/common.rs index 3a1f21775d..dab61e9092 100644 --- a/vm/devices/virtio/virtio/src/common.rs +++ b/vm/devices/virtio/virtio/src/common.rs @@ -325,9 +325,17 @@ impl VirtioQueue { } } - /// Polls until the queue is kicked by the guest, indicating new work may be available. + /// Polls until the queue is kicked by the guest, indicating new work may be + /// available. + /// + /// Before sleeping, this arms kick notification and rechecks the queue. If + /// new data arrived during arming, it returns immediately without sleeping. 
+ /// On wakeup, kicks are suppressed to avoid unnecessary doorbells while + /// the caller drains the queue. pub fn poll_kick(&mut self, cx: &mut Context<'_>) -> Poll<()> { - ready!(self.queue_event.wait().poll_unpin(cx)).expect("waits on Event cannot fail"); + if self.core.arm_for_kick() { + ready!(self.queue_event.wait().poll_unpin(cx)).expect("waits on Event cannot fail"); + } Poll::Ready(()) } @@ -335,9 +343,9 @@ impl VirtioQueue { /// work is currently available, or an error if there was an issue accessing /// the queue. /// - /// If `None` is returned, then the queue will be armed so that the guest - /// will kick it when new work is available; the caller can use - /// [`poll_kick`](Self::poll_kick) to wait for this. + /// This is a lightweight check that does not arm kick notification. When + /// used in a poll loop with [`poll_kick`](Self::poll_kick), the kick will + /// be armed automatically before sleeping. pub fn try_next(&mut self) -> Result, Error> { Ok(self .core diff --git a/vm/devices/virtio/virtio/src/queue.rs b/vm/devices/virtio/virtio/src/queue.rs index 959bd5107e..47b4022d48 100644 --- a/vm/devices/virtio/virtio/src/queue.rs +++ b/vm/devices/virtio/virtio/src/queue.rs @@ -123,6 +123,8 @@ pub(crate) struct QueueCoreGetWork { mem: GuestMemory, #[inspect(flatten)] inner: QueueGetWorkInner, + /// Whether kick notification is currently armed. + armed: bool, } impl QueueCoreGetWork { @@ -178,6 +180,7 @@ impl QueueCoreGetWork { features, mem, inner, + armed: false, }) } @@ -202,9 +205,59 @@ impl QueueCoreGetWork { QueueGetWorkInner::Packed(packed) => packed.is_available()?, }; let Some(index) = index else { return Ok(None) }; + self.suppress_if_armed(); self.work_from_index(index).map(Some) } + /// Arms kick notification so the guest will send a doorbell when new work + /// is available. 
Returns `true` if armed successfully (caller should + /// sleep), or `false` if new data arrived during arming (caller should + /// retry by calling [`try_next_work`](Self::try_next_work) again). + /// + /// If already armed, this is a no-op and returns `true`. + pub fn arm_for_kick(&mut self) -> bool { + if self.armed { + return true; + } + let r = match &mut self.inner { + QueueGetWorkInner::Split(split) => split.arm_kick(), + QueueGetWorkInner::Packed(packed) => packed.arm_kick(), + }; + match r { + Ok(true) => { + self.armed = true; + true + } + Ok(false) => false, + Err(err) => { + tracelimit::error_ratelimited!( + error = &err as &dyn std::error::Error, + "failed to arm kick" + ); + false + } + } + } + + /// If kicks are armed, suppress them. Called automatically when work is + /// found so the guest doesn't send unnecessary doorbells while draining. + fn suppress_if_armed(&mut self) { + if self.armed { + self.armed = false; + let r = match &self.inner { + QueueGetWorkInner::Split(split) => split.suppress_kicks(), + QueueGetWorkInner::Packed(packed) => packed.suppress_kicks(), + }; + + if let Err(err) = r { + tracelimit::error_ratelimited!( + error = &err as &dyn std::error::Error, + "failed to suppress kicks" + ); + } + } + } + + /// Advances the available index after a successful + /// [`try_peek_work`](Self::try_peek_work) call.
pub fn advance(&mut self, work: &QueueWork) { diff --git a/vm/devices/virtio/virtio/src/queue/packed.rs b/vm/devices/virtio/virtio/src/queue/packed.rs index 0e660ce0d0..00e3165252 100644 --- a/vm/devices/virtio/virtio/src/queue/packed.rs +++ b/vm/devices/virtio/virtio/src/queue/packed.rs @@ -7,7 +7,6 @@ use crate::queue::QueueDescriptor; use crate::queue::QueueError; use crate::queue::QueueParams; use crate::queue::descriptor_offset; -use crate::queue::read_descriptor; use crate::spec::VirtioDeviceFeatures; use crate::spec::queue as spec; use crate::spec::queue::DescriptorFlags; @@ -47,6 +46,7 @@ pub(crate) struct PackedQueueGetWork { queue_size: u16, next_avail_index: u16, wrapped_bit: bool, + next_is_available: bool, } impl PackedQueueGetWork { @@ -73,6 +73,7 @@ impl PackedQueueGetWork { queue_size: params.size, next_avail_index: initial_index, wrapped_bit: initial_wrap, + next_is_available: false, }) } @@ -81,33 +82,57 @@ impl PackedQueueGetWork { self.next_avail_index | (u16::from(self.wrapped_bit) << 15) } - pub fn is_available(&self) -> Result, QueueError> { - loop { - let disable_event = - PackedEventSuppression::new().with_flags(EventSuppressionFlags::Disabled); - self.device_event - .write_plain(0, &disable_event) + /// Checks whether a descriptor is available, returning its index. + /// + /// This is a lightweight check that does not arm kick notification. When + /// `None` is returned, the caller must call [`arm_kick`](Self::arm_kick) + /// before sleeping to ensure the guest will send a kick when new work + /// arrives. 
+ pub fn is_available(&mut self) -> Result, QueueError> { + if !self.next_is_available { + let flags: DescriptorFlags = self + .queue_desc + .read_plain( + descriptor_offset(self.next_avail_index) + + std::mem::offset_of!(PackedDescriptor, flags_raw) as u64, + ) .map_err(QueueError::Memory)?; - atomic::fence(atomic::Ordering::Acquire); - let descriptor: PackedDescriptor = - read_descriptor(&self.queue_desc, self.next_avail_index)?; - let flags = descriptor.flags(); - if flags.available() == self.wrapped_bit && flags.used() != self.wrapped_bit { - return Ok(Some(self.next_avail_index)); - } - let enable_event = - PackedEventSuppression::new().with_flags(EventSuppressionFlags::Enabled); - self.device_event - .write_plain(0, &enable_event) - .map_err(QueueError::Memory)?; - atomic::fence(atomic::Ordering::SeqCst); - let descriptor: PackedDescriptor = - read_descriptor(&self.queue_desc, self.next_avail_index)?; - let flags = descriptor.flags(); if flags.available() != self.wrapped_bit || flags.used() == self.wrapped_bit { return Ok(None); } + self.next_is_available = true; + } + Ok(Some(self.next_avail_index)) + } + + /// Arms kick notification so the guest will send a doorbell when new work + /// is available. Returns `true` if armed successfully (caller should + /// sleep), or `false` if new data arrived during arming (caller should + /// retry). + pub fn arm_kick(&mut self) -> Result { + let enable_event = PackedEventSuppression::new().with_flags(EventSuppressionFlags::Enabled); + self.device_event + .write_plain(0, &enable_event) + .map_err(QueueError::Memory)?; + // Ensure the event enable is visible before checking the descriptor. + atomic::fence(atomic::Ordering::SeqCst); + if self.is_available()?.is_some() { + // New data arrived during arming — suppress kicks and report. + self.suppress_kicks()?; + return Ok(false); } + Ok(true) + } + + /// Suppress kick notifications from the guest. 
Call this after finding + /// work to avoid unnecessary kicks while processing. + pub fn suppress_kicks(&self) -> Result<(), QueueError> { + let disable_event = + PackedEventSuppression::new().with_flags(EventSuppressionFlags::Disabled); + self.device_event + .write_plain(0, &disable_event) + .map_err(QueueError::Memory)?; + Ok(()) } /// Advances `next_avail_index` by `count` descriptors. @@ -117,6 +142,7 @@ impl PackedQueueGetWork { self.wrapped_bit = !self.wrapped_bit; } self.next_avail_index = next_avail_index; + self.next_is_available = false; } } diff --git a/vm/devices/virtio/virtio/src/queue/split.rs b/vm/devices/virtio/virtio/src/queue/split.rs index 447c9eee8e..7a6de6e69c 100644 --- a/vm/devices/virtio/virtio/src/queue/split.rs +++ b/vm/devices/virtio/virtio/src/queue/split.rs @@ -19,6 +19,9 @@ pub(crate) struct SplitQueueGetWork { queue_used: GuestMemory, queue_size: u16, last_avail_index: u16, + /// Cached guest avail_index from the last read, to avoid re-reading for + /// each descriptor when draining a batch. + cached_avail_index: u16, use_ring_event_index: bool, } @@ -57,6 +60,7 @@ impl SplitQueueGetWork { queue_used, queue_size: params.size, last_avail_index: initial_avail_index, + cached_avail_index: initial_avail_index, use_ring_event_index: features.bank0().ring_event_idx(), }) } @@ -82,32 +86,63 @@ impl SplitQueueGetWork { /// Checks whether a descriptor is available, returning its wrapped index. /// Does not advance `last_avail_index`; call [`advance`](Self::advance) /// to consume the descriptor. + /// + /// This is a lightweight check that does not arm kick notification. When + /// `None` is returned, the caller must call [`arm_kick`](Self::arm_kick) + /// before sleeping to ensure the guest will send a kick when new work + /// arrives. + /// + /// Reads the guest's avail_index only when the locally cached value has + /// been exhausted, allowing multiple descriptors to be drained per read. 
pub fn is_available(&mut self) -> Result, QueueError> { - let mut avail_index = Self::get_available_index(self)?; - if avail_index == self.last_avail_index { - if self.use_ring_event_index { - self.set_available_event(avail_index)?; - } else { - self.set_used_flags(spec::UsedFlags::new())?; - } - // Ensure the available event/used flags are visible before checking - // the available index again. - atomic::fence(atomic::Ordering::SeqCst); - avail_index = Self::get_available_index(self)?; - if avail_index == self.last_avail_index { + if self.cached_avail_index == self.last_avail_index { + // Re-read the guest's avail_index to see if new work arrived. + self.cached_avail_index = self.get_available_index()?; + if self.cached_avail_index == self.last_avail_index { return Ok(None); } + // Ensure available index read is ordered before subsequent + // descriptor reads. Only needed when we actually read from guest + // memory; the cached path reuses a previously-fenced value. + atomic::fence(atomic::Ordering::Acquire); } + Ok(Some(self.last_avail_index & (self.queue_size - 1))) + } + /// Arms kick notification so the guest will send a doorbell when new work + /// is available. Returns `true` if armed successfully (caller should + /// sleep), or `false` if new data arrived during arming (caller should + /// retry). + pub fn arm_kick(&mut self) -> Result { if self.use_ring_event_index { self.set_available_event(self.last_avail_index)?; } else { + self.set_used_flags(spec::UsedFlags::new())?; + } + // Ensure the available event/used flags are visible before checking + // the available index again. + atomic::fence(atomic::Ordering::SeqCst); + if self.is_available()?.is_some() { + // New work arrived during arming — suppress kicks again. + self.suppress_kicks()?; + return Ok(false); + } + Ok(true) + } + + /// Suppress kick notifications from the guest. Call this after finding + /// work to avoid unnecessary kicks while processing. 
+ /// + /// With `EVENT_IDX`, the `avail_event` value set during arming is + /// inherently index-based and will go stale as we drain descriptors. + /// There's no good way to express "don't kick" — any fixed index will + /// eventually be hit on wrap. So we skip the write; the worst case is + /// one spurious kick per u16 wrap (every 65536 descriptors). + pub fn suppress_kicks(&self) -> Result<(), QueueError> { + if !self.use_ring_event_index { self.set_used_flags(spec::UsedFlags::new().with_no_notify(true))?; } - // Ensure available index read is ordered before subsequent descriptor - // reads. - atomic::fence(atomic::Ordering::Acquire); - Ok(Some(self.last_avail_index % self.queue_size)) + Ok(()) } /// Advances `last_avail_index` by one, consuming the descriptor returned @@ -230,7 +265,7 @@ impl SplitQueueCompleteWork { descriptor_index: u16, bytes_written: u32, ) -> Result<(), QueueError> { - let wrapped_index = (queue_last_used_index % self.queue_size) as u64; + let wrapped_index = (queue_last_used_index & (self.queue_size - 1)) as u64; let addr = spec::USED_OFFSET_RING + spec::USED_ELEMENT_SIZE * wrapped_index; self.queue_used .write_plain( diff --git a/vm/devices/virtio/virtio_spec/src/lib.rs b/vm/devices/virtio/virtio_spec/src/lib.rs index 4652bcbc69..c1d949b3ab 100644 --- a/vm/devices/virtio/virtio_spec/src/lib.rs +++ b/vm/devices/virtio/virtio_spec/src/lib.rs @@ -351,12 +351,26 @@ pub mod queue { } } + /// Flags controlling event (interrupt/notification) suppression for packed + /// virtqueues. + /// + /// Reference: virtio spec §2.8.10, "Event Suppression Structure Layout". #[derive(Debug, PartialEq, Eq)] #[repr(u8)] pub enum EventSuppressionFlags { + /// `RING_EVENT_FLAGS_ENABLE` (0x0) — events are enabled; the device/driver + /// should generate events (interrupts or notifications) normally. Enabled = 0, + /// `RING_EVENT_FLAGS_DISABLE` (0x1) — events are disabled; the + /// device/driver should not generate any events. 
Disabled = 1, + /// `RING_EVENT_FLAGS_DESC` (0x2) — enable events only when a specific + /// descriptor index (with matching wrap counter) is reached, as + /// specified by the `offset` and `wrap` fields of the + /// [`PackedEventSuppression`] structure. DescriptorIndex = 2, + /// Reserved value (0x3). Treated as "events enabled" by this + /// implementation for forward compatibility. Reserved = 3, } impl EventSuppressionFlags { From 8f04c24d41785c3ad697784d13c984efec236b56 Mon Sep 17 00:00:00 2001 From: John Starks Date: Mon, 30 Mar 2026 17:59:09 +0000 Subject: [PATCH 2/2] feedback, add fences --- vm/devices/virtio/virtio/src/queue.rs | 5 ++++- vm/devices/virtio/virtio/src/queue/packed.rs | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/vm/devices/virtio/virtio/src/queue.rs b/vm/devices/virtio/virtio/src/queue.rs index 47b4022d48..7d14440e63 100644 --- a/vm/devices/virtio/virtio/src/queue.rs +++ b/vm/devices/virtio/virtio/src/queue.rs @@ -234,7 +234,10 @@ impl QueueCoreGetWork { error = &err as &dyn std::error::Error, "failed to arm kick" ); - false + // On error, behave as if armed to avoid a busy loop in callers + // that treat `false` as "retry immediately". + self.armed = true; + true } } } diff --git a/vm/devices/virtio/virtio/src/queue/packed.rs b/vm/devices/virtio/virtio/src/queue/packed.rs index 00e3165252..e4b2515d99 100644 --- a/vm/devices/virtio/virtio/src/queue/packed.rs +++ b/vm/devices/virtio/virtio/src/queue/packed.rs @@ -100,6 +100,9 @@ impl PackedQueueGetWork { if flags.available() != self.wrapped_bit || flags.used() == self.wrapped_bit { return Ok(None); } + // Ensure subsequent descriptor-field reads cannot be reordered + // before the flags read on weakly ordered architectures.
+ atomic::fence(atomic::Ordering::Acquire); self.next_is_available = true; } Ok(Some(self.next_avail_index)) @@ -202,6 +205,9 @@ impl PackedQueueCompleteWork { .with_available(self.wrapped_bit) .with_used(self.wrapped_bit), ); + // Ensure any prior writes to guest buffers (e.g. device data) are + // visible before the used descriptor becomes visible to the guest. + atomic::fence(atomic::Ordering::Release); self.queue_desc .write_plain(descriptor_offset(self.next_index), &descriptor) .map_err(QueueError::Memory)?;