From 6bfbb421a2d3f69a0c4a54b764e05b029da31a03 Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Mon, 4 Oct 2021 22:11:56 -0700 Subject: [PATCH 01/23] nvme: keep IO queue sizes as u32's --- propolis/src/hw/nvme/admin.rs | 4 ++-- propolis/src/hw/nvme/cmds.rs | 10 ++++++---- propolis/src/hw/nvme/queue.rs | 3 ++- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/propolis/src/hw/nvme/admin.rs b/propolis/src/hw/nvme/admin.rs index b171323c1..d9139c2d9 100644 --- a/propolis/src/hw/nvme/admin.rs +++ b/propolis/src/hw/nvme/admin.rs @@ -33,7 +33,7 @@ impl NvmeCtrl { cmd.qid, cmd.intr_vector, GuestAddr(cmd.prp), - cmd.qsize as u32, + cmd.qsize, ctx, ) { Ok(_) => cmds::Completion::success(), @@ -67,7 +67,7 @@ impl NvmeCtrl { cmd.qid, cmd.cqid, GuestAddr(cmd.prp), - cmd.qsize as u32, + cmd.qsize, ctx, ) { Ok(_) => cmds::Completion::success(), diff --git a/propolis/src/hw/nvme/cmds.rs b/propolis/src/hw/nvme/cmds.rs index f779fa5f3..0d6fefec3 100644 --- a/propolis/src/hw/nvme/cmds.rs +++ b/propolis/src/hw/nvme/cmds.rs @@ -61,7 +61,7 @@ impl AdminCmd { }; AdminCmd::CreateIOSubQ(CreateIOSQCmd { prp: raw.prp1, - qsize: (raw.cdw10 >> 16) as u16 + 1, // Convert from 0's based + qsize: (raw.cdw10 >> 16) + 1, // Convert from 0's based qid: raw.cdw10 as u16, cqid: (raw.cdw11 >> 16) as u16, queue_prio, @@ -84,7 +84,7 @@ impl AdminCmd { bits::ADMIN_OPC_CREATE_IO_CQ => { AdminCmd::CreateIOCompQ(CreateIOCQCmd { prp: raw.prp1, - qsize: (raw.cdw10 >> 16) as u16 + 1, // Convert from 0's based + qsize: (raw.cdw10 >> 16) + 1, // Convert from 0's based qid: raw.cdw10 as u16, intr_vector: (raw.cdw11 >> 16) as u16, intr_enable: (raw.cdw11 & 0b10) != 0, @@ -132,7 +132,8 @@ pub struct CreateIOCQCmd { /// /// The size of the Completion Queue to be created. /// See NVMe 1.0e Section 4.1.3 Queue Size - pub qsize: u16, + /// NOTE: This has already been converted from a 0's based value. + pub qsize: u32, /// Queue Identifier (QID) /// @@ -171,7 +172,8 @@ pub struct CreateIOSQCmd { /// /// The size of the Completion Queue to be created. /// See NVMe 1.0e Section 4.1.3 Queue Size - pub qsize: u16, + /// NOTE: This has already been converted from a 0's based value. + pub qsize: u32, /// Queue Identifier (QID) /// diff --git a/propolis/src/hw/nvme/queue.rs b/propolis/src/hw/nvme/queue.rs index 5ddebe3cc..3824bb5c8 100644 --- a/propolis/src/hw/nvme/queue.rs +++ b/propolis/src/hw/nvme/queue.rs @@ -14,7 +14,8 @@ pub type QueueId = u16; /// The minimum number of entries in either a Completion or Submission Queue. /// -/// Note: One entry will always be unavailable for use due to Head and Tail entry pointer defition. +/// Note: One entry will always be unavailable for use due to the Head and Tail +/// entry pointer definitions. /// See NVMe 1.0e Section 4.1.3 Queue Size const MIN_QUEUE_SIZE: u32 = 2; From 6ef32cc4047ab1ed4d16c178baa1ab85231a5e3b Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Tue, 5 Oct 2021 01:03:56 -0700 Subject: [PATCH 02/23] nvme: Skip Mutex on NVMe queues and use atomics for head/tail pointers. --- propolis/src/hw/nvme/mod.rs | 54 ++++----- propolis/src/hw/nvme/ns.rs | 40 +++---- propolis/src/hw/nvme/queue.rs | 200 +++++++++++++++++++--------------- 3 files changed, 151 insertions(+), 143 deletions(-) diff --git a/propolis/src/hw/nvme/mod.rs b/propolis/src/hw/nvme/mod.rs index f988936ec..1cc57dcf5 100644 --- a/propolis/src/hw/nvme/mod.rs +++ b/propolis/src/hw/nvme/mod.rs @@ -111,10 +111,10 @@ struct NvmeCtrl { msix_hdl: Option, /// The list of Completion Queues handled by the controller - cqs: [Option>>; MAX_NUM_QUEUES], + cqs: [Option>; MAX_NUM_QUEUES], /// The list of Submission Queues handled by the controller - sqs: [Option>>; MAX_NUM_QUEUES], + sqs: [Option>; MAX_NUM_QUEUES], /// The list of namespaces handled by the controller nss: [Option; MAX_NUM_NAMESPACES], @@ -171,7 +171,7 @@ impl NvmeCtrl { .ok_or(NvmeError::MsixHdlUnavailable)? .clone(); let cq = CompQueue::new(cqid, iv, size, base, ctx, msix_hdl)?; - self.cqs[cqid as usize] = Some(Arc::new(Mutex::new(cq))); + self.cqs[cqid as usize] = Some(Arc::new(cq)); Ok(()) } @@ -199,15 +199,12 @@ impl NvmeCtrl { return Err(NvmeError::SubQueueAlreadyExists(sqid)); } let sq = SubQueue::new(sqid, cqid, size, base, ctx)?; - self.sqs[sqid as usize] = Some(Arc::new(Mutex::new(sq))); + self.sqs[sqid as usize] = Some(Arc::new(sq)); Ok(()) } /// Returns a reference to the [`CompQueue`] which corresponds to the given completion queue id (`cqid`). - fn get_cq( - &self, - cqid: QueueId, - ) -> Result>, NvmeError> { + fn get_cq(&self, cqid: QueueId) -> Result, NvmeError> { debug_assert!((cqid as usize) < MAX_NUM_QUEUES); self.cqs[cqid as usize] .as_ref() @@ -216,7 +213,7 @@ impl NvmeCtrl { } /// Returns a reference to the [`SubQueue`] which corresponds to the given submission queue id (`cqid`). - fn get_sq(&self, sqid: QueueId) -> Result>, NvmeError> { + fn get_sq(&self, sqid: QueueId) -> Result, NvmeError> { debug_assert!((sqid as usize) < MAX_NUM_QUEUES); self.sqs[sqid as usize] .as_ref() @@ -229,7 +226,7 @@ impl NvmeCtrl { /// # Panics /// /// Panics if the Admin Completion Queue hasn't been created yet. - fn get_admin_cq(&self) -> Arc> { + fn get_admin_cq(&self) -> Arc { self.get_cq(queue::ADMIN_QUEUE_ID).unwrap() } @@ -238,7 +235,7 @@ impl NvmeCtrl { /// # Panics /// /// Panics if the Admin Submission Queue hasn't been created yet. - fn get_admin_sq(&self) -> Arc> { + fn get_admin_sq(&self) -> Arc { self.get_sq(queue::ADMIN_QUEUE_ID).unwrap() } @@ -533,21 +530,19 @@ impl PciNvme { let val = wo.read_u32().try_into().unwrap(); let state = self.state.lock().unwrap(); let admin_sq = state.get_admin_sq(); - let mut sq = admin_sq.lock().unwrap(); - match sq.notify_tail(val) { + match admin_sq.notify_tail(val) { Ok(_) => {} Err(_) => todo!("set controller error state"), } // Process any new SQ entries - self.process_admin_queue(state, sq, ctx)?; + self.process_admin_queue(state, admin_sq, ctx)?; } CtrlrReg::DoorBellAdminCQ => { let val = wo.read_u32().try_into().unwrap(); let state = self.state.lock().unwrap(); let admin_cq = state.get_admin_cq(); - let mut cq = admin_cq.lock().unwrap(); - match cq.notify_head(val) { + match admin_cq.notify_head(val) { Ok(_) => {} Err(_) => todo!("set controller error state"), } @@ -571,8 +566,7 @@ impl PciNvme { if (off >> 2) & 0b1 == 0b1 { // Completion Queue y Head Doorbell let y = (off - 4) >> 3; - let io_cq = state.get_cq(y as u16)?; - let mut cq = io_cq.lock().unwrap(); + let cq = state.get_cq(y as u16)?; match cq.notify_head(val) { Ok(_) => {} Err(_) => todo!("set controller error state"), @@ -581,14 +575,12 @@ impl PciNvme { } else { // Submission Queue y Tail Doorbell let y = off >> 3; - let io_sq = state.get_sq(y as u16)?; - let mut sq = io_sq.lock().unwrap(); + let sq = state.get_sq(y as u16)?; match sq.notify_tail(val) { Ok(_) => {} Err(_) => todo!("set controller error state"), } - drop(sq); - self.process_io_queue(state, io_sq, ctx)?; + self.process_io_queue(state, sq, ctx)?; } } } @@ -600,12 +592,11 @@ impl PciNvme { fn process_admin_queue( &self, mut state: MutexGuard, - mut sq: MutexGuard, + sq: Arc, ctx: &DispCtx, ) -> Result<(), NvmeError> { // Grab the Admin CQ too - let admin_cq = state.get_admin_cq(); - let mut cq = admin_cq.lock().unwrap(); + let cq = state.get_admin_cq(); while let Some(sub) = sq.pop(ctx) { use cmds::AdminCmd; @@ -659,13 +650,11 @@ impl PciNvme { fn process_io_queue( &self, state: MutexGuard, - io_sq: Arc>, + sq: Arc, ctx: &DispCtx, ) -> Result<(), NvmeError> { - let mut sq = io_sq.lock().unwrap(); - // Grab the corresponding CQ - let io_cq = state.get_cq(sq.cqid())?; + let cq = state.get_cq(sq.cqid())?; // Collect all the IO SQ entries, per namespace let mut io_cmds = BTreeMap::new(); @@ -673,20 +662,17 @@ impl PciNvme { io_cmds.entry(sub.nsid).or_insert_with(Vec::new).push(sub); } - drop(sq); - // Queue up said IO entries to the underlying block device, per namespace for (nsid, io_cmds) in io_cmds { state.get_ns(nsid)?.queue_io_cmds( io_cmds, - io_cq.clone(), - io_sq.clone(), + cq.clone(), + sq.clone(), ctx, )?; } // Notify for any newly added completions - let cq = io_cq.lock().unwrap(); cq.fire_interrupt(ctx); Ok(()) diff --git a/propolis/src/hw/nvme/ns.rs b/propolis/src/hw/nvme/ns.rs index 95d86ad58..ae7ab6878 100644 --- a/propolis/src/hw/nvme/ns.rs +++ b/propolis/src/hw/nvme/ns.rs @@ -1,5 +1,5 @@ use std::collections::VecDeque; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use crate::block::*; use crate::hw::nvme::bits::RawCompletion; @@ -71,16 +71,14 @@ impl NvmeNs { pub(super) fn queue_io_cmds( &self, cmds: Vec, - cq: Arc>, - sq: Arc>, + cq: Arc, + sq: Arc, ctx: &DispCtx, ) -> Result<(), NvmeError> { for sub in cmds { let cmd = NvmCmd::parse(sub)?; match cmd { NvmCmd::Write(_) if self.is_ro => { - let mut cq = cq.lock().unwrap(); - let sq = sq.lock().unwrap(); let comp = Completion::specific_err( bits::StatusCodeType::CmdSpecific, bits::STS_WRITE_READ_ONLY_RANGE, @@ -107,9 +105,6 @@ impl NvmeNs { } NvmCmd::Unknown(_) => { // For any other command, just immediately complete it - let mut cq = cq.lock().unwrap(); - let sq = sq.lock().unwrap(); - let comp = Completion::generic_err(bits::STS_INTERNAL_ERR); let completion = RawCompletion { dw0: comp.dw0, @@ -132,8 +127,8 @@ impl NvmeNs { fn flush_cmd( &self, cid: u16, - cq: Arc>, - sq: Arc>, + cq: Arc, + sq: Arc, ) { // TODO: handles if it gets unmapped? self.bdev.enqueue(Request { @@ -153,8 +148,8 @@ impl NvmeNs { cid: u16, cmd: ReadCmd, ctx: &DispCtx, - cq: Arc>, - sq: Arc>, + cq: Arc, + sq: Arc, ) { probe_nvme_read_enqueue!(|| (cid, cmd.slba, cmd.nlb)); let off = self.nlb_to_size(cmd.slba as usize); @@ -178,8 +173,8 @@ impl NvmeNs { cid: u16, cmd: WriteCmd, ctx: &DispCtx, - cq: Arc>, - sq: Arc>, + cq: Arc, + sq: Arc, ) { probe_nvme_write_enqueue!(|| (cid, cmd.slba, cmd.nlb)); let off = self.nlb_to_size(cmd.slba as usize); @@ -216,10 +211,10 @@ pub struct Request { cid: u16, /// The associated Completion Queue - cq: Arc>, + cq: Arc, /// The associated Submission Queue - sq: Arc>, + sq: Arc, } impl BlockReq for Request { @@ -269,21 +264,18 @@ impl BlockReq for Request { _ => {} } - let sq = self.sq.lock().unwrap(); - let mut cq = self.cq.lock().unwrap(); - let completion = bits::RawCompletion { dw0: comp.dw0, rsvd: 0, - sqhd: sq.head(), - sqid: sq.id(), + sqhd: self.sq.head(), + sqid: self.sq.id(), cid: self.cid, - status_phase: comp.status | cq.phase(), + status_phase: comp.status | self.cq.phase(), }; - cq.push(completion, ctx); + self.cq.push(completion, ctx); // TODO: should this be done here? - cq.fire_interrupt(ctx); + self.cq.fire_interrupt(ctx); } } diff --git a/propolis/src/hw/nvme/queue.rs b/propolis/src/hw/nvme/queue.rs index 3824bb5c8..134860e7d 100644 --- a/propolis/src/hw/nvme/queue.rs +++ b/propolis/src/hw/nvme/queue.rs @@ -1,4 +1,5 @@ use std::marker::PhantomData; +use std::sync::atomic::{AtomicU64, Ordering}; use super::bits::{self, RawCompletion, RawSubmission}; use crate::common::*; @@ -52,21 +53,25 @@ struct QueueState { /// See NVMe 1.0e Section 4.1.3 Queue Size size: u32, - /// The current Head entry pointer. + /// The combined current Head and Tail entry pointers. + /// + /// The least significant 16 bits represent the Head pointer while + /// the next 16 represent the Tail pointer. + /// + /// Bit 32 is the current phase tag (for completion queues). /// /// The consumer of entries on a queue uses the current Head entry pointer /// to identify the next entry to be pulled off the queue. /// - /// See NVMe 1.0e Section 4.1 Submission Queue & Completion Queue Definition - head: u16, - - /// The current Tail entry pointer. - /// /// The submitter of entries to a queue uses the current Tail entry pointer /// to identify the next open queue entry space. /// + /// The Phase Tag is used to identify to the host (VM) that a Completion + /// entry is new. Flips every time the Tail entry pointer wraps around. + /// /// See NVMe 1.0e Section 4.1 Submission Queue & Completion Queue Definition - tail: u16, + /// See NVMe 1.0e Section 4.5 Completion Queue Entry - Phase Tag (P) + head_tail_phase: AtomicU64, /// Marker type to indicate what type of Queue we're modeling. _qt: PhantomData, @@ -74,30 +79,30 @@ struct QueueState { impl QueueState { /// Create a new `QueueState` - fn new(size: u32, head: u16, tail: u16) -> Self { + fn new(size: u32) -> Self { assert!(size >= MIN_QUEUE_SIZE && size <= MAX_QUEUE_SIZE); - Self { size, head, tail, _qt: PhantomData } + Self { size, head_tail_phase: AtomicU64::new(1 << 32 /* phase tag */), _qt: PhantomData } } - /// Returns if the queue is currently empty. + /// Returns if the queue is currently empty with the given head and tail pointers. /// /// A queue is empty when the Head entry pointer equals the Tail entry pointer. /// /// See: NVMe 1.0e Section 4.1.1 Empty Queue - fn is_empty(&self) -> bool { - self.head == self.tail + fn is_empty(&self, head: u16, tail: u16) -> bool { + head == tail } - /// Returns if the queue is currently full. + /// Returns if the queue is currently full with the given head and tail pointers. /// /// The queue is full when the Head entry pointer equals one more than the Tail /// entry pointer. The number of entries in a queue will always be 1 less than /// the queue size. /// /// See: NVMe 1.0e Section 4.1.2 Full Queue - fn is_full(&self) -> bool { - (self.head > 0 && self.tail == (self.head - 1)) - || (self.head == 0 && self.tail == (self.size - 1) as u16) + fn is_full(&self, head: u16, tail: u16) -> bool { + (head > 0 && tail == (head - 1)) + || (head == 0 && tail == (self.size - 1) as u16) } /// Helper method to calculate a positive offset for a given index, wrapping at @@ -126,18 +131,6 @@ impl QueueState { idx - off } } - - /// How many slots are empty between the tail and the head i.e., how many - /// entries can we write to the queue currently. - fn avail_empty(&self) -> u16 { - self.wrap_sub(self.wrap_sub(self.head, 1), self.tail) - } - - /// How many slots are occupied between the head and the tail i.e., how - /// many entries can we read from the queue currently. - fn avail_occupied(&self) -> u16 { - self.wrap_sub(self.tail, self.head) - } } impl QueueState { @@ -145,16 +138,40 @@ impl QueueState { /// /// If the queue is full this method returns [`None`]. /// Otherwise, this method returns the current Tail entry pointer and then - /// increments the Tail entry pointer by 1 (wrapping if necessary). It will - /// also return whether the Tail entry pointer wrapped after incrementing. - fn push_tail(&mut self) -> Option<(u16, bool)> { - if self.is_full() { - None - } else { - let result = Some(self.tail); - self.tail = self.wrap_add(self.tail, 1); - result.map(|r| (r, (r + 1) as u32 >= self.size)) - } + /// increments the Tail entry pointer by 1 (wrapping if necessary). + fn push_tail(&self) -> Option { + self.head_tail_phase + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |head_tail| { + let (mut phase, tail, head) = ( + (head_tail & (1 << 32)) != 0, + (head_tail >> 16) as u16, + head_tail as u16, + ); + if self.is_full(head, tail) { + return None; + } + if tail as u32 + 1 >= self.size { + // We wrapped so flip phase + phase = !phase; + } + let new_tail = self.wrap_add(tail, 1); + Some( + (phase as u64) << 32 + | (new_tail as u64) << 16 + | head as u64, + ) + }) + .ok() + .map(|old_head_tail| { + let old_tail = (old_head_tail >> 16) as u16; + old_tail + }) + } + + /// How many slots are occupied between the head and the tail i.e., how + /// many entries can we read from the queue currently. + fn avail_occupied(&self, head: u16, tail: u16) -> u16 { + self.wrap_sub(tail, head) } /// Attempt to move the Head entry pointer forward to the given index. @@ -163,16 +180,26 @@ impl QueueState { /// must have enough occupied slots otherwise we return an error. /// Conceptually this method indicates some entries have been consumed /// from the queue. - fn pop_head_to(&mut self, idx: u16) -> Result<(), &'static str> { + fn pop_head_to(&self, idx: u16) -> Result<(), &'static str> { if idx as u32 >= self.size { return Err("invalid index"); } - let pop_count = self.wrap_sub(idx, self.head); - if pop_count > self.avail_occupied() { - return Err("index too far"); - } - self.head = idx; - Ok(()) + self.head_tail_phase + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |head_tail| { + let (phase, tail, head) = ( + (head_tail & (1 << 32)) != 0, + (head_tail >> 16) as u16, + head_tail as u16, + ); + let pop_count = self.wrap_sub(idx, head); + if pop_count > self.avail_occupied(head, tail) { + return None; + } + // Replace head with given idx + Some((phase as u64) << 32 | (tail as u64) << 16 | idx as u64) + }) + .map_err(|_| "index too far") + .map(|_| ()) } } @@ -182,14 +209,24 @@ impl QueueState { /// If the queue is empty this method returns [`None`]. /// Otherwise, this method returns the current Head entry pointer and then /// increments the Head entry pointer by 1 (wrapping if necessary). - fn pop_head(&mut self) -> Option { - if self.is_empty() { - None - } else { - let result = Some(self.head); - self.head = self.wrap_add(self.head, 1); - result - } + fn pop_head(&self) -> Option { + self.head_tail_phase + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |head_tail| { + let (tail, head) = ((head_tail >> 16) as u16, head_tail as u16); + if self.is_empty(head, tail) { + return None; + } + let new_head = self.wrap_add(head, 1); + Some((tail as u64) << 16 | new_head as u64) + }) + .ok() + .map(|old_head_tail| old_head_tail as u16) + } + + /// How many slots are empty between the tail and the head i.e., how many + /// entries can we write to the queue currently. + fn avail_empty(&self, head: u16, tail: u16) -> u16 { + self.wrap_sub(self.wrap_sub(head, 1), tail) } /// Attempt to move the Tail entry pointer forward to the given index. @@ -198,16 +235,22 @@ impl QueueState { /// must have enough empty slots available otherwise we return an error. /// Conceptually this method indicates new entries have been added to the /// queue. - fn push_tail_to(&mut self, idx: u16) -> Result<(), &'static str> { + fn push_tail_to(&self, idx: u16) -> Result<(), &'static str> { if idx as u32 >= self.size { return Err("invalid index"); } - let push_count = self.wrap_sub(idx, self.tail); - if push_count > self.avail_empty() { - return Err("index too far"); - } - self.tail = idx; - Ok(()) + self.head_tail_phase + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |head_tail| { + let (tail, head) = ((head_tail >> 16) as u16, head_tail as u16); + let push_count = self.wrap_sub(idx, tail); + if push_count > self.avail_empty(head, tail) { + return None; + } + // Replace tail with given idx + Some((idx as u64) << 16 | head as u64) + }) + .map_err(|_| "index too far") + .map(|_| ()) } } @@ -249,16 +292,16 @@ impl SubQueue { ctx: &DispCtx, ) -> Result { Self::validate(id, base, size, ctx)?; - Ok(Self { id, cqid, state: QueueState::new(size, 0, 0), base }) + Ok(Self { id, cqid, state: QueueState::new(size), base }) } /// Attempt to move the Tail entry pointer forward to the given index. - pub fn notify_tail(&mut self, idx: u16) -> Result<(), &'static str> { + pub fn notify_tail(&self, idx: u16) -> Result<(), &'static str> { self.state.push_tail_to(idx) } /// Returns the next entry off of the Queue or [`None`] if it is empty. - pub fn pop(&mut self, ctx: &DispCtx) -> Option { + pub fn pop(&self, ctx: &DispCtx) -> Option { if let Some(idx) = self.state.pop_head() { let mem = ctx.mctx.memctx(); let ent: Option = mem.read(self.entry_addr(idx)); @@ -271,7 +314,7 @@ impl SubQueue { /// Returns the current Head entry pointer. pub fn head(&self) -> u16 { - self.state.head + self.state.head_tail_phase.load(Ordering::SeqCst) as u16 } /// Returns the ID of this Submission Queue. @@ -333,12 +376,6 @@ pub struct CompQueue { /// The [`GuestAddr`] at which the Queue is mapped. base: GuestAddr, - /// The current Phase Tag to identify to the host (VM) that a Completion - /// entry is new. Flips every time the Tail entry pointer wraps around. - /// - /// See NVMe 1.0e Section 4.5 Completion Queue Entry - Phase Tag (P) - phase: bool, - /// MSI-X object associated with PCIe device to signal host (VM). hdl: pci::MsixHdl, } @@ -355,31 +392,22 @@ impl CompQueue { hdl: pci::MsixHdl, ) -> Result { Self::validate(id, base, size, ctx)?; - Ok(Self { - iv, - state: QueueState::new(size, 0, 0), - base, - phase: true, - hdl, - }) + Ok(Self { iv, state: QueueState::new(size), base, hdl }) } /// Attempt to move the Head entry pointer forward to the given index. - pub fn notify_head(&mut self, idx: u16) -> Result<(), &'static str> { + pub fn notify_head(&self, idx: u16) -> Result<(), &'static str> { self.state.pop_head_to(idx) } /// Attempt to add a new entry to the Completion Queue. /// /// TODO: handle the case where the queue may be currently full. - pub fn push(&mut self, entry: RawCompletion, ctx: &DispCtx) { - if let Some((idx, wrapped)) = self.state.push_tail() { + pub fn push(&self, entry: RawCompletion, ctx: &DispCtx) { + if let Some(idx) = self.state.push_tail() { let mem = ctx.mctx.memctx(); let addr = self.entry_addr(idx); mem.write(addr, &entry); - if wrapped { - self.phase = !self.phase; - } // XXX: handle a guest addr that becomes unmapped later // XXX: figure out interrupts } @@ -392,13 +420,15 @@ impl CompQueue { /// /// See NVMe 1.0e Section 4.5 Completion Queue Entry - Phase Tag (P) pub fn phase(&self) -> u16 { - self.phase as u16 + (self.state.head_tail_phase.load(Ordering::SeqCst) >> 32) as u16 } /// Fires an interrupt to the guest with the associated interrupt vector /// if the queue is not currently empty. pub fn fire_interrupt(&self, ctx: &DispCtx) { - if !self.state.is_empty() { + let head_tail = self.state.head_tail_phase.load(Ordering::SeqCst); + let (tail, head) = ((head_tail >> 16) as u16, head_tail as u16); + if !self.state.is_empty(head, tail) { self.hdl.fire(self.iv, ctx); } } From e45d1232ff429e2e8b912b4461a51743cad1493e Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Tue, 5 Oct 2021 09:20:34 -0700 Subject: [PATCH 03/23] nvme: keep Arc ref to corresponding cq from each sq --- propolis/src/hw/nvme/mod.rs | 18 +++++++++--------- propolis/src/hw/nvme/queue.rs | 18 +++++++++--------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/propolis/src/hw/nvme/mod.rs b/propolis/src/hw/nvme/mod.rs index 1cc57dcf5..1762c5a9d 100644 --- a/propolis/src/hw/nvme/mod.rs +++ b/propolis/src/hw/nvme/mod.rs @@ -190,22 +190,20 @@ impl NvmeCtrl { if (sqid as usize) >= MAX_NUM_QUEUES { return Err(NvmeError::InvalidSubQueue(sqid)); } - if (cqid as usize) >= MAX_NUM_QUEUES - || self.cqs[cqid as usize].is_none() - { - return Err(NvmeError::InvalidCompQueue(cqid)); - } if self.sqs[sqid as usize].is_some() { return Err(NvmeError::SubQueueAlreadyExists(sqid)); } - let sq = SubQueue::new(sqid, cqid, size, base, ctx)?; + let cq = self.get_cq(cqid)?; + let sq = SubQueue::new(sqid, cq, size, base, ctx)?; self.sqs[sqid as usize] = Some(Arc::new(sq)); Ok(()) } /// Returns a reference to the [`CompQueue`] which corresponds to the given completion queue id (`cqid`). fn get_cq(&self, cqid: QueueId) -> Result, NvmeError> { - debug_assert!((cqid as usize) < MAX_NUM_QUEUES); + if (cqid as usize) >= MAX_NUM_QUEUES { + return Err(NvmeError::InvalidCompQueue(cqid)); + } self.cqs[cqid as usize] .as_ref() .map(Arc::clone) @@ -214,7 +212,9 @@ impl NvmeCtrl { /// Returns a reference to the [`SubQueue`] which corresponds to the given submission queue id (`cqid`). fn get_sq(&self, sqid: QueueId) -> Result, NvmeError> { - debug_assert!((sqid as usize) < MAX_NUM_QUEUES); + if (sqid as usize) >= MAX_NUM_QUEUES { + return Err(NvmeError::InvalidSubQueue(sqid)); + } self.sqs[sqid as usize] .as_ref() .map(Arc::clone) @@ -654,7 +654,7 @@ impl PciNvme { ctx: &DispCtx, ) -> Result<(), NvmeError> { // Grab the corresponding CQ - let cq = state.get_cq(sq.cqid())?; + let cq = sq.cq(); // Collect all the IO SQ entries, per namespace let mut io_cmds = BTreeMap::new(); diff --git a/propolis/src/hw/nvme/queue.rs b/propolis/src/hw/nvme/queue.rs index 134860e7d..e0e33ff65 100644 --- a/propolis/src/hw/nvme/queue.rs +++ b/propolis/src/hw/nvme/queue.rs @@ -1,4 +1,5 @@ use std::marker::PhantomData; +use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; use super::bits::{self, RawCompletion, RawSubmission}; @@ -268,11 +269,11 @@ pub enum QueueCreateErr { /// Type for manipulating Submission Queues. pub struct SubQueue { - /// The ID of queue in question. + /// The ID of this Submission Queue. id: QueueId, - /// The ID of the corresponding Completion Queue. - cqid: QueueId, + /// The corresponding Completion Queue. + cq: Arc, /// Queue state such as the size and current head/tail entry pointers. state: QueueState, @@ -286,13 +287,13 @@ impl SubQueue { /// given base address. pub fn new( id: QueueId, - cqid: QueueId, + cq: Arc, size: u32, base: GuestAddr, ctx: &DispCtx, ) -> Result { Self::validate(id, base, size, ctx)?; - Ok(Self { id, cqid, state: QueueState::new(size), base }) + Ok(Self { id, cq, state: QueueState::new(size), base }) } /// Attempt to move the Tail entry pointer forward to the given index. @@ -322,10 +323,9 @@ impl SubQueue { self.id } - /// Returns the ID of the corresponding Completion Queue - /// to this Submission Queue. - pub fn cqid(&self) -> QueueId { - self.cqid + /// Returns the corresponding Completion Queue + pub fn cq(&self) -> Arc { + self.cq.clone() } /// Returns the corresponding [`GuestAddr`] for a given entry in From 85000eeb31f5e712b2ef5ce9aece2fdae6d3ac36 Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Tue, 5 Oct 2021 09:45:53 -0700 Subject: [PATCH 04/23] nvme: factor out submitting completions --- propolis/src/hw/nvme/mod.rs | 10 +--------- propolis/src/hw/nvme/ns.rs | 37 ++++++----------------------------- propolis/src/hw/nvme/queue.rs | 18 ++++++++++++++++- 3 files changed, 24 insertions(+), 41 deletions(-) diff --git a/propolis/src/hw/nvme/mod.rs b/propolis/src/hw/nvme/mod.rs index 1762c5a9d..7ce5fd0ae 100644 --- a/propolis/src/hw/nvme/mod.rs +++ b/propolis/src/hw/nvme/mod.rs @@ -629,15 +629,7 @@ impl PciNvme { } }; - let completion = RawCompletion { - dw0: comp.dw0, - rsvd: 0, - sqhd: sq.head(), - sqid: sq.id(), - cid: sub.cid(), - status_phase: comp.status | cq.phase(), - }; - cq.push(completion, ctx); + sq.push_completion(sub.cid(), comp, ctx); } // Notify for any newly added completions diff --git a/propolis/src/hw/nvme/ns.rs b/propolis/src/hw/nvme/ns.rs index ae7ab6878..fc563dac3 100644 --- a/propolis/src/hw/nvme/ns.rs +++ b/propolis/src/hw/nvme/ns.rs @@ -2,7 +2,6 @@ use std::collections::VecDeque; use std::sync::Arc; use crate::block::*; -use crate::hw::nvme::bits::RawCompletion; use crate::hw::nvme::cmds::{self, NvmCmd}; use crate::{common, dispatch::DispCtx}; @@ -83,16 +82,7 @@ impl NvmeNs { bits::StatusCodeType::CmdSpecific, bits::STS_WRITE_READ_ONLY_RANGE, ); - let completion = RawCompletion { - dw0: comp.dw0, - rsvd: 0, - sqhd: sq.head(), - sqid: sq.id(), - cid: sub.cid(), - status_phase: comp.status | cq.phase(), - }; - - cq.push(completion, ctx); + sq.push_completion(sub.cid(), comp, ctx); } NvmCmd::Write(cmd) => { self.write_cmd(sub.cid(), cmd, ctx, cq.clone(), sq.clone()) @@ -106,20 +96,14 @@ impl NvmeNs { NvmCmd::Unknown(_) => { // For any other command, just immediately complete it let comp = Completion::generic_err(bits::STS_INTERNAL_ERR); - let completion = RawCompletion { - dw0: comp.dw0, - rsvd: 0, - sqhd: sq.head(), - sqid: sq.id(), - cid: sub.cid(), - status_phase: comp.status | cq.phase(), - }; - - cq.push(completion, ctx); + sq.push_completion(sub.cid(), comp, ctx); } } } + // Notify for any newly added completions + cq.fire_interrupt(ctx); + Ok(()) } @@ -264,16 +248,7 @@ impl BlockReq for Request { _ => {} } - let completion = bits::RawCompletion { - dw0: comp.dw0, - rsvd: 0, - sqhd: self.sq.head(), - sqid: self.sq.id(), - cid: self.cid, - status_phase: comp.status | self.cq.phase(), - }; - - self.cq.push(completion, ctx); + self.sq.push_completion(self.cid, comp, ctx); // TODO: should this be done here? self.cq.fire_interrupt(ctx); diff --git a/propolis/src/hw/nvme/queue.rs b/propolis/src/hw/nvme/queue.rs index e0e33ff65..8d727c490 100644 --- a/propolis/src/hw/nvme/queue.rs +++ b/propolis/src/hw/nvme/queue.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; use super::bits::{self, RawCompletion, RawSubmission}; +use super::cmds::Completion; use crate::common::*; use crate::dispatch::DispCtx; use crate::hw::pci; @@ -328,6 +329,21 @@ impl SubQueue { self.cq.clone() } + /// Push a new entry onto this Submission Queue's corresponding + /// Completion Queue. + pub fn push_completion(&self, cid: u16, comp: Completion, ctx: &DispCtx) { + let completion = bits::RawCompletion { + dw0: comp.dw0, + rsvd: 0, + sqhd: self.head(), + sqid: self.id(), + cid, + status_phase: comp.status | self.cq.phase(), + }; + + self.cq.push(completion, ctx); + } + /// Returns the corresponding [`GuestAddr`] for a given entry in /// the Submission Queue. fn entry_addr(&self, idx: u16) -> GuestAddr { @@ -403,7 +419,7 @@ impl CompQueue { /// Attempt to add a new entry to the Completion Queue. /// /// TODO: handle the case where the queue may be currently full. - pub fn push(&self, entry: RawCompletion, ctx: &DispCtx) { + fn push(&self, entry: RawCompletion, ctx: &DispCtx) { if let Some(idx) = self.state.push_tail() { let mem = ctx.mctx.memctx(); let addr = self.entry_addr(idx); From bc1d40ca79b6c5a28495c6ed44f75a98cd8917ee Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Tue, 5 Oct 2021 11:03:54 -0700 Subject: [PATCH 05/23] nvme: clean up bit twiddling for managing queue state by using bitstruct. --- propolis/src/hw/nvme/queue.rs | 187 ++++++++++++++++++++-------------- 1 file changed, 112 insertions(+), 75 deletions(-) diff --git a/propolis/src/hw/nvme/queue.rs b/propolis/src/hw/nvme/queue.rs index 8d727c490..dc8f1f390 100644 --- a/propolis/src/hw/nvme/queue.rs +++ b/propolis/src/hw/nvme/queue.rs @@ -1,6 +1,6 @@ use std::marker::PhantomData; -use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; use super::bits::{self, RawCompletion, RawSubmission}; use super::cmds::Completion; @@ -8,6 +8,7 @@ use crate::common::*; use crate::dispatch::DispCtx; use crate::hw::pci; +use bitstruct::bitstruct; use thiserror::Error; /// Each queue is identified by a 16-bit ID. @@ -43,6 +44,56 @@ enum CompletionQueueType {} /// Marker type to indicate a Submission Queue. enum SubmissionQueueType {} +bitstruct! { + /// Completion Queue State we update atomically. + struct CompQueueState(pub u64) { + /// The Queue Head entry pointer. + /// + /// The consumer of entries on a queue uses the current Head entry pointer + /// to identify the next entry to be pulled off the queue. + /// + /// See NVMe 1.0e Section 4.1 Submission Queue & Completion Queue Definition + head: u16 = 0..16; + + /// The Queue Tail entry pointer. + /// + /// The submitter of entries to a queue uses the current Tail entry pointer + /// to identify the next open queue entry space. + /// + /// See NVMe 1.0e Section 4.1 Submission Queue & Completion Queue Definition + tail: u16 = 16..32; + + /// The current phase tag. + /// + /// The Phase Tag is used to identify to the host (VM) that a Completion + /// entry is new. Flips every time the Tail entry pointer wraps around. + /// + /// See NVMe 1.0e Section 4.5 Completion Queue Entry - Phase Tag (P) + phase: bool = 32; + } +} + +bitstruct! { + /// Submission Queue State we update atomically. + struct SubQueueState(pub u64) { + /// The Queue Head entry pointer. + /// + /// The consumer of entries on a queue uses the current Head entry pointer + /// to identify the next entry to be pulled off the queue. + /// + /// See NVMe 1.0e Section 4.1 Submission Queue & Completion Queue Definition + head: u16 = 0..16; + + /// The Queue Tail entry pointer. + /// + /// The submitter of entries to a queue uses the current Tail entry pointer + /// to identify the next open queue entry space. + /// + /// See NVMe 1.0e Section 4.1 Submission Queue & Completion Queue Definition + tail: u16 = 16..32; + } +} + /// Helper for manipulating Completion/Submission Queues /// /// The type parameter `QT` is used to constrain the set of @@ -55,37 +106,17 @@ struct QueueState { /// See NVMe 1.0e Section 4.1.3 Queue Size size: u32, - /// The combined current Head and Tail entry pointers. - /// - /// The least significant 16 bits represent the Head pointer while - /// the next 16 represent the Tail pointer. - /// - /// Bit 32 is the current phase tag (for completion queues). + /// The actual atomic state that gets updated during the normal course of operation. /// - /// The consumer of entries on a queue uses the current Head entry pointer - /// to identify the next entry to be pulled off the queue. - /// - /// The submitter of entries to a queue uses the current Tail entry pointer - /// to identify the next open queue entry space. - /// - /// The Phase Tag is used to identify to the host (VM) that a Completion - /// entry is new. Flips every time the Tail entry pointer wraps around. - /// - /// See NVMe 1.0e Section 4.1 Submission Queue & Completion Queue Definition - /// See NVMe 1.0e Section 4.5 Completion Queue Entry - Phase Tag (P) - head_tail_phase: AtomicU64, + /// Either `CompQueueState` for a Completion Queue or + /// a `SubQueueState` for a Submission Queue. + inner: AtomicU64, /// Marker type to indicate what type of Queue we're modeling. _qt: PhantomData, } impl QueueState { - /// Create a new `QueueState` - fn new(size: u32) -> Self { - assert!(size >= MIN_QUEUE_SIZE && size <= MAX_QUEUE_SIZE); - Self { size, head_tail_phase: AtomicU64::new(1 << 32 /* phase tag */), _qt: PhantomData } - } - /// Returns if the queue is currently empty with the given head and tail pointers. /// /// A queue is empty when the Head entry pointer equals the Tail entry pointer. @@ -136,38 +167,37 @@ impl QueueState { } impl QueueState { + /// Create a new `QueueState` for a Completion Queue + fn new_completion_state(size: u32) -> QueueState { + assert!(size >= MIN_QUEUE_SIZE && size <= MAX_QUEUE_SIZE); + // As the device side, we start with our phase tag as asserted (1) + // as the host side (VM) will create all the Completion Queue entries + // with the phase initially zeroed out. + let inner = CompQueueState(0).with_phase(true); + Self { size, inner: AtomicU64::new(inner.0), _qt: PhantomData } + } + /// Attempt to return the Tail entry pointer and then move it forward by 1. /// /// If the queue is full this method returns [`None`]. /// Otherwise, this method returns the current Tail entry pointer and then /// increments the Tail entry pointer by 1 (wrapping if necessary). fn push_tail(&self) -> Option { - self.head_tail_phase - .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |head_tail| { - let (mut phase, tail, head) = ( - (head_tail & (1 << 32)) != 0, - (head_tail >> 16) as u16, - head_tail as u16, - ); - if self.is_full(head, tail) { + self.inner + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |state| { + let mut state = CompQueueState(state); + if self.is_full(state.head(), state.tail()) { return None; } - if tail as u32 + 1 >= self.size { + if state.tail() as u32 + 1 >= self.size { // We wrapped so flip phase - phase = !phase; + state.set_phase(!state.phase()); } - let new_tail = self.wrap_add(tail, 1); - Some( - (phase as u64) << 32 - | (new_tail as u64) << 16 - | head as u64, - ) + let new_tail = self.wrap_add(state.tail(), 1); + Some(state.with_tail(new_tail).0) }) .ok() - .map(|old_head_tail| { - let old_tail = (old_head_tail >> 16) as u16; - old_tail - }) + .map(|state| CompQueueState(state).tail()) } /// How many slots are occupied between the head and the tail i.e., how @@ -186,19 +216,15 @@ impl QueueState { if idx as u32 >= self.size { return Err("invalid index"); } - self.head_tail_phase - .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |head_tail| { - let (phase, tail, head) = ( - (head_tail & (1 << 32)) != 0, - (head_tail >> 16) as u16, - head_tail as u16, - ); - let pop_count = self.wrap_sub(idx, head); - if pop_count > self.avail_occupied(head, tail) { + self.inner + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |state| { + let state = CompQueueState(state); + let pop_count = self.wrap_sub(idx, state.head()); + if pop_count > self.avail_occupied(state.head(), state.tail()) { return None; } // Replace head with given idx - Some((phase as u64) << 32 | (tail as u64) << 16 | idx as u64) + Some(state.with_head(idx).0) }) .map_err(|_| "index too far") .map(|_| ()) @@ -206,23 +232,30 @@ impl QueueState { } impl QueueState { + /// Create a new `QueueState` for a Submission Queue + fn new_submission_state(size: u32) -> QueueState { + assert!(size >= MIN_QUEUE_SIZE && size <= MAX_QUEUE_SIZE); + let inner = SubQueueState(0); + Self { size, inner: AtomicU64::new(inner.0), _qt: PhantomData } + } + /// Attempt to return the Head entry pointer and then move it forward by 1. /// /// If the queue is empty this method returns [`None`]. /// Otherwise, this method returns the current Head entry pointer and then /// increments the Head entry pointer by 1 (wrapping if necessary). fn pop_head(&self) -> Option { - self.head_tail_phase - .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |head_tail| { - let (tail, head) = ((head_tail >> 16) as u16, head_tail as u16); - if self.is_empty(head, tail) { + self.inner + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |state| { + let state = SubQueueState(state); + if self.is_empty(state.head(), state.tail()) { return None; } - let new_head = self.wrap_add(head, 1); - Some((tail as u64) << 16 | new_head as u64) + let new_head = self.wrap_add(state.head(), 1); + Some(state.with_head(new_head).0) }) .ok() - .map(|old_head_tail| old_head_tail as u16) + .map(|state| SubQueueState(state).head()) } /// How many slots are empty between the tail and the head i.e., how many @@ -241,15 +274,15 @@ impl QueueState { if idx as u32 >= self.size { return Err("invalid index"); } - self.head_tail_phase - .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |head_tail| { - let (tail, head) = ((head_tail >> 16) as u16, head_tail as u16); - let push_count = self.wrap_sub(idx, tail); - if push_count > self.avail_empty(head, tail) { + self.inner + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |state| { + let state = SubQueueState(state); + let push_count = self.wrap_sub(idx, state.tail()); + if push_count > self.avail_empty(state.head(), state.tail()) { return None; } // Replace tail with given idx - Some((idx as u64) << 16 | head as u64) + Some(state.with_tail(idx).0) }) .map_err(|_| "index too far") .map(|_| ()) @@ -294,7 +327,7 @@ impl SubQueue { ctx: &DispCtx, ) -> Result { Self::validate(id, base, size, ctx)?; - Ok(Self { id, cq, state: QueueState::new(size), base }) + Ok(Self { id, cq, state: QueueState::new_submission_state(size), base }) } /// Attempt to move the Tail entry pointer forward to the given index. @@ -316,7 +349,7 @@ impl SubQueue { /// Returns the current Head entry pointer. pub fn head(&self) -> u16 { - self.state.head_tail_phase.load(Ordering::SeqCst) as u16 + SubQueueState(self.state.inner.load(Ordering::SeqCst)).head() } /// Returns the ID of this Submission Queue. @@ -408,7 +441,12 @@ impl CompQueue { hdl: pci::MsixHdl, ) -> Result { Self::validate(id, base, size, ctx)?; - Ok(Self { iv, state: QueueState::new(size), base, hdl }) + Ok(Self { + iv, + state: QueueState::new_completion_state(size), + base, + hdl, + }) } /// Attempt to move the Head entry pointer forward to the given index. @@ -436,15 +474,14 @@ impl CompQueue { /// /// See NVMe 1.0e Section 4.5 Completion Queue Entry - Phase Tag (P) pub fn phase(&self) -> u16 { - (self.state.head_tail_phase.load(Ordering::SeqCst) >> 32) as u16 + CompQueueState(self.state.inner.load(Ordering::SeqCst)).phase() as u16 } /// Fires an interrupt to the guest with the associated interrupt vector /// if the queue is not currently empty. pub fn fire_interrupt(&self, ctx: &DispCtx) { - let head_tail = self.state.head_tail_phase.load(Ordering::SeqCst); - let (tail, head) = ((head_tail >> 16) as u16, head_tail as u16); - if !self.state.is_empty(head, tail) { + let state = CompQueueState(self.state.inner.load(Ordering::SeqCst)); + if !self.state.is_empty(state.head(), state.tail()) { self.hdl.fire(self.iv, ctx); } } From 5f8653892e9301a1aecfb2bdefb24af7f4e2a10d Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Tue, 5 Oct 2021 13:13:22 -0700 Subject: [PATCH 06/23] nvme: only allowing writing to completion queue if you have a permit --- propolis/src/hw/nvme/mod.rs | 18 +++--- propolis/src/hw/nvme/ns.rs | 57 ++++++------------ propolis/src/hw/nvme/queue.rs | 109 +++++++++++++++++++++++++++------- 3 files changed, 113 insertions(+), 71 deletions(-) diff --git a/propolis/src/hw/nvme/mod.rs b/propolis/src/hw/nvme/mod.rs index 7ce5fd0ae..d823db27c 100644 --- a/propolis/src/hw/nvme/mod.rs +++ b/propolis/src/hw/nvme/mod.rs @@ -598,7 +598,7 @@ impl PciNvme { // Grab the Admin CQ too let cq = state.get_admin_cq(); - while let Some(sub) = sq.pop(ctx) { + while let Some((sub, cqe_permit)) = sq.pop(ctx) { use cmds::AdminCmd; let parsed = AdminCmd::parse(sub); @@ -629,7 +629,7 @@ impl PciNvme { } }; - sq.push_completion(sub.cid(), comp, ctx); + cqe_permit.push_completion(sub.cid(), comp, ctx); } // Notify for any newly added completions @@ -650,18 +650,16 @@ impl PciNvme { // Collect all the IO SQ entries, per namespace let mut io_cmds = BTreeMap::new(); - while let Some(sub) = sq.pop(ctx) { - io_cmds.entry(sub.nsid).or_insert_with(Vec::new).push(sub); + while let Some(sub_permit) = sq.pop(ctx) { + io_cmds + .entry(sub_permit.0.nsid) + .or_insert_with(Vec::new) + .push(sub_permit); } // Queue up said IO entries to the underlying block device, per namespace for (nsid, io_cmds) in io_cmds { - state.get_ns(nsid)?.queue_io_cmds( - io_cmds, - cq.clone(), - sq.clone(), - ctx, - )?; + state.get_ns(nsid)?.queue_io_cmds(io_cmds, ctx)?; } // Notify for any newly added completions diff --git a/propolis/src/hw/nvme/ns.rs b/propolis/src/hw/nvme/ns.rs index fc563dac3..81a43b60d 100644 --- a/propolis/src/hw/nvme/ns.rs +++ b/propolis/src/hw/nvme/ns.rs @@ -7,7 +7,7 @@ use crate::{common, dispatch::DispCtx}; use super::bits::{self, RawSubmission}; use super::cmds::{Completion, ReadCmd, WriteCmd}; -use super::queue::{CompQueue, SubQueue}; +use super::queue::CompQueueEntryPermit; use super::NvmeError; /// Max number of namespaces we support @@ -69,12 +69,10 @@ impl NvmeNs { /// block device as appropriate. pub(super) fn queue_io_cmds( &self, - cmds: Vec, - cq: Arc, - sq: Arc, + cmds: Vec<(RawSubmission, CompQueueEntryPermit)>, ctx: &DispCtx, ) -> Result<(), NvmeError> { - for sub in cmds { + for (sub, cqe_permit) in cmds { let cmd = NvmCmd::parse(sub)?; match cmd { NvmCmd::Write(_) if self.is_ro => { @@ -82,38 +80,28 @@ impl NvmeNs { bits::StatusCodeType::CmdSpecific, bits::STS_WRITE_READ_ONLY_RANGE, ); - sq.push_completion(sub.cid(), comp, ctx); + cqe_permit.push_completion(sub.cid(), comp, ctx); } NvmCmd::Write(cmd) => { - self.write_cmd(sub.cid(), cmd, ctx, cq.clone(), sq.clone()) + self.write_cmd(sub.cid(), cmd, ctx, cqe_permit) } NvmCmd::Read(cmd) => { - self.read_cmd(sub.cid(), cmd, ctx, cq.clone(), sq.clone()) - } - NvmCmd::Flush => { - self.flush_cmd(sub.cid(), cq.clone(), sq.clone()) + self.read_cmd(sub.cid(), cmd, ctx, cqe_permit) } + NvmCmd::Flush => self.flush_cmd(sub.cid(), cqe_permit), NvmCmd::Unknown(_) => { // For any other command, just immediately complete it let comp = Completion::generic_err(bits::STS_INTERNAL_ERR); - sq.push_completion(sub.cid(), comp, ctx); + cqe_permit.push_completion(sub.cid(), comp, ctx); } } } - // Notify for any newly added completions - cq.fire_interrupt(ctx); - Ok(()) } /// Enqueues a flush to the underlying block device - fn flush_cmd( - &self, - cid: u16, - cq: Arc, - sq: Arc, - ) { + fn flush_cmd(&self, cid: u16, permit: CompQueueEntryPermit) { // TODO: handles if it gets unmapped? self.bdev.enqueue(Request { op: BlockOp::Flush, @@ -121,8 +109,7 @@ impl NvmeNs { xfer_left: 0, bufs: VecDeque::new(), cid, - cq, - sq, + permit, }); } @@ -132,8 +119,7 @@ impl NvmeNs { cid: u16, cmd: ReadCmd, ctx: &DispCtx, - cq: Arc, - sq: Arc, + permit: CompQueueEntryPermit, ) { probe_nvme_read_enqueue!(|| (cid, cmd.slba, cmd.nlb)); let off = self.nlb_to_size(cmd.slba as usize); @@ -146,8 +132,7 @@ impl NvmeNs { xfer_left: size, bufs, cid, - cq, - sq, + permit, }); } @@ -157,8 +142,7 @@ impl NvmeNs { cid: u16, cmd: WriteCmd, ctx: &DispCtx, - cq: Arc, - sq: Arc, + permit: CompQueueEntryPermit, ) { probe_nvme_write_enqueue!(|| (cid, cmd.slba, cmd.nlb)); let off = self.nlb_to_size(cmd.slba as usize); @@ -171,8 +155,7 @@ impl NvmeNs { xfer_left: size, bufs, cid, - cq, - sq, + permit, }); } } @@ -194,11 +177,8 @@ pub struct Request { /// The associated command id cid: u16, - /// The associated Completion Queue - cq: Arc, - - /// The associated Submission Queue - sq: Arc, + /// The permit for pushing onto the Completion Queue + permit: CompQueueEntryPermit, } impl BlockReq for Request { @@ -248,9 +228,6 @@ impl BlockReq for Request { _ => {} } - self.sq.push_completion(self.cid, comp, ctx); - - // TODO: should this be done here? - self.cq.fire_interrupt(ctx); + self.permit.push_completion(self.cid, comp, ctx); } } diff --git a/propolis/src/hw/nvme/queue.rs b/propolis/src/hw/nvme/queue.rs index dc8f1f390..b5dce169c 100644 --- a/propolis/src/hw/nvme/queue.rs +++ b/propolis/src/hw/nvme/queue.rs @@ -63,13 +63,19 @@ bitstruct! { /// See NVMe 1.0e Section 4.1 Submission Queue & Completion Queue Definition tail: u16 = 16..32; + /// Number of entries that are available for use. + /// + /// Starts off as queue size - 1 and gets decremented for each corresponding + /// Submission Queue entry we begin servicing. + avail: u16 = 32..48; + /// The current phase tag. /// /// The Phase Tag is used to identify to the host (VM) that a Completion /// entry is new. Flips every time the Tail entry pointer wraps around. /// /// See NVMe 1.0e Section 4.5 Completion Queue Entry - Phase Tag (P) - phase: bool = 32; + phase: bool = 48; } } @@ -173,7 +179,8 @@ impl QueueState { // As the device side, we start with our phase tag as asserted (1) // as the host side (VM) will create all the Completion Queue entries // with the phase initially zeroed out. - let inner = CompQueueState(0).with_phase(true); + let inner = + CompQueueState(0).with_phase(true).with_avail((size - 1) as u16); Self { size, inner: AtomicU64::new(inner.0), _qt: PhantomData } } @@ -223,8 +230,9 @@ impl QueueState { if pop_count > self.avail_occupied(state.head(), state.tail()) { return None; } - // Replace head with given idx - Some(state.with_head(idx).0) + // Replace head with given idx and update the number of available slots + let avail = state.avail() + pop_count; + Some(state.with_head(idx).with_avail(avail).0) }) .map_err(|_| "index too far") .map(|_| ()) @@ -336,13 +344,21 @@ impl SubQueue { } /// Returns the next entry off of the Queue or [`None`] if it is empty. - pub fn pop(&self, ctx: &DispCtx) -> Option { + pub fn pop( + self: &Arc, + ctx: &DispCtx, + ) -> Option<(bits::RawSubmission, CompQueueEntryPermit)> { + // Attempt to reserve an entry on the Completion Queue + let cqe_permit = self.cq.reserve_entry(self.clone())?; + // Reserve some space on the Completion Queue if let Some(idx) = self.state.pop_head() { let mem = ctx.mctx.memctx(); let ent: Option = mem.read(self.entry_addr(idx)); // XXX: handle a guest addr that becomes unmapped later - ent + ent.map(|ent| (ent, cqe_permit)) } else { + // No Submission Queue entry, so return the CQE permit + cqe_permit.remit(); None } } @@ -362,21 +378,6 @@ impl SubQueue { self.cq.clone() } - /// Push a new entry onto this Submission Queue's corresponding - /// Completion Queue. - pub fn push_completion(&self, cid: u16, comp: Completion, ctx: &DispCtx) { - let completion = bits::RawCompletion { - dw0: comp.dw0, - rsvd: 0, - sqhd: self.head(), - sqid: self.id(), - cid, - status_phase: comp.status | self.cq.phase(), - }; - - self.cq.push(completion, ctx); - } - /// Returns the corresponding [`GuestAddr`] for a given entry in /// the Submission Queue. fn entry_addr(&self, idx: u16) -> GuestAddr { @@ -467,6 +468,29 @@ impl CompQueue { } } + /// Attempt to reserve an entry in the Completion Queue. + /// + /// An entry permit allows the user to push onto the Completion Queue. + pub fn reserve_entry( + self: &Arc, + sq: Arc, + ) -> Option { + self.state + .inner + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |state| { + let state = CompQueueState(state); + let avail = state.avail(); + // No more spots available + if avail == 0 { + return None; + } + // Otherwise claim a spot + Some(state.with_avail(avail - 1).0) + }) + .ok() + .map(|_| CompQueueEntryPermit { cq: self.clone(), sq }) + } + /// Returns the current Phase Tag bit. /// /// The current Phase Tag to identify to the host (VM) that a Completion @@ -521,3 +545,46 @@ impl CompQueue { region.map(|_| ()).ok_or(QueueCreateErr::InvalidBaseAddr) } } + +/// A type which allows pushing a Completion Entry onto the Completion Queue. +pub struct CompQueueEntryPermit { + /// The corresponding Completion Queue for which we have a permit. + cq: Arc, + + /// The Submission Queue for which this entry is reserved. + sq: Arc, +} + +impl CompQueueEntryPermit { + /// Consume the permit by placing an entry into the Completion Queue. + pub fn push_completion(self, cid: u16, comp: Completion, ctx: &DispCtx) { + let completion = bits::RawCompletion { + dw0: comp.dw0, + rsvd: 0, + sqhd: self.sq.head(), + sqid: self.sq.id(), + cid, + status_phase: comp.status | self.cq.phase(), + }; + + self.cq.push(completion, ctx); + + // TODO: should this be done here? + self.cq.fire_interrupt(ctx); + } + + /// Return the permit without having actually used it. + /// + /// Frees up the space for someone else to grab it via `CompQueue::reserve_entry`. + fn remit(self) { + self.cq + .state + .inner + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |state| { + let state = CompQueueState(state); + let avail = state.avail(); + Some(state.with_avail(avail + 1).0) + }) + .unwrap(); + } +} From 0350efbfe475637b573aa9663bb970b77a3b1dc3 Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Tue, 5 Oct 2021 14:26:02 -0700 Subject: [PATCH 07/23] nvme: create proper error type instead of &'static str --- propolis/src/hw/nvme/mod.rs | 26 ++++++++++---------------- propolis/src/hw/nvme/queue.rs | 26 ++++++++++++++++++-------- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/propolis/src/hw/nvme/mod.rs b/propolis/src/hw/nvme/mod.rs index d823db27c..b416d7d96 100644 --- a/propolis/src/hw/nvme/mod.rs +++ b/propolis/src/hw/nvme/mod.rs @@ -49,6 +49,9 @@ pub enum NvmeError { #[error("failed to create queue: {0}")] QueueCreateErr(#[from] queue::QueueCreateErr), + #[error("failed to update queue: {0}")] + QueueUpdateError(#[from] queue::QueueUpdateError), + /// MSI-X Interrupt handle is unavailable #[error("the MSI-X interrupt handle is unavailable")] MsixHdlUnavailable, @@ -530,10 +533,7 @@ impl PciNvme { let val = wo.read_u32().try_into().unwrap(); let state = self.state.lock().unwrap(); let admin_sq = state.get_admin_sq(); - match admin_sq.notify_tail(val) { - Ok(_) => {} - Err(_) => todo!("set controller error state"), - } + admin_sq.notify_tail(val)?; // Process any new SQ entries self.process_admin_queue(state, admin_sq, ctx)?; @@ -542,10 +542,8 @@ impl PciNvme { let val = wo.read_u32().try_into().unwrap(); let state = self.state.lock().unwrap(); let admin_cq = state.get_admin_cq(); - match admin_cq.notify_head(val) { - Ok(_) => {} - Err(_) => todo!("set controller error state"), - } + admin_cq.notify_head(val)?; + // TODO: post any entries to the CQ now that it has more space } @@ -567,19 +565,15 @@ impl PciNvme { // Completion Queue y Head Doorbell let y = (off - 4) >> 3; let cq = state.get_cq(y as u16)?; - match cq.notify_head(val) { - Ok(_) => {} - Err(_) => todo!("set controller error state"), - } + cq.notify_head(val)?; + // TODO: post any entries to the CQ now that it has more space } else { // Submission Queue y Tail Doorbell let y = off >> 3; let sq = state.get_sq(y as u16)?; - match sq.notify_tail(val) { - Ok(_) => {} - Err(_) => todo!("set controller error state"), - } + sq.notify_tail(val)?; + self.process_io_queue(state, sq, ctx)?; } } diff --git a/propolis/src/hw/nvme/queue.rs b/propolis/src/hw/nvme/queue.rs index b5dce169c..6711fb86d 100644 --- a/propolis/src/hw/nvme/queue.rs +++ b/propolis/src/hw/nvme/queue.rs @@ -219,9 +219,9 @@ impl QueueState { /// must have enough occupied slots otherwise we return an error. /// Conceptually this method indicates some entries have been consumed /// from the queue. - fn pop_head_to(&self, idx: u16) -> Result<(), &'static str> { + fn pop_head_to(&self, idx: u16) -> Result<(), QueueUpdateError> { if idx as u32 >= self.size { - return Err("invalid index"); + return Err(QueueUpdateError::InvalidEntry); } self.inner .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |state| { @@ -234,7 +234,7 @@ impl QueueState { let avail = state.avail() + pop_count; Some(state.with_head(idx).with_avail(avail).0) }) - .map_err(|_| "index too far") + .map_err(|_| QueueUpdateError::TooManyEntries) .map(|_| ()) } } @@ -278,9 +278,9 @@ impl QueueState { /// must have enough empty slots available otherwise we return an error. /// Conceptually this method indicates new entries have been added to the /// queue. - fn push_tail_to(&self, idx: u16) -> Result<(), &'static str> { + fn push_tail_to(&self, idx: u16) -> Result<(), QueueUpdateError> { if idx as u32 >= self.size { - return Err("invalid index"); + return Err(QueueUpdateError::InvalidEntry); } self.inner .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |state| { @@ -292,7 +292,7 @@ impl QueueState { // Replace tail with given idx Some(state.with_tail(idx).0) }) - .map_err(|_| "index too far") + .map_err(|_| QueueUpdateError::TooManyEntries) .map(|_| ()) } } @@ -309,6 +309,16 @@ pub enum QueueCreateErr { InvalidSize, } +/// Errors that may be encountered while adjusting Queue head/tail pointers. +#[derive(Error, Debug)] +pub enum QueueUpdateError { + #[error("tried to move head or tail pointer to an invalid index")] + InvalidEntry, + + #[error("tried to push or pop too many entries given the current head/tail")] + TooManyEntries, +} + /// Type for manipulating Submission Queues. pub struct SubQueue { /// The ID of this Submission Queue. @@ -339,7 +349,7 @@ impl SubQueue { } /// Attempt to move the Tail entry pointer forward to the given index. - pub fn notify_tail(&self, idx: u16) -> Result<(), &'static str> { + pub fn notify_tail(&self, idx: u16) -> Result<(), QueueUpdateError> { self.state.push_tail_to(idx) } @@ -451,7 +461,7 @@ impl CompQueue { } /// Attempt to move the Head entry pointer forward to the given index. - pub fn notify_head(&self, idx: u16) -> Result<(), &'static str> { + pub fn notify_head(&self, idx: u16) -> Result<(), QueueUpdateError> { self.state.pop_head_to(idx) } From 234d562428660edd12b40328882948c48586cc1f Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Tue, 5 Oct 2021 15:07:28 -0700 Subject: [PATCH 08/23] nvme: kick SQ's on CQ doorbell if we previously didn't have a permit available --- propolis/src/hw/nvme/mod.rs | 23 +++++++++++++++++++---- propolis/src/hw/nvme/queue.rs | 19 +++++++++++++++++-- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/propolis/src/hw/nvme/mod.rs b/propolis/src/hw/nvme/mod.rs index b416d7d96..846def719 100644 --- a/propolis/src/hw/nvme/mod.rs +++ b/propolis/src/hw/nvme/mod.rs @@ -544,7 +544,13 @@ impl PciNvme { let admin_cq = state.get_admin_cq(); admin_cq.notify_head(val)?; - // TODO: post any entries to the CQ now that it has more space + // We may have skipped pulling entries off the admin sq + // due to no available completion entry permit, so just + // kick it here again in case. + if admin_cq.kick() { + let admin_sq = state.get_admin_sq(); + self.process_admin_queue(state, admin_sq, ctx)?; + } } CtrlrReg::IOQueueDoorBells => { @@ -567,14 +573,23 @@ impl PciNvme { let cq = state.get_cq(y as u16)?; cq.notify_head(val)?; - // TODO: post any entries to the CQ now that it has more space + // We may have skipped pulling entries off some SQ due to this + // CQ having no available entry slots. Since we've just free'd + // up some slots, kick the SQs (excl. admin) here just in case. + // TODO: worth kicking only the SQs specifically associated + // with this CQ? + if cq.kick() { + for sq in state.sqs.iter().skip(1).flatten() { + self.process_io_queue(&state, sq.clone(), ctx)?; + } + } } else { // Submission Queue y Tail Doorbell let y = off >> 3; let sq = state.get_sq(y as u16)?; sq.notify_tail(val)?; - self.process_io_queue(state, sq, ctx)?; + self.process_io_queue(&state, sq, ctx)?; } } } @@ -635,7 +650,7 @@ impl PciNvme { /// Process any new entries in an I/O Submission Queue fn process_io_queue( &self, - state: MutexGuard, + state: &NvmeCtrl, sq: Arc, ctx: &DispCtx, ) -> Result<(), NvmeError> { diff --git a/propolis/src/hw/nvme/queue.rs b/propolis/src/hw/nvme/queue.rs index 6711fb86d..b5f88a44f 100644 --- a/propolis/src/hw/nvme/queue.rs +++ b/propolis/src/hw/nvme/queue.rs @@ -1,5 +1,5 @@ use std::marker::PhantomData; -use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::Arc; use super::bits::{self, RawCompletion, RawSubmission}; @@ -315,7 +315,9 @@ pub enum QueueUpdateError { #[error("tried to move head or tail pointer to an invalid index")] InvalidEntry, - #[error("tried to push or pop too many entries given the current head/tail")] + #[error( + "tried to push or pop too many entries given the current head/tail" + )] TooManyEntries, } @@ -433,6 +435,11 @@ pub struct CompQueue { /// Queue state such as the size and current head/tail entry pointers. state: QueueState, + /// Flag set when when've run out of `CompQueueEntryPermit`'s to give out. + /// + /// Polled on CQ Doorbell events so that we kick the SQ's that wanted a CompQueueEntryPermit. + kick: AtomicBool, + /// The [`GuestAddr`] at which the Queue is mapped. base: GuestAddr, @@ -455,6 +462,7 @@ impl CompQueue { Ok(Self { iv, state: QueueState::new_completion_state(size), + kick: AtomicBool::new(false), base, hdl, }) @@ -492,6 +500,8 @@ impl CompQueue { let avail = state.avail(); // No more spots available if avail == 0 { + // Make sure we kick the SQ's when we have space available again + self.kick.store(true, Ordering::SeqCst); return None; } // Otherwise claim a spot @@ -501,6 +511,11 @@ impl CompQueue { .map(|_| CompQueueEntryPermit { cq: self.clone(), sq }) } + /// Returns whether the SQ's should be kicked due to no permits being available previously. + pub fn kick(&self) -> bool { + self.kick.swap(false, Ordering::SeqCst) + } + /// Returns the current Phase Tag bit. /// /// The current Phase Tag to identify to the host (VM) that a Completion From 8a2bbff0b542380ebdbb1db96b59c6270de4c3a5 Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Tue, 5 Oct 2021 15:21:30 -0700 Subject: [PATCH 09/23] nvme: refactor cq push method and add comment for unwrap. --- propolis/src/hw/nvme/queue.rs | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/propolis/src/hw/nvme/queue.rs b/propolis/src/hw/nvme/queue.rs index b5f88a44f..5f0b7da70 100644 --- a/propolis/src/hw/nvme/queue.rs +++ b/propolis/src/hw/nvme/queue.rs @@ -473,17 +473,15 @@ impl CompQueue { self.state.pop_head_to(idx) } - /// Attempt to add a new entry to the Completion Queue. - /// - /// TODO: handle the case where the queue may be currently full. - fn push(&self, entry: RawCompletion, ctx: &DispCtx) { - if let Some(idx) = self.state.push_tail() { - let mem = ctx.mctx.memctx(); - let addr = self.entry_addr(idx); - mem.write(addr, &entry); - // XXX: handle a guest addr that becomes unmapped later - // XXX: figure out interrupts - } + /// Add a new entry to the Completion Queue while consuming a `CompQueueEntryPermit`. + fn push(&self, _: CompQueueEntryPermit, entry: RawCompletion, ctx: &DispCtx) { + // Since we have a permit, there should always be at least + // one space in the queue and this unwrap shouldn't fail. + let idx = self.state.push_tail().unwrap(); + let mem = ctx.mctx.memctx(); + let addr = self.entry_addr(idx); + mem.write(addr, &entry); + // XXX: handle a guest addr that becomes unmapped later } /// Attempt to reserve an entry in the Completion Queue. @@ -592,10 +590,12 @@ impl CompQueueEntryPermit { status_phase: comp.status | self.cq.phase(), }; - self.cq.push(completion, ctx); + let cq = self.cq.clone(); + + cq.push(self, completion, ctx); // TODO: should this be done here? - self.cq.fire_interrupt(ctx); + cq.fire_interrupt(ctx); } /// Return the permit without having actually used it. From 655c0a65d1f18bb5c5bc302376d8610718458241 Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Tue, 5 Oct 2021 17:40:56 -0700 Subject: [PATCH 10/23] remove extraneous comment --- propolis/src/hw/nvme/queue.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/propolis/src/hw/nvme/queue.rs b/propolis/src/hw/nvme/queue.rs index 5f0b7da70..3af1e7547 100644 --- a/propolis/src/hw/nvme/queue.rs +++ b/propolis/src/hw/nvme/queue.rs @@ -362,7 +362,6 @@ impl SubQueue { ) -> Option<(bits::RawSubmission, CompQueueEntryPermit)> { // Attempt to reserve an entry on the Completion Queue let cqe_permit = self.cq.reserve_entry(self.clone())?; - // Reserve some space on the Completion Queue if let Some(idx) = self.state.pop_head() { let mem = ctx.mctx.memctx(); let ent: Option = mem.read(self.entry_addr(idx)); From b475e63f6fbc57885b08c27077f79ebab789b796 Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Tue, 5 Oct 2021 17:49:23 -0700 Subject: [PATCH 11/23] nvme: reduce pub methods on queues. --- propolis/src/hw/nvme/queue.rs | 65 +++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/propolis/src/hw/nvme/queue.rs b/propolis/src/hw/nvme/queue.rs index 3af1e7547..0e8ab2314 100644 --- a/propolis/src/hw/nvme/queue.rs +++ b/propolis/src/hw/nvme/queue.rs @@ -374,21 +374,21 @@ impl SubQueue { } } + /// Returns the corresponding Completion Queue + pub fn cq(&self) -> Arc { + self.cq.clone() + } + /// Returns the current Head entry pointer. - pub fn head(&self) -> u16 { + fn head(&self) -> u16 { SubQueueState(self.state.inner.load(Ordering::SeqCst)).head() } /// Returns the ID of this Submission Queue. - pub fn id(&self) -> QueueId { + fn id(&self) -> QueueId { self.id } - /// Returns the corresponding Completion Queue - pub fn cq(&self) -> Arc { - self.cq.clone() - } - /// Returns the corresponding [`GuestAddr`] for a given entry in /// the Submission Queue. fn entry_addr(&self, idx: u16) -> GuestAddr { @@ -472,21 +472,24 @@ impl CompQueue { self.state.pop_head_to(idx) } - /// Add a new entry to the Completion Queue while consuming a `CompQueueEntryPermit`. - fn push(&self, _: CompQueueEntryPermit, entry: RawCompletion, ctx: &DispCtx) { - // Since we have a permit, there should always be at least - // one space in the queue and this unwrap shouldn't fail. - let idx = self.state.push_tail().unwrap(); - let mem = ctx.mctx.memctx(); - let addr = self.entry_addr(idx); - mem.write(addr, &entry); - // XXX: handle a guest addr that becomes unmapped later + /// Fires an interrupt to the guest with the associated interrupt vector + /// if the queue is not currently empty. + pub fn fire_interrupt(&self, ctx: &DispCtx) { + let state = CompQueueState(self.state.inner.load(Ordering::SeqCst)); + if !self.state.is_empty(state.head(), state.tail()) { + self.hdl.fire(self.iv, ctx); + } + } + + /// Returns whether the SQ's should be kicked due to no permits being available previously. + pub fn kick(&self) -> bool { + self.kick.swap(false, Ordering::SeqCst) } /// Attempt to reserve an entry in the Completion Queue. /// /// An entry permit allows the user to push onto the Completion Queue. - pub fn reserve_entry( + fn reserve_entry( self: &Arc, sq: Arc, ) -> Option { @@ -508,9 +511,20 @@ impl CompQueue { .map(|_| CompQueueEntryPermit { cq: self.clone(), sq }) } - /// Returns whether the SQ's should be kicked due to no permits being available previously. - pub fn kick(&self) -> bool { - self.kick.swap(false, Ordering::SeqCst) + /// Add a new entry to the Completion Queue while consuming a `CompQueueEntryPermit`. + fn push( + &self, + _: CompQueueEntryPermit, + entry: RawCompletion, + ctx: &DispCtx, + ) { + // Since we have a permit, there should always be at least + // one space in the queue and this unwrap shouldn't fail. + let idx = self.state.push_tail().unwrap(); + let mem = ctx.mctx.memctx(); + let addr = self.entry_addr(idx); + mem.write(addr, &entry); + // XXX: handle a guest addr that becomes unmapped later } /// Returns the current Phase Tag bit. @@ -519,19 +533,10 @@ impl CompQueue { /// entry is new. Flips every time the Tail entry pointer wraps around. /// /// See NVMe 1.0e Section 4.5 Completion Queue Entry - Phase Tag (P) - pub fn phase(&self) -> u16 { + fn phase(&self) -> u16 { CompQueueState(self.state.inner.load(Ordering::SeqCst)).phase() as u16 } - /// Fires an interrupt to the guest with the associated interrupt vector - /// if the queue is not currently empty. - pub fn fire_interrupt(&self, ctx: &DispCtx) { - let state = CompQueueState(self.state.inner.load(Ordering::SeqCst)); - if !self.state.is_empty(state.head(), state.tail()) { - self.hdl.fire(self.iv, ctx); - } - } - /// Returns the corresponding [`GuestAddr`] for a given entry in /// the Completion Queue. fn entry_addr(&self, idx: u16) -> GuestAddr { From c3c4d78dbe149be9d121d212c9753de835f389b9 Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Wed, 6 Oct 2021 19:23:23 -0700 Subject: [PATCH 12/23] nvme: remove multi-namespace support for now and expose a single namespace per device. --- propolis/src/hw/mod.rs | 3 +- propolis/src/hw/nvme/admin.rs | 12 +- propolis/src/hw/nvme/mod.rs | 78 ++++++------ propolis/src/hw/nvme/ns.rs | 233 ---------------------------------- standalone/src/main.rs | 42 ++---- 5 files changed, 52 insertions(+), 316 deletions(-) delete mode 100644 propolis/src/hw/nvme/ns.rs diff --git a/propolis/src/hw/mod.rs b/propolis/src/hw/mod.rs index 471965195..9e57162b3 100644 --- a/propolis/src/hw/mod.rs +++ b/propolis/src/hw/mod.rs @@ -1,7 +1,6 @@ pub mod chipset; pub mod ibmpc; -// XXX: skip nvme for now -//pub mod nvme; +pub mod nvme; pub mod pci; pub mod ps2ctrl; pub mod qemu; diff --git a/propolis/src/hw/nvme/admin.rs b/propolis/src/hw/nvme/admin.rs index d9139c2d9..f8d208e64 100644 --- a/propolis/src/hw/nvme/admin.rs +++ b/propolis/src/hw/nvme/admin.rs @@ -117,18 +117,14 @@ impl NvmeCtrl { ) -> cmds::Completion { match cmd.cns { IDENT_CNS_NAMESPACE => match cmd.nsid { - n if n > 0 && n <= super::ns::MAX_NUM_NAMESPACES as u32 => { + 1 => { assert!(size_of::() <= PAGE_SIZE); let buf = cmd .data(ctx.mctx.memctx()) .next() .expect("missing prp entry for ident response"); - if let Ok(ns) = self.get_ns(n) { - assert!(ctx.mctx.memctx().write(buf.0, &ns.ident)); - cmds::Completion::success() - } else { - cmds::Completion::generic_err(STS_INVALID_NS) - } + assert!(ctx.mctx.memctx().write(buf.0, &self.ns_ident)); + cmds::Completion::success() } // 0 is not a valid NSID (See NVMe 1.0e, Section 6.1 Namespaces) // We also don't currently support namespace management @@ -143,7 +139,7 @@ impl NvmeCtrl { .data(ctx.mctx.memctx()) .next() .expect("missing prp entry for ident response"); - assert!(ctx.mctx.memctx().write(buf.0, &self.ident)); + assert!(ctx.mctx.memctx().write(buf.0, &self.ctrl_ident)); cmds::Completion::success() } // We currently present NVMe version 1.0 in which CNS is a 1-bit field diff --git a/propolis/src/hw/nvme/mod.rs b/propolis/src/hw/nvme/mod.rs index 846def719..ab7e8cce5 100644 --- a/propolis/src/hw/nvme/mod.rs +++ b/propolis/src/hw/nvme/mod.rs @@ -3,7 +3,7 @@ use std::convert::TryInto; use std::mem::size_of; use std::sync::{Arc, Mutex, MutexGuard}; -use crate::common::*; +use crate::{block, common::*}; use crate::dispatch::DispCtx; use crate::hw::pci; use crate::util::regmap::RegMap; @@ -11,21 +11,21 @@ use crate::util::regmap::RegMap; use lazy_static::lazy_static; use thiserror::Error; -pub use ns::{NvmeNs, Request}; - mod admin; mod bits; mod cmds; -mod ns; mod queue; use bits::*; -use ns::MAX_NUM_NAMESPACES; use queue::{CompQueue, QueueId, SubQueue}; /// The max number of MSI-X interrupts we support const NVME_MSIX_COUNT: u16 = 1024; +/// Supported block size. +/// TODO: Support more +const BLOCK_SZ: u64 = 512; + /// NVMe errors #[derive(Debug, Error)] pub enum NvmeError { @@ -119,11 +119,11 @@ struct NvmeCtrl { /// The list of Submission Queues handled by the controller sqs: [Option>; MAX_NUM_QUEUES], - /// The list of namespaces handled by the controller - nss: [Option; MAX_NUM_NAMESPACES], - /// The Identify structure returned for Identify controller commands - ident: IdentifyController, + ctrl_ident: IdentifyController, + + /// The Identify structure returned for Identify namespace commands + ns_ident: IdentifyNamespace, } impl NvmeCtrl { @@ -242,26 +242,6 @@ impl NvmeCtrl { self.get_sq(queue::ADMIN_QUEUE_ID).unwrap() } - /// Add a new namespace to the controller - fn add_ns(&mut self, ns: NvmeNs) -> Result<(), NvmeError> { - // Find the first empty spot - if let Some(spot) = self.nss.iter_mut().find(|n| n.is_none()) { - *spot = Some(ns); - self.ident.nn += 1; - Ok(()) - } else { - Err(NvmeError::TooManyNamespaces) - } - } - - /// Returns a reference to the [`NvmeNs`] which corresponds to the given namespace id (`nsid`). - fn get_ns(&self, nsid: u32) -> Result<&NvmeNs, NvmeError> { - debug_assert!((nsid as usize) <= MAX_NUM_NAMESPACES); - self.nss[nsid as usize - 1] - .as_ref() - .ok_or(NvmeError::InvalidNamespace(nsid)) - } - /// Performs a Controller Reset. /// /// The reset deletes all I/O Submission & Completion Queues, resets @@ -293,7 +273,7 @@ pub struct PciNvme { impl PciNvme { /// Create a new pci-nvme device with the given values - pub fn create(vendor: u16, device: u16) -> Arc { + pub fn create(vendor: u16, device: u16, info: block::DeviceInfo) -> Arc { let builder = pci::Builder::new(pci::Ident { vendor_id: vendor, device_id: device, @@ -314,7 +294,7 @@ impl PciNvme { // Initialize the Identify structure returned when the host issues // an Identify Controller command. - let ident = bits::IdentifyController { + let ctrl_ident = bits::IdentifyController { vid: vendor, ssvid: vendor, // TODO: fill out serial number @@ -323,13 +303,33 @@ impl PciNvme { // data, so required (minimum) == maximum sqes: NvmQueueEntrySize(0).with_maximum(sqes).with_required(sqes), cqes: NvmQueueEntrySize(0).with_maximum(cqes).with_required(cqes), - // No namespaces initially - nn: 0, + // Supporting multiple namespaces complicates I/O dispatching, + // so for now we limit the device to a single namespace. + nn: 1, // bit 0 indicates volatile write cache is present vwc: 1, ..Default::default() }; + // Initialize the Identify structure returned when the host issues + // an Identify Namespace command. + let total_bytes = info.total_size * info.block_size as u64; + let nsze = total_bytes / BLOCK_SZ; + let mut ns_ident = bits::IdentifyNamespace { + // No thin provisioning so nsze == ncap == nuse + nsze, + ncap: nsze, + nuse: nsze, + nlbaf: 0, // We only support a single LBA format (1 but 0-based) + flbas: 0, // And it is at index 0 in the lbaf array + ..Default::default() + }; + + // Update the block format we support + debug_assert!(BLOCK_SZ.is_power_of_two(), "BLOCK_SZ must be a power of 2"); + debug_assert!(BLOCK_SZ >= 512, "BLOCK_SZ must be at least 512 bytes"); + ns_ident.lbaf[0].lbads = BLOCK_SZ.trailing_zeros() as u8; + // Initialize the CAP "register" leaving most values // at their defaults (0): // TO = 0 => 0ms to wait for controller to be ready @@ -370,8 +370,8 @@ impl PciNvme { msix_hdl: None, cqs: Default::default(), sqs: Default::default(), - nss: Default::default(), - ident, + ctrl_ident, + ns_ident, }; let nvme = PciNvme { state: Mutex::new(state) }; @@ -386,12 +386,6 @@ impl PciNvme { .finish(Arc::new(nvme)) } - /// Add a new namespace to the controller - pub fn add_ns(&self, ns: NvmeNs) -> Result<(), NvmeError> { - let mut state = self.state.lock().unwrap(); - state.add_ns(ns) - } - /// Service a write to the NVMe Controller Configuration from the VM fn ctrlr_cfg_write( &self, @@ -668,7 +662,7 @@ impl PciNvme { // Queue up said IO entries to the underlying block device, per namespace for (nsid, io_cmds) in io_cmds { - state.get_ns(nsid)?.queue_io_cmds(io_cmds, ctx)?; + //state.get_ns(nsid)?.queue_io_cmds(io_cmds, ctx)?; } // Notify for any newly added completions diff --git a/propolis/src/hw/nvme/ns.rs b/propolis/src/hw/nvme/ns.rs deleted file mode 100644 index 81a43b60d..000000000 --- a/propolis/src/hw/nvme/ns.rs +++ /dev/null @@ -1,233 +0,0 @@ -use std::collections::VecDeque; -use std::sync::Arc; - -use crate::block::*; -use crate::hw::nvme::cmds::{self, NvmCmd}; -use crate::{common, dispatch::DispCtx}; - -use super::bits::{self, RawSubmission}; -use super::cmds::{Completion, ReadCmd, WriteCmd}; -use super::queue::CompQueueEntryPermit; -use super::NvmeError; - -/// Max number of namespaces we support -pub const MAX_NUM_NAMESPACES: usize = 16; - -/// Supported block size. -/// TODO: Support more -const BLOCK_SZ: u64 = 512; - -/// NVMe Namespace with underlying block device -pub struct NvmeNs { - /// The Identify structure returned for Identify namespace commands - pub ident: bits::IdentifyNamespace, - - /// The underlying block device to service read/write requests - bdev: Arc>, - - /// Whether the underlying block device readonly - is_ro: bool, -} - -impl NvmeNs { - /// Create a new NVMe namespace with the given block device - pub fn create(bdev: Arc>) -> Self { - let binfo = bdev.inquire(); - let total_bytes = binfo.total_size * binfo.block_size as u64; - let nsze = total_bytes / BLOCK_SZ; - - let mut ident = bits::IdentifyNamespace { - // No thin provisioning so nsze == ncap == nuse - nsze, - ncap: nsze, - nuse: nsze, - nlbaf: 0, // We only support a single LBA format (1 but 0-based) - flbas: 0, // And it is at index 0 in the lbaf array - ..Default::default() - }; - - debug_assert_eq!( - BLOCK_SZ.count_ones(), - 1, - "BLOCK_SZ must be a power of 2" - ); - debug_assert!( - BLOCK_SZ.trailing_zeros() >= 9, - "BLOCK_SZ must be at least 512 bytes" - ); - ident.lbaf[0].lbads = BLOCK_SZ.trailing_zeros() as u8; - - NvmeNs { ident, bdev, is_ro: !binfo.writable } - } - - /// Convert some number of logical blocks to bytes with the currently active LBA data size - fn nlb_to_size(&self, b: usize) -> usize { - b << (self.ident.lbaf[(self.ident.flbas & 0xF) as usize].lbads) - } - - /// Takes the given list of raw IO commands and queues up reads and writes to the underlying - /// block device as appropriate. - pub(super) fn queue_io_cmds( - &self, - cmds: Vec<(RawSubmission, CompQueueEntryPermit)>, - ctx: &DispCtx, - ) -> Result<(), NvmeError> { - for (sub, cqe_permit) in cmds { - let cmd = NvmCmd::parse(sub)?; - match cmd { - NvmCmd::Write(_) if self.is_ro => { - let comp = Completion::specific_err( - bits::StatusCodeType::CmdSpecific, - bits::STS_WRITE_READ_ONLY_RANGE, - ); - cqe_permit.push_completion(sub.cid(), comp, ctx); - } - NvmCmd::Write(cmd) => { - self.write_cmd(sub.cid(), cmd, ctx, cqe_permit) - } - NvmCmd::Read(cmd) => { - self.read_cmd(sub.cid(), cmd, ctx, cqe_permit) - } - NvmCmd::Flush => self.flush_cmd(sub.cid(), cqe_permit), - NvmCmd::Unknown(_) => { - // For any other command, just immediately complete it - let comp = Completion::generic_err(bits::STS_INTERNAL_ERR); - cqe_permit.push_completion(sub.cid(), comp, ctx); - } - } - } - - Ok(()) - } - - /// Enqueues a flush to the underlying block device - fn flush_cmd(&self, cid: u16, permit: CompQueueEntryPermit) { - // TODO: handles if it gets unmapped? - self.bdev.enqueue(Request { - op: BlockOp::Flush, - off: 0, - xfer_left: 0, - bufs: VecDeque::new(), - cid, - permit, - }); - } - - /// Enqueues a read to the underlying block device - fn read_cmd( - &self, - cid: u16, - cmd: ReadCmd, - ctx: &DispCtx, - permit: CompQueueEntryPermit, - ) { - probe_nvme_read_enqueue!(|| (cid, cmd.slba, cmd.nlb)); - let off = self.nlb_to_size(cmd.slba as usize); - let size = self.nlb_to_size(cmd.nlb as usize); - // TODO: handles if it gets unmapped? - let bufs = cmd.data(size as u64, ctx.mctx.memctx()).collect(); - self.bdev.enqueue(Request { - op: BlockOp::Read, - off, - xfer_left: size, - bufs, - cid, - permit, - }); - } - - /// Enqueues a write to the underlying block device - fn write_cmd( - &self, - cid: u16, - cmd: WriteCmd, - ctx: &DispCtx, - permit: CompQueueEntryPermit, - ) { - probe_nvme_write_enqueue!(|| (cid, cmd.slba, cmd.nlb)); - let off = self.nlb_to_size(cmd.slba as usize); - let size = self.nlb_to_size(cmd.nlb as usize); - // TODO: handles if it gets unmapped? - let bufs = cmd.data(size as u64, ctx.mctx.memctx()).collect(); - self.bdev.enqueue(Request { - op: BlockOp::Write, - off, - xfer_left: size, - bufs, - cid, - permit, - }); - } -} - -/// I/O Request to block device -pub struct Request { - /// The operation type - op: BlockOp, - - /// The offset at which to begin reading/writing - off: usize, - - /// How many bytes to read/write - xfer_left: usize, - - /// The buffers to read/write from/to - bufs: VecDeque, - - /// The associated command id - cid: u16, - - /// The permit for pushing onto the Completion Queue - permit: CompQueueEntryPermit, -} - -impl BlockReq for Request { - fn oper(&self) -> BlockOp { - self.op - } - - fn offset(&self) -> usize { - self.off - } - - fn next_buf(&mut self) -> Option { - if self.xfer_left == 0 { - return None; - } - - if let Some(mut region) = self.bufs.pop_front() { - if region.1 > self.xfer_left { - region.1 = self.xfer_left; - } - self.xfer_left -= region.1; - Some(region) - } else { - None - } - } - - fn complete(self, res: BlockResult, ctx: &DispCtx) { - let comp = match res { - BlockResult::Success => cmds::Completion::success(), - BlockResult::Failure => { - cmds::Completion::generic_err(bits::STS_DATA_XFER_ERR) - } - BlockResult::Unsupported => cmds::Completion::specific_err( - bits::StatusCodeType::CmdSpecific, - bits::STS_READ_CONFLICTING_ATTRS, - ), - }; - - match self.op { - BlockOp::Read => { - probe_nvme_read_complete!(|| (self.cid)); - } - BlockOp::Write => { - probe_nvme_write_complete!(|| (self.cid)); - } - _ => {} - } - - self.permit.push_completion(self.cid, comp, ctx); - } -} diff --git a/standalone/src/main.rs b/standalone/src/main.rs index 74ce95036..8ea094b82 100644 --- a/standalone/src/main.rs +++ b/standalone/src/main.rs @@ -184,8 +184,6 @@ fn main() { inv.register(&debug_device, "debug".to_string(), None) .map_err(|e| -> std::io::Error { e.into() })?; - // let mut devices = HashMap::new(); - for (name, dev) in config.devs() { let driver = &dev.driver as &str; let bdf = if driver.starts_with("pci-") { @@ -231,41 +229,23 @@ fn main() { chipset.pci_attach(bdf.unwrap(), viona); } // "pci-nvme" => { - // let nvme = hw::nvme::PciNvme::create(0x1de, 0x1000); - // devices.insert(&**name, nvme.clone()); - // chipset.pci_attach(bdf.unwrap(), nvme); - // } - // "nvme-ns" => { - // let nvme_ctrl = dev - // .options - // .get("controller") - // .unwrap() - // .as_str() - // .unwrap(); - - // let nvme = devices.get(nvme_ctrl).unwrap_or_else(|| { - // panic!("no such nvme controller: {}", nvme_ctrl) - // }); - // let block_dev = // dev.options.get("block_dev").unwrap().as_str().unwrap(); - // let block_dev = - // config.block_dev::(block_dev); + // let (backend, creg) = config.block_dev(block_dev); + + // let info = backend.info(); + // let nvme = hw::nvme::PciNvme::create(0x1de, 0x1000, info); - // let ns = hw::nvme::NvmeNs::create(block_dev.clone()); + // let id = inv + // .register(&nvme, format!("nvme-{}", name), None)?; + // let _be_id = inv + // .register_child(creg, id)?; - // if let Err(e) = - // nvme.with_inner(|nvme: Arc| { - // nvme.add_ns(ns) - // }) - // { - // eprintln!("failed to attach nvme-ns: {}", e); - // std::process::exit(libc::EXIT_FAILURE); - // } + // let blk = nvme.inner_dev::(); + // backend.attach(blk, disp); - // block_dev - // .start_dispatch(format!("bdev-{} thread", name), disp); + // chipset.pci_attach(bdf.unwrap(), nvme); // } _ => { eprintln!("unrecognized driver: {}", name); From a7a5d656e80cd0c0e868b54bbd9ef345302e09ce Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Thu, 7 Oct 2021 09:11:05 -0700 Subject: [PATCH 13/23] nvme: hook up nvme device to pass IO requests to block dev --- propolis/src/hw/nvme/mod.rs | 31 +++++- propolis/src/hw/nvme/requests.rs | 170 +++++++++++++++++++++++++++++++ 2 files changed, 196 insertions(+), 5 deletions(-) create mode 100644 propolis/src/hw/nvme/requests.rs diff --git a/propolis/src/hw/nvme/mod.rs b/propolis/src/hw/nvme/mod.rs index ab7e8cce5..b2134fe32 100644 --- a/propolis/src/hw/nvme/mod.rs +++ b/propolis/src/hw/nvme/mod.rs @@ -3,10 +3,10 @@ use std::convert::TryInto; use std::mem::size_of; use std::sync::{Arc, Mutex, MutexGuard}; -use crate::{block, common::*}; use crate::dispatch::DispCtx; use crate::hw::pci; use crate::util::regmap::RegMap; +use crate::{block, common::*}; use lazy_static::lazy_static; use thiserror::Error; @@ -15,6 +15,7 @@ mod admin; mod bits; mod cmds; mod queue; +mod requests; use bits::*; use queue::{CompQueue, QueueId, SubQueue}; @@ -124,6 +125,9 @@ struct NvmeCtrl { /// The Identify structure returned for Identify namespace commands ns_ident: IdentifyNamespace, + + /// Underlying Block Device info + binfo: block::DeviceInfo, } impl NvmeCtrl { @@ -263,17 +267,29 @@ impl NvmeCtrl { *cq = None; } } + + /// Convert some number of logical blocks to bytes with the currently active LBA data size + fn nlb_to_size(&self, b: usize) -> usize { + b << (self.ns_ident.lbaf[(self.ns_ident.flbas & 0xF) as usize]).lbads + } } /// NVMe over PCIe pub struct PciNvme { /// NVMe Controller state: Mutex, + + /// Underlying Block Device notifier + notifier: block::Notifier, } impl PciNvme { /// Create a new pci-nvme device with the given values - pub fn create(vendor: u16, device: u16, info: block::DeviceInfo) -> Arc { + pub fn create( + vendor: u16, + device: u16, + binfo: block::DeviceInfo, + ) -> Arc { let builder = pci::Builder::new(pci::Ident { vendor_id: vendor, device_id: device, @@ -313,7 +329,7 @@ impl PciNvme { // Initialize the Identify structure returned when the host issues // an Identify Namespace command. - let total_bytes = info.total_size * info.block_size as u64; + let total_bytes = binfo.total_size * binfo.block_size as u64; let nsze = total_bytes / BLOCK_SZ; let mut ns_ident = bits::IdentifyNamespace { // No thin provisioning so nsze == ncap == nuse @@ -326,7 +342,10 @@ impl PciNvme { }; // Update the block format we support - debug_assert!(BLOCK_SZ.is_power_of_two(), "BLOCK_SZ must be a power of 2"); + debug_assert!( + BLOCK_SZ.is_power_of_two(), + "BLOCK_SZ must be a power of 2" + ); debug_assert!(BLOCK_SZ >= 512, "BLOCK_SZ must be at least 512 bytes"); ns_ident.lbaf[0].lbads = BLOCK_SZ.trailing_zeros() as u8; @@ -372,9 +391,11 @@ impl PciNvme { sqs: Default::default(), ctrl_ident, ns_ident, + binfo, }; - let nvme = PciNvme { state: Mutex::new(state) }; + let notifier = block::Notifier::new(); + let nvme = PciNvme { state: Mutex::new(state), notifier }; builder // XXX: add room for doorbells diff --git a/propolis/src/hw/nvme/requests.rs b/propolis/src/hw/nvme/requests.rs new file mode 100644 index 000000000..0ab7d23be --- /dev/null +++ b/propolis/src/hw/nvme/requests.rs @@ -0,0 +1,170 @@ +use crate::{ + block::{self, Request}, + dispatch::DispCtx, + hw::nvme::{bits, cmds::Completion}, +}; + +use super::{ + cmds::{self, NvmCmd}, + queue::CompQueueEntryPermit, + NvmeCtrl, PciNvme, +}; + +/// NVMe IO Block Operations +/// TODO: Seems too generic to be here +enum BlockOp { + Read, + Write, + Flush, +} + +impl block::Device for PciNvme { + fn next(&self, ctx: &DispCtx) -> Option { + self.notifier.next_arming(|| self.next_req(ctx)) + } + + fn set_notifier(&self, f: Option>) { + self.notifier.set(f) + } +} + +impl PciNvme { + /// Pop an available I/O request off of a Submission Queue to begin + /// processing by the underlying Block Device. + fn next_req(&self, ctx: &DispCtx) -> Option { + let state = self.state.lock().unwrap(); + + // Go through all the queues (skip admin as we just want I/O queues) + // looking for a request to service + for sq in state.sqs.iter().skip(1).flatten() { + if let Some((sub, cqe_permit)) = sq.pop(ctx) { + // TODO: Shouldn't panic on malformed input from guest + let cmd = NvmCmd::parse(sub).unwrap(); + match cmd { + NvmCmd::Write(_) if !state.binfo.writable => { + let comp = Completion::specific_err( + bits::StatusCodeType::CmdSpecific, + bits::STS_WRITE_READ_ONLY_RANGE, + ); + cqe_permit.push_completion(sub.cid(), comp, ctx); + } + NvmCmd::Write(cmd) => { + return Some(write_op( + &state, + sub.cid(), + cmd, + cqe_permit, + ctx, + )); + } + NvmCmd::Read(cmd) => { + return Some(read_op( + &state, + sub.cid(), + cmd, + cqe_permit, + ctx, + )); + } + NvmCmd::Flush => { + return Some(flush_op(sub.cid(), cqe_permit)); + } + NvmCmd::Unknown(_) => { + // For any other command, just immediately complete it + let comp = + Completion::generic_err(bits::STS_INTERNAL_ERR); + cqe_permit.push_completion(sub.cid(), comp, ctx); + } + } + } + } + + None + } +} + +fn read_op( + state: &NvmeCtrl, + cid: u16, + cmd: cmds::ReadCmd, + cqe_permit: CompQueueEntryPermit, + ctx: &DispCtx, +) -> Request { + //probe_nvme_read_enqueue!(|| (cid, cmd.slba, cmd.nlb)); + let off = state.nlb_to_size(cmd.slba as usize); + let size = state.nlb_to_size(cmd.nlb as usize); + let bufs = cmd.data(size as u64, ctx.mctx.memctx()).collect(); + Request::new_read( + off, + bufs, + Box::new(move |res, ctx| { + complete_block_req(cid, BlockOp::Read, res, cqe_permit, ctx) + }), + ) +} + +fn write_op( + state: &NvmeCtrl, + cid: u16, + cmd: cmds::WriteCmd, + cqe_permit: CompQueueEntryPermit, + ctx: &DispCtx, +) -> Request { + //probe_nvme_write_enqueue!(|| (cid, cmd.slba, cmd.nlb)); + let off = state.nlb_to_size(cmd.slba as usize); + let size = state.nlb_to_size(cmd.nlb as usize); + let bufs = cmd.data(size as u64, ctx.mctx.memctx()).collect(); + Request::new_write( + off, + bufs, + Box::new(move |res, ctx| { + complete_block_req(cid, BlockOp::Write, res, cqe_permit, ctx) + }), + ) +} + +fn flush_op(cid: u16, cqe_permit: CompQueueEntryPermit) -> Request { + Request::new_flush( + 0, + 0, // TODO: is 0 enough or do we pass total size? + Box::new(move |res, ctx| { + complete_block_req( + cid, + BlockOp::Flush, + res, + cqe_permit, + ctx, + ) + }), + ) +} + +/// Callback invoked by the underlying Block Device once it has completed an I/O op. +/// +/// Place the operation result (success or failure) onto the corresponding Completion Queue. +fn complete_block_req( + cid: u16, + op: BlockOp, + res: block::Result, + cqe_permit: CompQueueEntryPermit, + ctx: &DispCtx, +) { + let comp = match res { + block::Result::Success => Completion::success(), + block::Result::Failure => { + Completion::generic_err(bits::STS_DATA_XFER_ERR) + } + block::Result::Unsupported => Completion::specific_err( + bits::StatusCodeType::CmdSpecific, + bits::STS_READ_CONFLICTING_ATTRS, + ), + }; + + // match op { + // BlockOp::Read => probe_nvme_read_complete!(|| (cid)), + // BlockOp::Write => probe_nvme_write_complete!(|| (cid)), + // _ => {} + // } + + cqe_permit.push_completion(cid, comp, ctx); +} From 5d29a445d0db974e7f7a97b4d44e3a2688b67247 Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Thu, 7 Oct 2021 09:17:17 -0700 Subject: [PATCH 14/23] nvme: poke block dev on appropriate doorbell events --- propolis/src/hw/nvme/mod.rs | 38 +++----------------------------- propolis/src/hw/nvme/requests.rs | 3 +++ 2 files changed, 6 insertions(+), 35 deletions(-) diff --git a/propolis/src/hw/nvme/mod.rs b/propolis/src/hw/nvme/mod.rs index b2134fe32..2e0ddaecf 100644 --- a/propolis/src/hw/nvme/mod.rs +++ b/propolis/src/hw/nvme/mod.rs @@ -1,4 +1,3 @@ -use std::collections::BTreeMap; use std::convert::TryInto; use std::mem::size_of; use std::sync::{Arc, Mutex, MutexGuard}; @@ -594,9 +593,7 @@ impl PciNvme { // TODO: worth kicking only the SQs specifically associated // with this CQ? if cq.kick() { - for sq in state.sqs.iter().skip(1).flatten() { - self.process_io_queue(&state, sq.clone(), ctx)?; - } + self.notifier.notify(self, ctx); } } else { // Submission Queue y Tail Doorbell @@ -604,7 +601,8 @@ impl PciNvme { let sq = state.get_sq(y as u16)?; sq.notify_tail(val)?; - self.process_io_queue(&state, sq, ctx)?; + // Poke block device to service new requests + self.notifier.notify(self, ctx); } } } @@ -661,36 +659,6 @@ impl PciNvme { Ok(()) } - - /// Process any new entries in an I/O Submission Queue - fn process_io_queue( - &self, - state: &NvmeCtrl, - sq: Arc, - ctx: &DispCtx, - ) -> Result<(), NvmeError> { - // Grab the corresponding CQ - let cq = sq.cq(); - - // Collect all the IO SQ entries, per namespace - let mut io_cmds = BTreeMap::new(); - while let Some(sub_permit) = sq.pop(ctx) { - io_cmds - .entry(sub_permit.0.nsid) - .or_insert_with(Vec::new) - .push(sub_permit); - } - - // Queue up said IO entries to the underlying block device, per namespace - for (nsid, io_cmds) in io_cmds { - //state.get_ns(nsid)?.queue_io_cmds(io_cmds, ctx)?; - } - - // Notify for any newly added completions - cq.fire_interrupt(ctx); - - Ok(()) - } } impl pci::Device for PciNvme { diff --git a/propolis/src/hw/nvme/requests.rs b/propolis/src/hw/nvme/requests.rs index 0ab7d23be..cf2f1c793 100644 --- a/propolis/src/hw/nvme/requests.rs +++ b/propolis/src/hw/nvme/requests.rs @@ -76,6 +76,9 @@ impl PciNvme { cqe_permit.push_completion(sub.cid(), comp, ctx); } } + + // Notify for any newly added completions + sq.cq().fire_interrupt(ctx); } } From aa29549f0cfd1ec8616274288ac08378e2385bfa Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Thu, 7 Oct 2021 09:18:15 -0700 Subject: [PATCH 15/23] nvme: re-enable nvme devices for propolis-standalone --- standalone/src/main.rs | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/standalone/src/main.rs b/standalone/src/main.rs index 8ea094b82..c1506ce7f 100644 --- a/standalone/src/main.rs +++ b/standalone/src/main.rs @@ -228,25 +228,24 @@ fn main() { .map_err(|e| -> std::io::Error { e.into() })?; chipset.pci_attach(bdf.unwrap(), viona); } - // "pci-nvme" => { - // let block_dev = - // dev.options.get("block_dev").unwrap().as_str().unwrap(); + "pci-nvme" => { + let block_dev = + dev.options.get("block_dev").unwrap().as_str().unwrap(); - // let (backend, creg) = config.block_dev(block_dev); + let (backend, creg) = config.block_dev(block_dev); - // let info = backend.info(); - // let nvme = hw::nvme::PciNvme::create(0x1de, 0x1000, info); + let info = backend.info(); + let nvme = hw::nvme::PciNvme::create(0x1de, 0x1000, info); - // let id = inv - // .register(&nvme, format!("nvme-{}", name), None)?; - // let _be_id = inv - // .register_child(creg, id)?; + let id = + inv.register(&nvme, format!("nvme-{}", name), None)?; + let _be_id = inv.register_child(creg, id)?; - // let blk = nvme.inner_dev::(); - // backend.attach(blk, disp); + let blk = nvme.inner_dev::(); + backend.attach(blk, disp); - // chipset.pci_attach(bdf.unwrap(), nvme); - // } + chipset.pci_attach(bdf.unwrap(), nvme); + } _ => { eprintln!("unrecognized driver: {}", name); std::process::exit(libc::EXIT_FAILURE); From 7288b5f1826f06c5a44ce24ee31a9d4b94d617f4 Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Thu, 7 Oct 2021 09:22:14 -0700 Subject: [PATCH 16/23] nvme: re-enable usdt probes --- propolis/src/hw/nvme/requests.rs | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/propolis/src/hw/nvme/requests.rs b/propolis/src/hw/nvme/requests.rs index cf2f1c793..bd9fd72f4 100644 --- a/propolis/src/hw/nvme/requests.rs +++ b/propolis/src/hw/nvme/requests.rs @@ -93,7 +93,7 @@ fn read_op( cqe_permit: CompQueueEntryPermit, ctx: &DispCtx, ) -> Request { - //probe_nvme_read_enqueue!(|| (cid, cmd.slba, cmd.nlb)); + probe_nvme_read_enqueue!(|| (cid, cmd.slba, cmd.nlb)); let off = state.nlb_to_size(cmd.slba as usize); let size = state.nlb_to_size(cmd.nlb as usize); let bufs = cmd.data(size as u64, ctx.mctx.memctx()).collect(); @@ -113,7 +113,7 @@ fn write_op( cqe_permit: CompQueueEntryPermit, ctx: &DispCtx, ) -> Request { - //probe_nvme_write_enqueue!(|| (cid, cmd.slba, cmd.nlb)); + probe_nvme_write_enqueue!(|| (cid, cmd.slba, cmd.nlb)); let off = state.nlb_to_size(cmd.slba as usize); let size = state.nlb_to_size(cmd.nlb as usize); let bufs = cmd.data(size as u64, ctx.mctx.memctx()).collect(); @@ -131,13 +131,7 @@ fn flush_op(cid: u16, cqe_permit: CompQueueEntryPermit) -> Request { 0, 0, // TODO: is 0 enough or do we pass total size? Box::new(move |res, ctx| { - complete_block_req( - cid, - BlockOp::Flush, - res, - cqe_permit, - ctx, - ) + complete_block_req(cid, BlockOp::Flush, res, cqe_permit, ctx) }), ) } @@ -163,11 +157,15 @@ fn complete_block_req( ), }; - // match op { - // BlockOp::Read => probe_nvme_read_complete!(|| (cid)), - // BlockOp::Write => probe_nvme_write_complete!(|| (cid)), - // _ => {} - // } + match op { + BlockOp::Read => { + probe_nvme_read_complete!(|| (cid)); + } + BlockOp::Write => { + probe_nvme_write_complete!(|| (cid)); + } + _ => {} + } cqe_permit.push_completion(cid, comp, ctx); } From 350435bee335200b344507950a6c30b1fdf5a177 Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Thu, 7 Oct 2021 09:49:51 -0700 Subject: [PATCH 17/23] nvme: complete malformed command with error instead of panic-ing --- propolis/src/hw/nvme/requests.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/propolis/src/hw/nvme/requests.rs b/propolis/src/hw/nvme/requests.rs index bd9fd72f4..5f0fa688f 100644 --- a/propolis/src/hw/nvme/requests.rs +++ b/propolis/src/hw/nvme/requests.rs @@ -38,17 +38,16 @@ impl PciNvme { // looking for a request to service for sq in state.sqs.iter().skip(1).flatten() { if let Some((sub, cqe_permit)) = sq.pop(ctx) { - // TODO: Shouldn't panic on malformed input from guest - let cmd = NvmCmd::parse(sub).unwrap(); + let cmd = NvmCmd::parse(sub); match cmd { - NvmCmd::Write(_) if !state.binfo.writable => { + Ok(NvmCmd::Write(_)) if !state.binfo.writable => { let comp = Completion::specific_err( bits::StatusCodeType::CmdSpecific, bits::STS_WRITE_READ_ONLY_RANGE, ); cqe_permit.push_completion(sub.cid(), comp, ctx); } - NvmCmd::Write(cmd) => { + Ok(NvmCmd::Write(cmd)) => { return Some(write_op( &state, sub.cid(), @@ -57,7 +56,7 @@ impl PciNvme { ctx, )); } - NvmCmd::Read(cmd) => { + Ok(NvmCmd::Read(cmd)) => { return Some(read_op( &state, sub.cid(), @@ -66,11 +65,12 @@ impl PciNvme { ctx, )); } - NvmCmd::Flush => { + Ok(NvmCmd::Flush) => { return Some(flush_op(sub.cid(), cqe_permit)); } - NvmCmd::Unknown(_) => { - // For any other command, just immediately complete it + Ok(NvmCmd::Unknown(_)) | Err(_) => { + // For any other unrecognized or malformed command, + // just immediately complete it with an error let comp = Completion::generic_err(bits::STS_INTERNAL_ERR); cqe_permit.push_completion(sub.cid(), comp, ctx); From 1a56bd8ab95d868fc2dbe2a115e01f813ce4ce9d Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Mon, 11 Oct 2021 12:48:02 -0700 Subject: [PATCH 18/23] block: pass Operation to IO completion callbacks. --- propolis/src/block/mod.rs | 5 +++-- propolis/src/hw/nvme/requests.rs | 28 ++++++++++------------------ propolis/src/hw/virtio/block.rs | 4 ++-- 3 files changed, 15 insertions(+), 22 deletions(-) diff --git a/propolis/src/block/mod.rs b/propolis/src/block/mod.rs index 6bdec99aa..4c7aec61c 100644 --- a/propolis/src/block/mod.rs +++ b/propolis/src/block/mod.rs @@ -33,7 +33,8 @@ pub enum Result { Unsupported, } -pub type CompleteFn = dyn FnOnce(Result, &DispCtx) + Send + Sync + 'static; +pub type CompleteFn = + dyn FnOnce(Operation, Result, &DispCtx) + Send + Sync + 'static; /// Block device operation request pub struct Request { @@ -100,7 +101,7 @@ impl Request { /// Indiciate disposition of completed request pub fn complete(mut self, res: Result, ctx: &DispCtx) { let func = self.donef.take().unwrap(); - func(res, ctx); + func(self.op, res, ctx); } } impl Drop for Request { diff --git a/propolis/src/hw/nvme/requests.rs b/propolis/src/hw/nvme/requests.rs index 5f0fa688f..94a461bd3 100644 --- a/propolis/src/hw/nvme/requests.rs +++ b/propolis/src/hw/nvme/requests.rs @@ -1,5 +1,5 @@ use crate::{ - block::{self, Request}, + block::{self, Operation, Request}, dispatch::DispCtx, hw::nvme::{bits, cmds::Completion}, }; @@ -10,14 +10,6 @@ use super::{ NvmeCtrl, PciNvme, }; -/// NVMe IO Block Operations -/// TODO: Seems too generic to be here -enum BlockOp { - Read, - Write, - Flush, -} - impl block::Device for PciNvme { fn next(&self, ctx: &DispCtx) -> Option { self.notifier.next_arming(|| self.next_req(ctx)) @@ -100,8 +92,8 @@ fn read_op( Request::new_read( off, bufs, - Box::new(move |res, ctx| { - complete_block_req(cid, BlockOp::Read, res, cqe_permit, ctx) + Box::new(move |op, res, ctx| { + complete_block_req(cid, op, res, cqe_permit, ctx) }), ) } @@ -120,8 +112,8 @@ fn write_op( Request::new_write( off, bufs, - Box::new(move |res, ctx| { - complete_block_req(cid, BlockOp::Write, res, cqe_permit, ctx) + Box::new(move |op, res, ctx| { + complete_block_req(cid, op, res, cqe_permit, ctx) }), ) } @@ -130,8 +122,8 @@ fn flush_op(cid: u16, cqe_permit: CompQueueEntryPermit) -> Request { Request::new_flush( 0, 0, // TODO: is 0 enough or do we pass total size? - Box::new(move |res, ctx| { - complete_block_req(cid, BlockOp::Flush, res, cqe_permit, ctx) + Box::new(move |op, res, ctx| { + complete_block_req(cid, op, res, cqe_permit, ctx) }), ) } @@ -141,7 +133,7 @@ fn flush_op(cid: u16, cqe_permit: CompQueueEntryPermit) -> Request { /// Place the operation result (success or failure) onto the corresponding Completion Queue. fn complete_block_req( cid: u16, - op: BlockOp, + op: Operation, res: block::Result, cqe_permit: CompQueueEntryPermit, ctx: &DispCtx, @@ -158,10 +150,10 @@ fn complete_block_req( }; match op { - BlockOp::Read => { + Operation::Read(..) => { probe_nvme_read_complete!(|| (cid)); } - BlockOp::Write => { + Operation::Write(..) => { probe_nvme_write_complete!(|| (cid)); } _ => {} diff --git a/propolis/src/hw/virtio/block.rs b/propolis/src/hw/virtio/block.rs index bfaf92678..650e17686 100644 --- a/propolis/src/hw/virtio/block.rs +++ b/propolis/src/hw/virtio/block.rs @@ -91,7 +91,7 @@ impl VirtioBlock { Ok(block::Request::new_read( breq.sector as usize * SECTOR_SZ, regions, - Box::new(move |res, ctx| { + Box::new(move |_op, res, ctx| { complete_blockreq(res, chain, mvq, ctx); }), )) @@ -108,7 +108,7 @@ impl VirtioBlock { Ok(block::Request::new_write( breq.sector as usize * SECTOR_SZ, regions, - Box::new(move |res, ctx| { + Box::new(move |_op, res, ctx| { complete_blockreq(res, chain, mvq, ctx); }), )) From 9b0d5a8c0b1ab2a03a6a6d06dc46691fbb14b83d Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Mon, 11 Oct 2021 12:56:22 -0700 Subject: [PATCH 19/23] nvme: try to continue popping requests off the same SQ in case we immediately complete a command --- propolis/src/hw/nvme/queue.rs | 7 +------ propolis/src/hw/nvme/requests.rs | 5 +---- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/propolis/src/hw/nvme/queue.rs b/propolis/src/hw/nvme/queue.rs index 0e8ab2314..0f8cfad7b 100644 --- a/propolis/src/hw/nvme/queue.rs +++ b/propolis/src/hw/nvme/queue.rs @@ -374,11 +374,6 @@ impl SubQueue { } } - /// Returns the corresponding Completion Queue - pub fn cq(&self) -> Arc { - self.cq.clone() - } - /// Returns the current Head entry pointer. fn head(&self) -> u16 { SubQueueState(self.state.inner.load(Ordering::SeqCst)).head() @@ -514,7 +509,7 @@ impl CompQueue { /// Add a new entry to the Completion Queue while consuming a `CompQueueEntryPermit`. fn push( &self, - _: CompQueueEntryPermit, + _permit: CompQueueEntryPermit, entry: RawCompletion, ctx: &DispCtx, ) { diff --git a/propolis/src/hw/nvme/requests.rs b/propolis/src/hw/nvme/requests.rs index 94a461bd3..547c3d6c6 100644 --- a/propolis/src/hw/nvme/requests.rs +++ b/propolis/src/hw/nvme/requests.rs @@ -29,7 +29,7 @@ impl PciNvme { // Go through all the queues (skip admin as we just want I/O queues) // looking for a request to service for sq in state.sqs.iter().skip(1).flatten() { - if let Some((sub, cqe_permit)) = sq.pop(ctx) { + while let Some((sub, cqe_permit)) = sq.pop(ctx) { let cmd = NvmCmd::parse(sub); match cmd { Ok(NvmCmd::Write(_)) if !state.binfo.writable => { @@ -68,9 +68,6 @@ impl PciNvme { cqe_permit.push_completion(sub.cid(), comp, ctx); } } - - // Notify for any newly added completions - sq.cq().fire_interrupt(ctx); } } From a00f12d9dff639ec3e6893e39e873bc107b3cad2 Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Mon, 11 Oct 2021 16:52:23 -0700 Subject: [PATCH 20/23] nvme: fold atomic kick flag into existing atomic queue state. --- propolis/src/hw/nvme/queue.rs | 47 ++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/propolis/src/hw/nvme/queue.rs b/propolis/src/hw/nvme/queue.rs index 0f8cfad7b..f1cca691b 100644 --- a/propolis/src/hw/nvme/queue.rs +++ b/propolis/src/hw/nvme/queue.rs @@ -1,5 +1,5 @@ use std::marker::PhantomData; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; use super::bits::{self, RawCompletion, RawSubmission}; @@ -76,6 +76,12 @@ bitstruct! { /// /// See NVMe 1.0e Section 4.5 Completion Queue Entry - Phase Tag (P) phase: bool = 48; + + /// Whether the CQ should kick its SQs due to no permits being available previously. + /// + /// One may only pop something off the SQ if there's at least one space available in + /// the corresponding CQ. If there isn't, we set the kick flag. + kick: bool = 49; } } @@ -429,11 +435,6 @@ pub struct CompQueue { /// Queue state such as the size and current head/tail entry pointers. state: QueueState, - /// Flag set when when've run out of `CompQueueEntryPermit`'s to give out. - /// - /// Polled on CQ Doorbell events so that we kick the SQ's that wanted a CompQueueEntryPermit. - kick: AtomicBool, - /// The [`GuestAddr`] at which the Queue is mapped. base: GuestAddr, @@ -456,7 +457,6 @@ impl CompQueue { Ok(Self { iv, state: QueueState::new_completion_state(size), - kick: AtomicBool::new(false), base, hdl, }) @@ -478,7 +478,16 @@ impl CompQueue { /// Returns whether the SQ's should be kicked due to no permits being available previously. pub fn kick(&self) -> bool { - self.kick.swap(false, Ordering::SeqCst) + self.state + .inner + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |state| { + let state = CompQueueState(state); + Some(state.with_kick(false).0) + }) + .ok() + .map(|old_state| CompQueueState(old_state).kick()) + // This unwrap is fine as our fetch_update never returns None + .unwrap() } /// Attempt to reserve an entry in the Completion Queue. @@ -488,7 +497,8 @@ impl CompQueue { self: &Arc, sq: Arc, ) -> Option { - self.state + let old_state = self + .state .inner .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |state| { let state = CompQueueState(state); @@ -496,14 +506,20 @@ impl CompQueue { // No more spots available if avail == 0 { // Make sure we kick the SQ's when we have space available again - self.kick.store(true, Ordering::SeqCst); - return None; + Some(state.with_kick(true).0) + } else { + // Otherwise claim a spot + Some(state.with_avail(avail - 1).0) } - // Otherwise claim a spot - Some(state.with_avail(avail - 1).0) }) - .ok() - .map(|_| CompQueueEntryPermit { cq: self.clone(), sq }) + // Unwrap here is fine as our fetch_update never returns None. + .unwrap(); + let old_state = CompQueueState(old_state); + if old_state.avail() == 0 { + None + } else { + Some(CompQueueEntryPermit { cq: self.clone(), sq }) + } } /// Add a new entry to the Completion Queue while consuming a `CompQueueEntryPermit`. @@ -609,6 +625,7 @@ impl CompQueueEntryPermit { let avail = state.avail(); Some(state.with_avail(avail + 1).0) }) + // Unwrap here is fine as our fetch_update never returns None .unwrap(); } } From e7f0eef51005abb6af1a7df2f40a5c0af3bcb0e9 Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Mon, 11 Oct 2021 23:10:42 -0400 Subject: [PATCH 21/23] nvme: add tests to exercise the completion/submission queues. --- propolis/Cargo.toml | 4 +- propolis/src/dispatch/mod.rs | 4 +- propolis/src/dispatch/sync_tasks.rs | 2 +- propolis/src/hw/nvme/queue.rs | 445 ++++++++++++++++++++++++++++ propolis/src/hw/pci/device.rs | 11 +- propolis/src/lib.rs | 4 +- propolis/src/util/regmap.rs | 2 + propolis/src/vmm/hdl.rs | 8 +- propolis/src/vmm/machine.rs | 38 ++- propolis/src/vmm/mapping.rs | 2 +- 10 files changed, 508 insertions(+), 12 deletions(-) diff --git a/propolis/Cargo.toml b/propolis/Cargo.toml index d2404d5fa..0e1ca1b52 100644 --- a/propolis/Cargo.toml +++ b/propolis/Cargo.toml @@ -23,4 +23,6 @@ tokio = { version = "1", features = ["full"] } futures = "0.3" [dev-dependencies] -tempfile = "3.2" +crossbeam-channel = "0.5" +rand = "0.8" +tempfile = "3.2" \ No newline at end of file diff --git a/propolis/src/dispatch/mod.rs b/propolis/src/dispatch/mod.rs index 1477a7075..71724fded 100644 --- a/propolis/src/dispatch/mod.rs +++ b/propolis/src/dispatch/mod.rs @@ -138,13 +138,13 @@ impl SelfArc for Dispatcher { } } -struct SharedCtx { +pub(crate) struct SharedCtx { mctx: MachineCtx, disp: Weak, inst: Weak, } impl SharedCtx { - fn create(disp: &Dispatcher) -> Self { + pub(crate) fn create(disp: &Dispatcher) -> Self { Self { mctx: MachineCtx::new(&disp.machine), disp: disp.self_weak(), diff --git a/propolis/src/dispatch/sync_tasks.rs b/propolis/src/dispatch/sync_tasks.rs index 59d716696..bbfd4f6c3 100644 --- a/propolis/src/dispatch/sync_tasks.rs +++ b/propolis/src/dispatch/sync_tasks.rs @@ -376,7 +376,7 @@ pub struct SyncCtx { } impl SyncCtx { - pub(super) fn standalone(shared: SharedCtx) -> Self { + pub(crate) fn standalone(shared: SharedCtx) -> Self { Self { shared, ctrl: None } } fn for_worker(shared: SharedCtx, ctrl: Arc) -> Self { diff --git a/propolis/src/hw/nvme/queue.rs b/propolis/src/hw/nvme/queue.rs index f1cca691b..e17f11c25 100644 --- a/propolis/src/hw/nvme/queue.rs +++ b/propolis/src/hw/nvme/queue.rs @@ -39,9 +39,11 @@ const MAX_ADMIN_QUEUE_SIZE: u32 = 1 << 12; pub const ADMIN_QUEUE_ID: QueueId = 0; /// Marker type to indicate a Completion Queue. +#[derive(Debug)] enum CompletionQueueType {} /// Marker type to indicate a Submission Queue. +#[derive(Debug)] enum SubmissionQueueType {} bitstruct! { @@ -112,6 +114,7 @@ bitstruct! { /// methods exposed based on whether the queue in question /// is a Completion or Submission queue. Use either /// `CompletionQueueType` or `SubmissionQueueType`. +#[derive(Debug)] struct QueueState { /// The size of the queue in question. /// @@ -328,6 +331,7 @@ pub enum QueueUpdateError { } /// Type for manipulating Submission Queues. +#[derive(Debug)] pub struct SubQueue { /// The ID of this Submission Queue. id: QueueId, @@ -427,6 +431,7 @@ impl SubQueue { } /// Type for manipulating Completion Queues. +#[derive(Debug)] pub struct CompQueue { /// The Interrupt Vector used to signal to the host (VM) upon pushing /// entries onto the Completion Queue. @@ -585,6 +590,7 @@ impl CompQueue { } /// A type which allows pushing a Completion Entry onto the Completion Queue. +#[derive(Debug)] pub struct CompQueueEntryPermit { /// The corresponding Completion Queue for which we have a permit. cq: Arc, @@ -613,6 +619,18 @@ impl CompQueueEntryPermit { cq.fire_interrupt(ctx); } + /// Consume the permit by placing an entry into the Completion Queue. + /// + /// This is a simpler version of `CompQueueEntryPermit::push_completion` + /// just for testing purposes that doesn't require passing in the actual + /// completion data. Meant just for excercising the Submission & Completion + /// Queues in unit tests. + #[cfg(test)] + fn push_completion_test(self, ctx: &DispCtx) { + let cq = self.cq.clone(); + cq.push(self, bits::RawCompletion::default(), ctx); + } + /// Return the permit without having actually used it. /// /// Frees up the space for someone else to grab it via `CompQueue::reserve_entry`. @@ -629,3 +647,430 @@ impl CompQueueEntryPermit { .unwrap(); } } + +#[cfg(test)] +mod tests { + use rand::Rng; + + use super::*; + + use crate::{ + common::GuestAddr, + dispatch::{SharedCtx, SyncCtx}, + instance::Instance, + }; + use std::assert_matches::assert_matches; + use std::io::Error; + use std::thread::{sleep, spawn}; + use std::time::Duration; + + macro_rules! dispctx ( + ($ctx:ident, $instance:expr) => { + let mut sctx = + SyncCtx::standalone(SharedCtx::create(&$instance.disp)); + let $ctx = sctx.dispctx(); + }; + ); + + #[test] + fn create_cqs() -> Result<(), Error> { + let instance = Instance::new_test(None)?; + dispctx!(ctx, instance); + + let hdl = pci::MsixHdl::new_test(); + let read_base = GuestAddr(0); + let write_base = GuestAddr(1024 * 1024); + + // Admin queues must be less than 4K + let cq = CompQueue::new( + ADMIN_QUEUE_ID, + 0, + 1024, + write_base, + &ctx, + hdl.clone(), + ); + assert_matches!(cq, Ok(_)); + let cq = CompQueue::new( + ADMIN_QUEUE_ID, + 0, + 5 * 1024, + write_base, + &ctx, + hdl.clone(), + ); + assert_matches!(cq, Err(QueueCreateErr::InvalidSize)); + + // I/O queues must be less than 64K + let cq = CompQueue::new(1, 0, 1024, write_base, &ctx, hdl.clone()); + assert_matches!(cq, Ok(_)); + let cq = CompQueue::new(1, 0, 65 * 1024, write_base, &ctx, hdl.clone()); + assert_matches!(cq, Err(QueueCreateErr::InvalidSize)); + + // Neither must be less than 2 + let cq = + CompQueue::new(ADMIN_QUEUE_ID, 0, 1, write_base, &ctx, hdl.clone()); + assert_matches!(cq, Err(QueueCreateErr::InvalidSize)); + let cq = CompQueue::new(1, 0, 1, write_base, &ctx, hdl.clone()); + assert_matches!(cq, Err(QueueCreateErr::InvalidSize)); + + // Completion Queue's must be mapped to writable memory + let cq = + CompQueue::new(ADMIN_QUEUE_ID, 0, 2, read_base, &ctx, hdl.clone()); + assert_matches!(cq, Err(QueueCreateErr::InvalidBaseAddr)); + let cq = CompQueue::new(1, 0, 2, read_base, &ctx, hdl.clone()); + assert_matches!(cq, Err(QueueCreateErr::InvalidBaseAddr)); + + Ok(()) + } + + #[test] + fn create_sqs() -> Result<(), Error> { + let instance = Instance::new_test(None)?; + dispctx!(ctx, instance); + + let hdl = pci::MsixHdl::new_test(); + let read_base = GuestAddr(0); + let write_base = GuestAddr(1024 * 1024); + + // Create corresponding CQs + let admin_cq = Arc::new( + CompQueue::new( + ADMIN_QUEUE_ID, + 0, + 1024, + write_base, + &ctx, + hdl.clone(), + ) + .unwrap(), + ); + let io_cq = Arc::new( + CompQueue::new(1, 0, 1024, write_base, &ctx, hdl.clone()).unwrap(), + ); + + // Admin queues must be less than 4K + let sq = SubQueue::new( + ADMIN_QUEUE_ID, + admin_cq.clone(), + 1024, + read_base, + &ctx, + ); + assert_matches!(sq, Ok(_)); + let sq = SubQueue::new( + ADMIN_QUEUE_ID, + admin_cq.clone(), + 5 * 1024, + read_base, + &ctx, + ); + assert_matches!(sq, Err(QueueCreateErr::InvalidSize)); + + // I/O queues must be less than 64K + let sq = SubQueue::new(1, io_cq.clone(), 1024, read_base, &ctx); + assert_matches!(sq, Ok(_)); + let sq = SubQueue::new(1, io_cq.clone(), 65 * 1024, read_base, &ctx); + assert_matches!(sq, Err(QueueCreateErr::InvalidSize)); + + // Neither must be less than 2 + let sq = + SubQueue::new(ADMIN_QUEUE_ID, admin_cq.clone(), 1, read_base, &ctx); + assert_matches!(sq, Err(QueueCreateErr::InvalidSize)); + let sq = SubQueue::new(1, admin_cq.clone(), 1, read_base, &ctx); + assert_matches!(sq, Err(QueueCreateErr::InvalidSize)); + + // Completion Queue's must be mapped to readable memory + let sq = SubQueue::new( + ADMIN_QUEUE_ID, + admin_cq.clone(), + 2, + write_base, + &ctx, + ); + assert_matches!(sq, Err(QueueCreateErr::InvalidBaseAddr)); + let sq = SubQueue::new(1, admin_cq.clone(), 2, write_base, &ctx); + assert_matches!(sq, Err(QueueCreateErr::InvalidBaseAddr)); + + Ok(()) + } + + #[test] + fn push_failures() -> Result<(), Error> { + let instance = Instance::new_test(None)?; + dispctx!(ctx, instance); + let hdl = pci::MsixHdl::new_test(); + let read_base = GuestAddr(0); + let write_base = GuestAddr(1024 * 1024); + + // Create our queues + let cq = Arc::new( + CompQueue::new(1, 0, 4, write_base, &ctx, hdl.clone()).unwrap(), + ); + let sq = + Arc::new(SubQueue::new(1, cq.clone(), 4, read_base, &ctx).unwrap()); + + // Replicate guest VM notifying us things were pushed to the SQ + let mut sq_tail = 0; + for _ in 0..sq.state.size - 1 { + sq_tail = sq.state.wrap_add(sq_tail, 1); + // These should all succeed + assert_matches!(sq.notify_tail(sq_tail), Ok(_)); + } + + // But anything more should fail + sq_tail = sq.state.wrap_add(sq_tail, 1); + assert_matches!( + sq.notify_tail(sq_tail), + Err(QueueUpdateError::TooManyEntries) + ); + + // Also anything that falls outside the boundaries (i.e. we didn't wrap properly) + assert_matches!( + sq.notify_tail(sq.state.size as u16), + Err(QueueUpdateError::InvalidEntry) + ); + + // Now pop those SQ items and complete them in the CQ + while let Some((_, permit)) = sq.pop(&ctx) { + permit.push_completion_test(&ctx); + } + + // Replicate guest VM notifying us things were consumed off the CQ + let mut cq_head = 0; + for _ in 0..sq.state.size - 1 { + cq_head = cq.state.wrap_add(cq_head, 1); + // These should all succeed + assert_matches!(cq.notify_head(cq_head), Ok(_)); + } + + // There's nothing else to pop so this should fail + cq_head = cq.state.wrap_add(cq_head, 1); + assert_matches!( + cq.notify_head(cq_head), + Err(QueueUpdateError::TooManyEntries) + ); + + // Also anything that falls outside the boundaries (i.e. we didn't wrap properly) + assert_matches!( + cq.notify_head(cq.state.size as u16), + Err(QueueUpdateError::InvalidEntry) + ); + + Ok(()) + } + + #[test] + fn cq_kicks() -> Result<(), Error> { + let instance = Instance::new_test(None)?; + dispctx!(ctx, instance); + let hdl = pci::MsixHdl::new_test(); + let read_base = GuestAddr(0); + let write_base = GuestAddr(1024 * 1024); + + // Create our queues + // Purposely make the CQ smaller to test kicks + let cq = Arc::new( + CompQueue::new(1, 0, 2, write_base, &ctx, hdl.clone()).unwrap(), + ); + let sq = + Arc::new(SubQueue::new(1, cq.clone(), 4, read_base, &ctx).unwrap()); + + // Replicate guest VM notifying us things were pushed to the SQ + let mut sq_tail = 0; + for _ in 0..sq.state.size - 1 { + sq_tail = sq.state.wrap_add(sq_tail, 1); + assert_matches!(sq.notify_tail(sq_tail), Ok(_)); + } + + // We should be able to pop based on how much space is in the CQ + for _ in 0..cq.state.size - 1 { + let pop = sq.pop(&ctx); + assert_matches!(pop, Some(_)); + + // Complete these in the CQ (but note guest won't have acknowledged them yet) + pop.unwrap().1.push_completion_test(&ctx); + } + + // But we can't pop anymore due to no more CQ space to reserve + assert_matches!(sq.pop(&ctx), None); + + // The guest consuming things off the CQ should let free us + assert_matches!(cq.notify_head(1), Ok(_)); + + // Kick should've been set in the failed pop + assert!(cq.kick()); + + // We should have one more space now and should be able to pop 1 more + assert_matches!(sq.pop(&ctx), Some(_)); + + Ok(()) + } + + #[test] + fn push_pop() -> Result<(), Error> { + let instance = Instance::new_test(None)?; + dispctx!(ctx, instance); + let hdl = pci::MsixHdl::new_test(); + let read_base = GuestAddr(0); + let write_base = GuestAddr(1024 * 1024); + + // Create a pair of Completion and Submission Queues + // with a random size. We purposefully give the CQ a smaller + // size to exercise the "kick" conditions where we have some + // request available in the SQ but can't pop it until there's + // space available in the CQ. + let mut rng = rand::thread_rng(); + let sq_size = rng.gen_range(512..2048); + let cq = Arc::new( + CompQueue::new(1, 0, 4, write_base, &ctx, hdl.clone()).unwrap(), + ); + let sq = Arc::new( + SubQueue::new(1, cq.clone(), sq_size, read_base, &ctx).unwrap(), + ); + + // We'll be generating a random number of submissions + let submissions_rand = rng.gen_range(2..sq.state.size - 1); + + let (doorbell_tx, doorbell_rx) = + crossbeam_channel::unbounded::(); + let (workers_tx, workers_rx) = crossbeam_channel::unbounded(); + let (comp_tx, comp_rx) = crossbeam_channel::unbounded(); + + // Create a thread to mimic the main device thread that + // will handle "doorbell" read/write ops. + enum Doorbell { + Cq(u16), + Sq(u16), + } + let (doorbell_cq, doorbell_sq) = (cq.clone(), sq.clone()); + let doorbell_handler = spawn(move || { + // Keep track of the "host" side CQ head and SQ tail as + // we receive "doorbell" hits. + let mut cq_head = 0; + let mut sq_tail = 0; + loop { + match doorbell_rx.recv() { + Ok(Doorbell::Cq(n)) => { + cq_head = doorbell_cq.state.wrap_add(cq_head, n); + assert_matches!( + doorbell_cq.notify_head(cq_head), + Ok(_) + ); + if doorbell_cq.kick() { + assert!(workers_tx.send(()).is_ok()); + } + } + Ok(Doorbell::Sq(n)) => { + sq_tail = doorbell_sq.state.wrap_add(sq_tail, n); + // The "doorbell" was rung and so let's have the SQ + // update its internal state before poking the workers + assert_matches!( + doorbell_sq.notify_tail(sq_tail), + Ok(_) + ); + assert!(workers_tx.send(()).is_ok()); + } + Err(_) => break, + } + } + }); + + // Create a number of worker threads to simulate the block + // dev backend workers that will be notified every time the + // SQ "doorbell" is hit and will attempt to pull a new IO + // request off the SQ. At the end, each will return a count + // of how many requests they received and then completed. + let io_workers = (0..4) + .map(|_| { + let worker_instance = instance.clone(); + let worker_rx = workers_rx.clone(); + let worker_sq = sq.clone(); + let worker_comp_tx = comp_tx.clone(); + spawn(move || { + dispctx!(ctx, worker_instance); + let mut rng = rand::thread_rng(); + let mut submissions = 0; + loop { + match worker_rx.recv() { + Ok(()) => { + while let Some((_, cqe_permit)) = + worker_sq.pop(&ctx) + { + submissions += 1; + + // Sleep for a bit to mimic actually doing some + // work before we complete the IO + sleep(Duration::from_micros( + rng.gen_range(0..500), + )); + + cqe_permit.push_completion_test(&ctx); + + // Now signal the "guest" side of the completion handler + assert!(worker_comp_tx.send(()).is_ok()); + } + } + Err(_) => break submissions, + } + } + }) + }) + .collect::>(); + + // Create a thread to "consume" things off the Completion Queue. + // This simulates the host reacting to our CQ pushes and "ringing" the + // CQ doorbell. At the end, it returns how many completion were handled. + // Regardless, it'll stop after the number of completions is at least + // as many as the number of submissions we decided to generate. + let comp_doorbell_tx = doorbell_tx.clone(); + let comp_handler = spawn(move || { + let exit_after = submissions_rand; + let mut completions = 0; + loop { + match comp_rx.recv() { + Ok(()) => { + // "Ring" the CQ doorbell + // TODO: test completing more than 1 at a time + assert!(comp_doorbell_tx.send(Doorbell::Cq(1)).is_ok()); + completions += 1; + } + Err(_) => break completions, + } + if completions >= exit_after { + break completions; + } + } + }); + + // Now, start generating a random number of submissions + for _ in 0..submissions_rand { + // "Ring" the SQ doorbell + // TODO: test submitting more than 1 at a time + //let doorbell_tx = doorbell_tx.clone(); + assert!(doorbell_tx.send(Doorbell::Sq(1)).is_ok()); + + // Sleep up to 100us in between + sleep(Duration::from_micros(rng.gen_range(0..100))); + } + drop(doorbell_tx); + + // Wait for the completion handler and its count + let completions: u32 = comp_handler.join().unwrap(); + + // Wait for doorbell handler + doorbell_handler.join().unwrap(); + + // Wait for the IO workers to complete and sum the total + // number of submissions they recevied + let submissions: u32 = + io_workers.into_iter().map(|j| j.join().unwrap()).sum(); + + // Make sure the number of submission we recevied matched the + // number we generated and completed + assert_eq!(submissions, submissions_rand); + assert_eq!(submissions, completions); + + Ok(()) + } +} diff --git a/propolis/src/hw/pci/device.rs b/propolis/src/hw/pci/device.rs index 17a174265..4607d2c6f 100644 --- a/propolis/src/hw/pci/device.rs +++ b/propolis/src/hw/pci/device.rs @@ -917,6 +917,7 @@ pub trait Device: Send + Sync + 'static + Entity { // fn cap_write(&self); } +#[derive(Debug)] enum MsixBarReg { Addr(u16), Data(u16), @@ -946,7 +947,7 @@ const MSIX_VEC_MASK: u32 = 1 << 0; const MSIX_MSGCTRL_ENABLE: u16 = 1 << 15; const MSIX_MSGCTRL_FMASK: u16 = 1 << 14; -#[derive(Default)] +#[derive(Debug, Default)] struct MsixEntry { addr: u64, data: u32, @@ -982,6 +983,7 @@ impl MsixEntry { } } +#[derive(Debug)] struct MsixCfg { count: u16, bar: BarN, @@ -990,7 +992,7 @@ struct MsixCfg { entries: Vec>, state: Mutex, } -#[derive(Default)] +#[derive(Debug, Default)] struct MsixCfgState { enabled: bool, func_mask: bool, @@ -1256,6 +1258,7 @@ pub struct MsiEnt { pub pending: bool, } +#[derive(Debug)] pub struct MsixHdl { cfg: Arc, } @@ -1263,6 +1266,10 @@ impl MsixHdl { fn new(cfg: &Arc) -> Self { Self { cfg: Arc::clone(cfg) } } + #[cfg(test)] + pub(crate) fn new_test() -> Self { + Self { cfg: MsixCfg::new(2048, BarN::BAR0).0 } + } pub fn fire(&self, idx: u16, ctx: &DispCtx) { self.cfg.fire(idx, ctx); } diff --git a/propolis/src/lib.rs b/propolis/src/lib.rs index e1683f3fb..d165e9aaa 100644 --- a/propolis/src/lib.rs +++ b/propolis/src/lib.rs @@ -1,6 +1,8 @@ +#![allow(clippy::style)] // Pull in asm!() support for USDT #![feature(asm)] -#![allow(clippy::style)] +// Pull in `assert_matches` for tests +#![cfg_attr(test, feature(assert_matches))] use usdt::dtrace_provider; diff --git a/propolis/src/util/regmap.rs b/propolis/src/util/regmap.rs index f78ca177a..78ac05e56 100644 --- a/propolis/src/util/regmap.rs +++ b/propolis/src/util/regmap.rs @@ -4,12 +4,14 @@ use std::ops::Bound::Included; use super::aspace::ASpace; use crate::common::*; +#[derive(Debug)] struct RegDef { id: ID, flags: Flags, } /// Represents a mapping of registers within an address space. +#[derive(Debug)] pub struct RegMap { len: usize, space: ASpace>, diff --git a/propolis/src/vmm/hdl.rs b/propolis/src/vmm/hdl.rs index 5e5717900..61c448b10 100644 --- a/propolis/src/vmm/hdl.rs +++ b/propolis/src/vmm/hdl.rs @@ -144,7 +144,7 @@ impl VmmFile { /// A handle to an existing virtual machine monitor. pub struct VmmHdl { - inner: VmmFile, + pub(super) inner: VmmFile, destroyed: AtomicBool, name: String, } @@ -380,8 +380,10 @@ impl VmmHdl { /// Build a VmmHdl instance suitable for unit tests, but nothing else, since /// it will not be backed by any real vmm reousrces. pub(crate) fn new_test() -> Result { - // TODO: use something else - let fp = File::open("/dev/null")?; + use tempfile::tempfile; + // Create a 2M temp file to use as our VM "memory" + let fp = tempfile()?; + fp.set_len(2 * 1024 * 1024).unwrap(); Ok(Self { inner: VmmFile(fp), destroyed: AtomicBool::new(false), diff --git a/propolis/src/vmm/machine.rs b/propolis/src/vmm/machine.rs index e4ac785bb..6f4d0b867 100644 --- a/propolis/src/vmm/machine.rs +++ b/propolis/src/vmm/machine.rs @@ -125,7 +125,43 @@ impl Machine { // TODO: meaningfully populate these let guard_space = GuardSpace::new(crate::common::PAGE_SIZE)?; - let map = ASpace::new(0, MAX_PHYSMEM); + let mut map = ASpace::new(0, MAX_PHYSMEM); + map.register( + 0, + 1024 * 1024, + MapEnt { + kind: MapKind::SysMem(0, Prot::READ), + name: "test-readable".to_string(), + guest_map: Some(Mapping::new( + 1024 * 1024, + Prot::READ, + &hdl.inner, + 0, + )?), + dev_map: None, + }, + ) + .map_err(|e| { + std::io::Error::new(std::io::ErrorKind::Other, format!("{:?}", e)) + })?; + map.register( + 1024 * 1024, + 1024 * 1024, + MapEnt { + kind: MapKind::SysMem(0, Prot::WRITE), + name: "test-writable".to_string(), + guest_map: Some(Mapping::new( + 1024 * 1024, + Prot::WRITE, + &hdl.inner, + 1024 * 1024, + )?), + dev_map: None, + }, + ) + .map_err(|e| { + std::io::Error::new(std::io::ErrorKind::Other, format!("{:?}", e)) + })?; Ok(Machine { hdl: Arc::new(hdl), diff --git a/propolis/src/vmm/mapping.rs b/propolis/src/vmm/mapping.rs index a63aafa6b..eca4fdf2a 100644 --- a/propolis/src/vmm/mapping.rs +++ b/propolis/src/vmm/mapping.rs @@ -619,7 +619,7 @@ pub mod tests { let input: u64 = 0xDEADBEEF; mapping.as_ref().write(&input).unwrap(); - let output = mapping.as_ref().read().unwrap(); + let output: u64 = mapping.as_ref().read().unwrap(); assert_eq!(input, output); } From 1af299aa64e9b529ac14c4fec36538ad2d745754 Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Tue, 12 Oct 2021 16:14:32 -0400 Subject: [PATCH 22/23] Don't expose dispatcher guts and just use with_ctx method. --- propolis/src/dispatch/mod.rs | 4 +- propolis/src/dispatch/sync_tasks.rs | 2 +- propolis/src/hw/nvme/queue.rs | 679 ++++++++++++++-------------- 3 files changed, 352 insertions(+), 333 deletions(-) diff --git a/propolis/src/dispatch/mod.rs b/propolis/src/dispatch/mod.rs index 71724fded..1477a7075 100644 --- a/propolis/src/dispatch/mod.rs +++ b/propolis/src/dispatch/mod.rs @@ -138,13 +138,13 @@ impl SelfArc for Dispatcher { } } -pub(crate) struct SharedCtx { +struct SharedCtx { mctx: MachineCtx, disp: Weak, inst: Weak, } impl SharedCtx { - pub(crate) fn create(disp: &Dispatcher) -> Self { + fn create(disp: &Dispatcher) -> Self { Self { mctx: MachineCtx::new(&disp.machine), disp: disp.self_weak(), diff --git a/propolis/src/dispatch/sync_tasks.rs b/propolis/src/dispatch/sync_tasks.rs index bbfd4f6c3..59d716696 100644 --- a/propolis/src/dispatch/sync_tasks.rs +++ b/propolis/src/dispatch/sync_tasks.rs @@ -376,7 +376,7 @@ pub struct SyncCtx { } impl SyncCtx { - pub(crate) fn standalone(shared: SharedCtx) -> Self { + pub(super) fn standalone(shared: SharedCtx) -> Self { Self { shared, ctrl: None } } fn for_worker(shared: SharedCtx, ctrl: Arc) -> Self { diff --git a/propolis/src/hw/nvme/queue.rs b/propolis/src/hw/nvme/queue.rs index e17f11c25..506de96a3 100644 --- a/propolis/src/hw/nvme/queue.rs +++ b/propolis/src/hw/nvme/queue.rs @@ -654,72 +654,73 @@ mod tests { use super::*; - use crate::{ - common::GuestAddr, - dispatch::{SharedCtx, SyncCtx}, - instance::Instance, - }; + use crate::{common::GuestAddr, instance::Instance}; use std::assert_matches::assert_matches; use std::io::Error; use std::thread::{sleep, spawn}; use std::time::Duration; - macro_rules! dispctx ( - ($ctx:ident, $instance:expr) => { - let mut sctx = - SyncCtx::standalone(SharedCtx::create(&$instance.disp)); - let $ctx = sctx.dispctx(); - }; - ); - #[test] fn create_cqs() -> Result<(), Error> { let instance = Instance::new_test(None)?; - dispctx!(ctx, instance); - let hdl = pci::MsixHdl::new_test(); let read_base = GuestAddr(0); let write_base = GuestAddr(1024 * 1024); - // Admin queues must be less than 4K - let cq = CompQueue::new( - ADMIN_QUEUE_ID, - 0, - 1024, - write_base, - &ctx, - hdl.clone(), - ); - assert_matches!(cq, Ok(_)); - let cq = CompQueue::new( - ADMIN_QUEUE_ID, - 0, - 5 * 1024, - write_base, - &ctx, - hdl.clone(), - ); - assert_matches!(cq, Err(QueueCreateErr::InvalidSize)); - - // I/O queues must be less than 64K - let cq = CompQueue::new(1, 0, 1024, write_base, &ctx, hdl.clone()); - assert_matches!(cq, Ok(_)); - let cq = CompQueue::new(1, 0, 65 * 1024, write_base, &ctx, hdl.clone()); - assert_matches!(cq, Err(QueueCreateErr::InvalidSize)); - - // Neither must be less than 2 - let cq = - CompQueue::new(ADMIN_QUEUE_ID, 0, 1, write_base, &ctx, hdl.clone()); - assert_matches!(cq, Err(QueueCreateErr::InvalidSize)); - let cq = CompQueue::new(1, 0, 1, write_base, &ctx, hdl.clone()); - assert_matches!(cq, Err(QueueCreateErr::InvalidSize)); - - // Completion Queue's must be mapped to writable memory - let cq = - CompQueue::new(ADMIN_QUEUE_ID, 0, 2, read_base, &ctx, hdl.clone()); - assert_matches!(cq, Err(QueueCreateErr::InvalidBaseAddr)); - let cq = CompQueue::new(1, 0, 2, read_base, &ctx, hdl.clone()); - assert_matches!(cq, Err(QueueCreateErr::InvalidBaseAddr)); + instance.disp.with_ctx(|ctx| { + // Admin queues must be less than 4K + let cq = CompQueue::new( + ADMIN_QUEUE_ID, + 0, + 1024, + write_base, + ctx, + hdl.clone(), + ); + assert_matches!(cq, Ok(_)); + let cq = CompQueue::new( + ADMIN_QUEUE_ID, + 0, + 5 * 1024, + write_base, + ctx, + hdl.clone(), + ); + assert_matches!(cq, Err(QueueCreateErr::InvalidSize)); + + // I/O queues must be less than 64K + let cq = CompQueue::new(1, 0, 1024, write_base, ctx, hdl.clone()); + assert_matches!(cq, Ok(_)); + let cq = + CompQueue::new(1, 0, 65 * 1024, write_base, ctx, hdl.clone()); + assert_matches!(cq, Err(QueueCreateErr::InvalidSize)); + + // Neither must be less than 2 + let cq = CompQueue::new( + ADMIN_QUEUE_ID, + 0, + 1, + write_base, + ctx, + hdl.clone(), + ); + assert_matches!(cq, Err(QueueCreateErr::InvalidSize)); + let cq = CompQueue::new(1, 0, 1, write_base, ctx, hdl.clone()); + assert_matches!(cq, Err(QueueCreateErr::InvalidSize)); + + // Completion Queue's must be mapped to writable memory + let cq = CompQueue::new( + ADMIN_QUEUE_ID, + 0, + 2, + read_base, + ctx, + hdl.clone(), + ); + assert_matches!(cq, Err(QueueCreateErr::InvalidBaseAddr)); + let cq = CompQueue::new(1, 0, 2, read_base, ctx, hdl.clone()); + assert_matches!(cq, Err(QueueCreateErr::InvalidBaseAddr)); + }); Ok(()) } @@ -727,70 +728,76 @@ mod tests { #[test] fn create_sqs() -> Result<(), Error> { let instance = Instance::new_test(None)?; - dispctx!(ctx, instance); - let hdl = pci::MsixHdl::new_test(); let read_base = GuestAddr(0); let write_base = GuestAddr(1024 * 1024); - // Create corresponding CQs - let admin_cq = Arc::new( - CompQueue::new( + instance.disp.with_ctx(|ctx| { + // Create corresponding CQs + let admin_cq = Arc::new( + CompQueue::new( + ADMIN_QUEUE_ID, + 0, + 1024, + write_base, + ctx, + hdl.clone(), + ) + .unwrap(), + ); + let io_cq = Arc::new( + CompQueue::new(1, 0, 1024, write_base, ctx, hdl.clone()) + .unwrap(), + ); + + // Admin queues must be less than 4K + let sq = SubQueue::new( ADMIN_QUEUE_ID, - 0, + admin_cq.clone(), 1024, + read_base, + ctx, + ); + assert_matches!(sq, Ok(_)); + let sq = SubQueue::new( + ADMIN_QUEUE_ID, + admin_cq.clone(), + 5 * 1024, + read_base, + ctx, + ); + assert_matches!(sq, Err(QueueCreateErr::InvalidSize)); + + // I/O queues must be less than 64K + let sq = SubQueue::new(1, io_cq.clone(), 1024, read_base, ctx); + assert_matches!(sq, Ok(_)); + let sq = SubQueue::new(1, io_cq.clone(), 65 * 1024, read_base, ctx); + assert_matches!(sq, Err(QueueCreateErr::InvalidSize)); + + // Neither must be less than 2 + let sq = SubQueue::new( + ADMIN_QUEUE_ID, + admin_cq.clone(), + 1, + read_base, + ctx, + ); + assert_matches!(sq, Err(QueueCreateErr::InvalidSize)); + let sq = SubQueue::new(1, admin_cq.clone(), 1, read_base, ctx); + assert_matches!(sq, Err(QueueCreateErr::InvalidSize)); + + // Completion Queue's must be mapped to readable memory + let sq = SubQueue::new( + ADMIN_QUEUE_ID, + admin_cq.clone(), + 2, write_base, - &ctx, - hdl.clone(), - ) - .unwrap(), - ); - let io_cq = Arc::new( - CompQueue::new(1, 0, 1024, write_base, &ctx, hdl.clone()).unwrap(), - ); - - // Admin queues must be less than 4K - let sq = SubQueue::new( - ADMIN_QUEUE_ID, - admin_cq.clone(), - 1024, - read_base, - &ctx, - ); - assert_matches!(sq, Ok(_)); - let sq = SubQueue::new( - ADMIN_QUEUE_ID, - admin_cq.clone(), - 5 * 1024, - read_base, - &ctx, - ); - assert_matches!(sq, Err(QueueCreateErr::InvalidSize)); - - // I/O queues must be less than 64K - let sq = SubQueue::new(1, io_cq.clone(), 1024, read_base, &ctx); - assert_matches!(sq, Ok(_)); - let sq = SubQueue::new(1, io_cq.clone(), 65 * 1024, read_base, &ctx); - assert_matches!(sq, Err(QueueCreateErr::InvalidSize)); - - // Neither must be less than 2 - let sq = - SubQueue::new(ADMIN_QUEUE_ID, admin_cq.clone(), 1, read_base, &ctx); - assert_matches!(sq, Err(QueueCreateErr::InvalidSize)); - let sq = SubQueue::new(1, admin_cq.clone(), 1, read_base, &ctx); - assert_matches!(sq, Err(QueueCreateErr::InvalidSize)); - - // Completion Queue's must be mapped to readable memory - let sq = SubQueue::new( - ADMIN_QUEUE_ID, - admin_cq.clone(), - 2, - write_base, - &ctx, - ); - assert_matches!(sq, Err(QueueCreateErr::InvalidBaseAddr)); - let sq = SubQueue::new(1, admin_cq.clone(), 2, write_base, &ctx); - assert_matches!(sq, Err(QueueCreateErr::InvalidBaseAddr)); + ctx, + ); + assert_matches!(sq, Err(QueueCreateErr::InvalidBaseAddr)); + let sq = SubQueue::new(1, admin_cq.clone(), 2, write_base, ctx); + assert_matches!(sq, Err(QueueCreateErr::InvalidBaseAddr)); + }); Ok(()) } @@ -798,64 +805,66 @@ mod tests { #[test] fn push_failures() -> Result<(), Error> { let instance = Instance::new_test(None)?; - dispctx!(ctx, instance); let hdl = pci::MsixHdl::new_test(); let read_base = GuestAddr(0); let write_base = GuestAddr(1024 * 1024); - // Create our queues - let cq = Arc::new( - CompQueue::new(1, 0, 4, write_base, &ctx, hdl.clone()).unwrap(), - ); - let sq = - Arc::new(SubQueue::new(1, cq.clone(), 4, read_base, &ctx).unwrap()); + instance.disp.with_ctx(|ctx| { + // Create our queues + let cq = Arc::new( + CompQueue::new(1, 0, 4, write_base, ctx, hdl.clone()).unwrap(), + ); + let sq = Arc::new( + SubQueue::new(1, cq.clone(), 4, read_base, ctx).unwrap(), + ); + + // Replicate guest VM notifying us things were pushed to the SQ + let mut sq_tail = 0; + for _ in 0..sq.state.size - 1 { + sq_tail = sq.state.wrap_add(sq_tail, 1); + // These should all succeed + assert_matches!(sq.notify_tail(sq_tail), Ok(_)); + } - // Replicate guest VM notifying us things were pushed to the SQ - let mut sq_tail = 0; - for _ in 0..sq.state.size - 1 { + // But anything more should fail sq_tail = sq.state.wrap_add(sq_tail, 1); - // These should all succeed - assert_matches!(sq.notify_tail(sq_tail), Ok(_)); - } + assert_matches!( + sq.notify_tail(sq_tail), + Err(QueueUpdateError::TooManyEntries) + ); + + // Also anything that falls outside the boundaries (i.e. we didn't wrap properly) + assert_matches!( + sq.notify_tail(sq.state.size as u16), + Err(QueueUpdateError::InvalidEntry) + ); + + // Now pop those SQ items and complete them in the CQ + while let Some((_, permit)) = sq.pop(ctx) { + permit.push_completion_test(ctx); + } - // But anything more should fail - sq_tail = sq.state.wrap_add(sq_tail, 1); - assert_matches!( - sq.notify_tail(sq_tail), - Err(QueueUpdateError::TooManyEntries) - ); - - // Also anything that falls outside the boundaries (i.e. we didn't wrap properly) - assert_matches!( - sq.notify_tail(sq.state.size as u16), - Err(QueueUpdateError::InvalidEntry) - ); - - // Now pop those SQ items and complete them in the CQ - while let Some((_, permit)) = sq.pop(&ctx) { - permit.push_completion_test(&ctx); - } + // Replicate guest VM notifying us things were consumed off the CQ + let mut cq_head = 0; + for _ in 0..sq.state.size - 1 { + cq_head = cq.state.wrap_add(cq_head, 1); + // These should all succeed + assert_matches!(cq.notify_head(cq_head), Ok(_)); + } - // Replicate guest VM notifying us things were consumed off the CQ - let mut cq_head = 0; - for _ in 0..sq.state.size - 1 { + // There's nothing else to pop so this should fail cq_head = cq.state.wrap_add(cq_head, 1); - // These should all succeed - assert_matches!(cq.notify_head(cq_head), Ok(_)); - } - - // There's nothing else to pop so this should fail - cq_head = cq.state.wrap_add(cq_head, 1); - assert_matches!( - cq.notify_head(cq_head), - Err(QueueUpdateError::TooManyEntries) - ); - - // Also anything that falls outside the boundaries (i.e. we didn't wrap properly) - assert_matches!( - cq.notify_head(cq.state.size as u16), - Err(QueueUpdateError::InvalidEntry) - ); + assert_matches!( + cq.notify_head(cq_head), + Err(QueueUpdateError::TooManyEntries) + ); + + // Also anything that falls outside the boundaries (i.e. we didn't wrap properly) + assert_matches!( + cq.notify_head(cq.state.size as u16), + Err(QueueUpdateError::InvalidEntry) + ); + }); Ok(()) } @@ -863,46 +872,48 @@ mod tests { #[test] fn cq_kicks() -> Result<(), Error> { let instance = Instance::new_test(None)?; - dispctx!(ctx, instance); let hdl = pci::MsixHdl::new_test(); let read_base = GuestAddr(0); let write_base = GuestAddr(1024 * 1024); - // Create our queues - // Purposely make the CQ smaller to test kicks - let cq = Arc::new( - CompQueue::new(1, 0, 2, write_base, &ctx, hdl.clone()).unwrap(), - ); - let sq = - Arc::new(SubQueue::new(1, cq.clone(), 4, read_base, &ctx).unwrap()); - - // Replicate guest VM notifying us things were pushed to the SQ - let mut sq_tail = 0; - for _ in 0..sq.state.size - 1 { - sq_tail = sq.state.wrap_add(sq_tail, 1); - assert_matches!(sq.notify_tail(sq_tail), Ok(_)); - } + instance.disp.with_ctx(|ctx| { + // Create our queues + // Purposely make the CQ smaller to test kicks + let cq = Arc::new( + CompQueue::new(1, 0, 2, write_base, ctx, hdl.clone()).unwrap(), + ); + let sq = Arc::new( + SubQueue::new(1, cq.clone(), 4, read_base, ctx).unwrap(), + ); + + // Replicate guest VM notifying us things were pushed to the SQ + let mut sq_tail = 0; + for _ in 0..sq.state.size - 1 { + sq_tail = sq.state.wrap_add(sq_tail, 1); + assert_matches!(sq.notify_tail(sq_tail), Ok(_)); + } - // We should be able to pop based on how much space is in the CQ - for _ in 0..cq.state.size - 1 { - let pop = sq.pop(&ctx); - assert_matches!(pop, Some(_)); + // We should be able to pop based on how much space is in the CQ + for _ in 0..cq.state.size - 1 { + let pop = sq.pop(ctx); + assert_matches!(pop, Some(_)); - // Complete these in the CQ (but note guest won't have acknowledged them yet) - pop.unwrap().1.push_completion_test(&ctx); - } + // Complete these in the CQ (but note guest won't have acknowledged them yet) + pop.unwrap().1.push_completion_test(ctx); + } - // But we can't pop anymore due to no more CQ space to reserve - assert_matches!(sq.pop(&ctx), None); + // But we can't pop anymore due to no more CQ space to reserve + assert_matches!(sq.pop(ctx), None); - // The guest consuming things off the CQ should let free us - assert_matches!(cq.notify_head(1), Ok(_)); + // The guest consuming things off the CQ should let free us + assert_matches!(cq.notify_head(1), Ok(_)); - // Kick should've been set in the failed pop - assert!(cq.kick()); + // Kick should've been set in the failed pop + assert!(cq.kick()); - // We should have one more space now and should be able to pop 1 more - assert_matches!(sq.pop(&ctx), Some(_)); + // We should have one more space now and should be able to pop 1 more + assert_matches!(sq.pop(ctx), Some(_)); + }); Ok(()) } @@ -910,166 +921,174 @@ mod tests { #[test] fn push_pop() -> Result<(), Error> { let instance = Instance::new_test(None)?; - dispctx!(ctx, instance); let hdl = pci::MsixHdl::new_test(); let read_base = GuestAddr(0); let write_base = GuestAddr(1024 * 1024); - // Create a pair of Completion and Submission Queues - // with a random size. We purposefully give the CQ a smaller - // size to exercise the "kick" conditions where we have some - // request available in the SQ but can't pop it until there's - // space available in the CQ. - let mut rng = rand::thread_rng(); - let sq_size = rng.gen_range(512..2048); - let cq = Arc::new( - CompQueue::new(1, 0, 4, write_base, &ctx, hdl.clone()).unwrap(), - ); - let sq = Arc::new( - SubQueue::new(1, cq.clone(), sq_size, read_base, &ctx).unwrap(), - ); - - // We'll be generating a random number of submissions - let submissions_rand = rng.gen_range(2..sq.state.size - 1); - - let (doorbell_tx, doorbell_rx) = - crossbeam_channel::unbounded::(); - let (workers_tx, workers_rx) = crossbeam_channel::unbounded(); - let (comp_tx, comp_rx) = crossbeam_channel::unbounded(); - - // Create a thread to mimic the main device thread that - // will handle "doorbell" read/write ops. - enum Doorbell { - Cq(u16), - Sq(u16), - } - let (doorbell_cq, doorbell_sq) = (cq.clone(), sq.clone()); - let doorbell_handler = spawn(move || { - // Keep track of the "host" side CQ head and SQ tail as - // we receive "doorbell" hits. - let mut cq_head = 0; - let mut sq_tail = 0; - loop { - match doorbell_rx.recv() { - Ok(Doorbell::Cq(n)) => { - cq_head = doorbell_cq.state.wrap_add(cq_head, n); - assert_matches!( - doorbell_cq.notify_head(cq_head), - Ok(_) - ); - if doorbell_cq.kick() { + instance.disp.with_ctx(|ctx| { + // Create a pair of Completion and Submission Queues + // with a random size. We purposefully give the CQ a smaller + // size to exercise the "kick" conditions where we have some + // request available in the SQ but can't pop it until there's + // space available in the CQ. + let mut rng = rand::thread_rng(); + let sq_size = rng.gen_range(512..2048); + let cq = Arc::new( + CompQueue::new(1, 0, 4, write_base, ctx, hdl.clone()).unwrap(), + ); + let sq = Arc::new( + SubQueue::new(1, cq.clone(), sq_size, read_base, ctx).unwrap(), + ); + + // We'll be generating a random number of submissions + let submissions_rand = rng.gen_range(2..sq.state.size - 1); + + let (doorbell_tx, doorbell_rx) = + crossbeam_channel::unbounded::(); + let (workers_tx, workers_rx) = crossbeam_channel::unbounded(); + let (comp_tx, comp_rx) = crossbeam_channel::unbounded(); + + // Create a thread to mimic the main device thread that + // will handle "doorbell" read/write ops. + enum Doorbell { + Cq(u16), + Sq(u16), + } + let (doorbell_cq, doorbell_sq) = (cq.clone(), sq.clone()); + let doorbell_handler = spawn(move || { + // Keep track of the "host" side CQ head and SQ tail as + // we receive "doorbell" hits. + let mut cq_head = 0; + let mut sq_tail = 0; + loop { + match doorbell_rx.recv() { + Ok(Doorbell::Cq(n)) => { + cq_head = doorbell_cq.state.wrap_add(cq_head, n); + assert_matches!( + doorbell_cq.notify_head(cq_head), + Ok(_) + ); + if doorbell_cq.kick() { + assert!(workers_tx.send(()).is_ok()); + } + } + Ok(Doorbell::Sq(n)) => { + sq_tail = doorbell_sq.state.wrap_add(sq_tail, n); + // The "doorbell" was rung and so let's have the SQ + // update its internal state before poking the workers + assert_matches!( + doorbell_sq.notify_tail(sq_tail), + Ok(_) + ); assert!(workers_tx.send(()).is_ok()); } + Err(_) => break, } - Ok(Doorbell::Sq(n)) => { - sq_tail = doorbell_sq.state.wrap_add(sq_tail, n); - // The "doorbell" was rung and so let's have the SQ - // update its internal state before poking the workers - assert_matches!( - doorbell_sq.notify_tail(sq_tail), - Ok(_) - ); - assert!(workers_tx.send(()).is_ok()); - } - Err(_) => break, } - } - }); - - // Create a number of worker threads to simulate the block - // dev backend workers that will be notified every time the - // SQ "doorbell" is hit and will attempt to pull a new IO - // request off the SQ. At the end, each will return a count - // of how many requests they received and then completed. - let io_workers = (0..4) - .map(|_| { - let worker_instance = instance.clone(); - let worker_rx = workers_rx.clone(); - let worker_sq = sq.clone(); - let worker_comp_tx = comp_tx.clone(); - spawn(move || { - dispctx!(ctx, worker_instance); - let mut rng = rand::thread_rng(); - let mut submissions = 0; - loop { - match worker_rx.recv() { - Ok(()) => { - while let Some((_, cqe_permit)) = - worker_sq.pop(&ctx) - { - submissions += 1; - - // Sleep for a bit to mimic actually doing some - // work before we complete the IO - sleep(Duration::from_micros( - rng.gen_range(0..500), - )); - - cqe_permit.push_completion_test(&ctx); - - // Now signal the "guest" side of the completion handler - assert!(worker_comp_tx.send(()).is_ok()); + }); + + // Create a number of worker threads to simulate the block + // dev backend workers that will be notified every time the + // SQ "doorbell" is hit and will attempt to pull a new IO + // request off the SQ. At the end, each will return a count + // of how many requests they received and then completed. + let io_workers = (0..4) + .map(|_| { + let worker_instance = instance.clone(); + let worker_rx = workers_rx.clone(); + let worker_sq = sq.clone(); + let worker_comp_tx = comp_tx.clone(); + spawn(move || { + let mut submissions = 0; + worker_instance.disp.with_ctx(|ctx| { + let mut rng = rand::thread_rng(); + loop { + match worker_rx.recv() { + Ok(()) => { + while let Some((_, cqe_permit)) = + worker_sq.pop(ctx) + { + submissions += 1; + + // Sleep for a bit to mimic actually doing some + // work before we complete the IO + sleep(Duration::from_micros( + rng.gen_range(0..500), + )); + + cqe_permit + .push_completion_test(ctx); + + // Signal the "guest" side of the completion handler + assert!(worker_comp_tx + .send(()) + .is_ok()); + } + } + Err(_) => break, } } - Err(_) => break submissions, + }); + submissions + }) + }) + .collect::>(); + + // Create a thread to "consume" things off the Completion Queue. + // This simulates the host reacting to our CQ pushes and "ringing" the + // CQ doorbell. At the end, it returns how many completion were handled. + // Regardless, it'll stop after the number of completions is at least + // as many as the number of submissions we decided to generate. + let comp_doorbell_tx = doorbell_tx.clone(); + let comp_handler = spawn(move || { + let exit_after = submissions_rand; + let mut completions = 0; + loop { + match comp_rx.recv() { + Ok(()) => { + // "Ring" the CQ doorbell + // TODO: test completing more than 1 at a time + assert!(comp_doorbell_tx + .send(Doorbell::Cq(1)) + .is_ok()); + completions += 1; } + Err(_) => break completions, } - }) - }) - .collect::>(); - - // Create a thread to "consume" things off the Completion Queue. - // This simulates the host reacting to our CQ pushes and "ringing" the - // CQ doorbell. At the end, it returns how many completion were handled. - // Regardless, it'll stop after the number of completions is at least - // as many as the number of submissions we decided to generate. - let comp_doorbell_tx = doorbell_tx.clone(); - let comp_handler = spawn(move || { - let exit_after = submissions_rand; - let mut completions = 0; - loop { - match comp_rx.recv() { - Ok(()) => { - // "Ring" the CQ doorbell - // TODO: test completing more than 1 at a time - assert!(comp_doorbell_tx.send(Doorbell::Cq(1)).is_ok()); - completions += 1; + if completions >= exit_after { + break completions; } - Err(_) => break completions, } - if completions >= exit_after { - break completions; - } - } - }); + }); - // Now, start generating a random number of submissions - for _ in 0..submissions_rand { - // "Ring" the SQ doorbell - // TODO: test submitting more than 1 at a time - //let doorbell_tx = doorbell_tx.clone(); - assert!(doorbell_tx.send(Doorbell::Sq(1)).is_ok()); + // Now, start generating a random number of submissions + for _ in 0..submissions_rand { + // "Ring" the SQ doorbell + // TODO: test submitting more than 1 at a time + //let doorbell_tx = doorbell_tx.clone(); + assert!(doorbell_tx.send(Doorbell::Sq(1)).is_ok()); - // Sleep up to 100us in between - sleep(Duration::from_micros(rng.gen_range(0..100))); - } - drop(doorbell_tx); + // Sleep up to 100us in between + sleep(Duration::from_micros(rng.gen_range(0..100))); + } + drop(doorbell_tx); - // Wait for the completion handler and its count - let completions: u32 = comp_handler.join().unwrap(); + // Wait for the completion handler and its count + let completions: u32 = comp_handler.join().unwrap(); - // Wait for doorbell handler - doorbell_handler.join().unwrap(); + // Wait for doorbell handler + doorbell_handler.join().unwrap(); - // Wait for the IO workers to complete and sum the total - // number of submissions they recevied - let submissions: u32 = - io_workers.into_iter().map(|j| j.join().unwrap()).sum(); + // Wait for the IO workers to complete and sum the total + // number of submissions they recevied + let submissions: u32 = + io_workers.into_iter().map(|j| j.join().unwrap()).sum(); - // Make sure the number of submission we recevied matched the - // number we generated and completed - assert_eq!(submissions, submissions_rand); - assert_eq!(submissions, completions); + // Make sure the number of submission we recevied matched the + // number we generated and completed + assert_eq!(submissions, submissions_rand); + assert_eq!(submissions, completions); + }); Ok(()) } From 5ac52280347f65c6166e513088a72b22cbe5c3d0 Mon Sep 17 00:00:00 2001 From: Luqman Aden Date: Tue, 12 Oct 2021 18:24:03 -0400 Subject: [PATCH 23/23] nvme: switch back to mutex for queue state to simplify --- propolis/src/hw/nvme/queue.rs | 315 +++++++++++++++------------------- 1 file changed, 135 insertions(+), 180 deletions(-) diff --git a/propolis/src/hw/nvme/queue.rs b/propolis/src/hw/nvme/queue.rs index 506de96a3..96def2905 100644 --- a/propolis/src/hw/nvme/queue.rs +++ b/propolis/src/hw/nvme/queue.rs @@ -1,6 +1,4 @@ -use std::marker::PhantomData; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use super::bits::{self, RawCompletion, RawSubmission}; use super::cmds::Completion; @@ -8,7 +6,6 @@ use crate::common::*; use crate::dispatch::DispCtx; use crate::hw::pci; -use bitstruct::bitstruct; use thiserror::Error; /// Each queue is identified by a 16-bit ID. @@ -38,74 +35,64 @@ const MAX_ADMIN_QUEUE_SIZE: u32 = 1 << 12; /// See NVMe 1.0e Section 1.6.1 Admin Queue pub const ADMIN_QUEUE_ID: QueueId = 0; -/// Marker type to indicate a Completion Queue. +/// Completion Queue State #[derive(Debug)] -enum CompletionQueueType {} +struct CompQueueState { + /// The Queue Head entry pointer. + /// + /// The consumer of entries on a queue uses the current Head entry pointer + /// to identify the next entry to be pulled off the queue. + /// + /// See NVMe 1.0e Section 4.1 Submission Queue & Completion Queue Definition + head: u16, -/// Marker type to indicate a Submission Queue. -#[derive(Debug)] -enum SubmissionQueueType {} - -bitstruct! { - /// Completion Queue State we update atomically. - struct CompQueueState(pub u64) { - /// The Queue Head entry pointer. - /// - /// The consumer of entries on a queue uses the current Head entry pointer - /// to identify the next entry to be pulled off the queue. - /// - /// See NVMe 1.0e Section 4.1 Submission Queue & Completion Queue Definition - head: u16 = 0..16; - - /// The Queue Tail entry pointer. - /// - /// The submitter of entries to a queue uses the current Tail entry pointer - /// to identify the next open queue entry space. - /// - /// See NVMe 1.0e Section 4.1 Submission Queue & Completion Queue Definition - tail: u16 = 16..32; - - /// Number of entries that are available for use. - /// - /// Starts off as queue size - 1 and gets decremented for each corresponding - /// Submission Queue entry we begin servicing. - avail: u16 = 32..48; - - /// The current phase tag. - /// - /// The Phase Tag is used to identify to the host (VM) that a Completion - /// entry is new. Flips every time the Tail entry pointer wraps around. - /// - /// See NVMe 1.0e Section 4.5 Completion Queue Entry - Phase Tag (P) - phase: bool = 48; - - /// Whether the CQ should kick its SQs due to no permits being available previously. - /// - /// One may only pop something off the SQ if there's at least one space available in - /// the corresponding CQ. If there isn't, we set the kick flag. - kick: bool = 49; - } + /// The Queue Tail entry pointer. + /// + /// The submitter of entries to a queue uses the current Tail entry pointer + /// to identify the next open queue entry space. + /// + /// See NVMe 1.0e Section 4.1 Submission Queue & Completion Queue Definition + tail: u16, + + /// Number of entries that are available for use. + /// + /// Starts off as queue size - 1 and gets decremented for each corresponding + /// Submission Queue entry we begin servicing. + avail: u16, + + /// The current phase tag. + /// + /// The Phase Tag is used to identify to the host (VM) that a Completion + /// entry is new. Flips every time the Tail entry pointer wraps around. + /// + /// See NVMe 1.0e Section 4.5 Completion Queue Entry - Phase Tag (P) + phase: bool, + + /// Whether the CQ should kick its SQs due to no permits being available previously. + /// + /// One may only pop something off the SQ if there's at least one space available in + /// the corresponding CQ. If there isn't, we set the kick flag. + kick: bool, } -bitstruct! { - /// Submission Queue State we update atomically. - struct SubQueueState(pub u64) { - /// The Queue Head entry pointer. - /// - /// The consumer of entries on a queue uses the current Head entry pointer - /// to identify the next entry to be pulled off the queue. - /// - /// See NVMe 1.0e Section 4.1 Submission Queue & Completion Queue Definition - head: u16 = 0..16; - - /// The Queue Tail entry pointer. - /// - /// The submitter of entries to a queue uses the current Tail entry pointer - /// to identify the next open queue entry space. - /// - /// See NVMe 1.0e Section 4.1 Submission Queue & Completion Queue Definition - tail: u16 = 16..32; - } +/// Submission Queue State +#[derive(Debug)] +struct SubQueueState { + /// The Queue Head entry pointer. + /// + /// The consumer of entries on a queue uses the current Head entry pointer + /// to identify the next entry to be pulled off the queue. + /// + /// See NVMe 1.0e Section 4.1 Submission Queue & Completion Queue Definition + head: u16, + + /// The Queue Tail entry pointer. + /// + /// The submitter of entries to a queue uses the current Tail entry pointer + /// to identify the next open queue entry space. + /// + /// See NVMe 1.0e Section 4.1 Submission Queue & Completion Queue Definition + tail: u16, } /// Helper for manipulating Completion/Submission Queues @@ -115,23 +102,20 @@ bitstruct! { /// is a Completion or Submission queue. Use either /// `CompletionQueueType` or `SubmissionQueueType`. #[derive(Debug)] -struct QueueState { +struct QueueState { /// The size of the queue in question. /// /// See NVMe 1.0e Section 4.1.3 Queue Size size: u32, - /// The actual atomic state that gets updated during the normal course of operation. + /// The actual queue state that gets updated during the normal course of operation. /// /// Either `CompQueueState` for a Completion Queue or /// a `SubQueueState` for a Submission Queue. - inner: AtomicU64, - - /// Marker type to indicate what type of Queue we're modeling. - _qt: PhantomData, + inner: Mutex, } -impl QueueState { +impl QueueState { /// Returns if the queue is currently empty with the given head and tail pointers. /// /// A queue is empty when the Head entry pointer equals the Tail entry pointer. @@ -181,16 +165,21 @@ impl QueueState { } } -impl QueueState { +impl QueueState { /// Create a new `QueueState` for a Completion Queue - fn new_completion_state(size: u32) -> QueueState { + fn new_completion_state(size: u32) -> QueueState { assert!(size >= MIN_QUEUE_SIZE && size <= MAX_QUEUE_SIZE); // As the device side, we start with our phase tag as asserted (1) // as the host side (VM) will create all the Completion Queue entries // with the phase initially zeroed out. - let inner = - CompQueueState(0).with_phase(true).with_avail((size - 1) as u16); - Self { size, inner: AtomicU64::new(inner.0), _qt: PhantomData } + let inner = CompQueueState { + head: 0, + tail: 0, + avail: (size - 1) as u16, + phase: true, + kick: false, + }; + Self { size, inner: Mutex::new(inner) } } /// Attempt to return the Tail entry pointer and then move it forward by 1. @@ -199,21 +188,17 @@ impl QueueState { /// Otherwise, this method returns the current Tail entry pointer and then /// increments the Tail entry pointer by 1 (wrapping if necessary). fn push_tail(&self) -> Option { - self.inner - .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |state| { - let mut state = CompQueueState(state); - if self.is_full(state.head(), state.tail()) { - return None; - } - if state.tail() as u32 + 1 >= self.size { - // We wrapped so flip phase - state.set_phase(!state.phase()); - } - let new_tail = self.wrap_add(state.tail(), 1); - Some(state.with_tail(new_tail).0) - }) - .ok() - .map(|state| CompQueueState(state).tail()) + let mut state = self.inner.lock().unwrap(); + if self.is_full(state.head, state.tail) { + return None; + } + if state.tail as u32 + 1 >= self.size { + // We wrapped so flip phase + state.phase = !state.phase; + } + let old_tail = state.tail; + state.tail = self.wrap_add(old_tail, 1); + Some(old_tail) } /// How many slots are occupied between the head and the tail i.e., how @@ -232,28 +217,25 @@ impl QueueState { if idx as u32 >= self.size { return Err(QueueUpdateError::InvalidEntry); } - self.inner - .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |state| { - let state = CompQueueState(state); - let pop_count = self.wrap_sub(idx, state.head()); - if pop_count > self.avail_occupied(state.head(), state.tail()) { - return None; - } - // Replace head with given idx and update the number of available slots - let avail = state.avail() + pop_count; - Some(state.with_head(idx).with_avail(avail).0) - }) - .map_err(|_| QueueUpdateError::TooManyEntries) - .map(|_| ()) + let mut state = self.inner.lock().unwrap(); + let pop_count = self.wrap_sub(idx, state.head); + if pop_count > self.avail_occupied(state.head, state.tail) { + return Err(QueueUpdateError::TooManyEntries); + } + // Replace head with given idx and update the number of available slots + state.head = idx; + state.avail += pop_count; + + Ok(()) } } -impl QueueState { +impl QueueState { /// Create a new `QueueState` for a Submission Queue - fn new_submission_state(size: u32) -> QueueState { + fn new_submission_state(size: u32) -> QueueState { assert!(size >= MIN_QUEUE_SIZE && size <= MAX_QUEUE_SIZE); - let inner = SubQueueState(0); - Self { size, inner: AtomicU64::new(inner.0), _qt: PhantomData } + let inner = SubQueueState { head: 0, tail: 0 }; + Self { size, inner: Mutex::new(inner) } } /// Attempt to return the Head entry pointer and then move it forward by 1. @@ -262,17 +244,13 @@ impl QueueState { /// Otherwise, this method returns the current Head entry pointer and then /// increments the Head entry pointer by 1 (wrapping if necessary). fn pop_head(&self) -> Option { - self.inner - .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |state| { - let state = SubQueueState(state); - if self.is_empty(state.head(), state.tail()) { - return None; - } - let new_head = self.wrap_add(state.head(), 1); - Some(state.with_head(new_head).0) - }) - .ok() - .map(|state| SubQueueState(state).head()) + let mut state = self.inner.lock().unwrap(); + if self.is_empty(state.head, state.tail) { + return None; + } + let old_head = state.head; + state.head = self.wrap_add(old_head, 1); + Some(old_head) } /// How many slots are empty between the tail and the head i.e., how many @@ -291,18 +269,15 @@ impl QueueState { if idx as u32 >= self.size { return Err(QueueUpdateError::InvalidEntry); } - self.inner - .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |state| { - let state = SubQueueState(state); - let push_count = self.wrap_sub(idx, state.tail()); - if push_count > self.avail_empty(state.head(), state.tail()) { - return None; - } - // Replace tail with given idx - Some(state.with_tail(idx).0) - }) - .map_err(|_| QueueUpdateError::TooManyEntries) - .map(|_| ()) + let mut state = self.inner.lock().unwrap(); + let push_count = self.wrap_sub(idx, state.tail); + if push_count > self.avail_empty(state.head, state.tail) { + return Err(QueueUpdateError::TooManyEntries); + } + // Replace tail with given idx + state.tail = idx; + + Ok(()) } } @@ -340,7 +315,7 @@ pub struct SubQueue { cq: Arc, /// Queue state such as the size and current head/tail entry pointers. - state: QueueState, + state: QueueState, /// The [`GuestAddr`] at which the Queue is mapped. base: GuestAddr, @@ -386,7 +361,8 @@ impl SubQueue { /// Returns the current Head entry pointer. fn head(&self) -> u16 { - SubQueueState(self.state.inner.load(Ordering::SeqCst)).head() + let state = self.state.inner.lock().unwrap(); + state.head } /// Returns the ID of this Submission Queue. @@ -438,7 +414,7 @@ pub struct CompQueue { iv: u16, /// Queue state such as the size and current head/tail entry pointers. - state: QueueState, + state: QueueState, /// The [`GuestAddr`] at which the Queue is mapped. base: GuestAddr, @@ -475,24 +451,20 @@ impl CompQueue { /// Fires an interrupt to the guest with the associated interrupt vector /// if the queue is not currently empty. pub fn fire_interrupt(&self, ctx: &DispCtx) { - let state = CompQueueState(self.state.inner.load(Ordering::SeqCst)); - if !self.state.is_empty(state.head(), state.tail()) { + let state = self.state.inner.lock().unwrap(); + if !self.state.is_empty(state.head, state.tail) { self.hdl.fire(self.iv, ctx); } } /// Returns whether the SQ's should be kicked due to no permits being available previously. + /// + /// If the value was true, it will also get reset to false. pub fn kick(&self) -> bool { - self.state - .inner - .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |state| { - let state = CompQueueState(state); - Some(state.with_kick(false).0) - }) - .ok() - .map(|old_state| CompQueueState(old_state).kick()) - // This unwrap is fine as our fetch_update never returns None - .unwrap() + let mut state = self.state.inner.lock().unwrap(); + let old_kick = state.kick; + state.kick = false; + old_kick } /// Attempt to reserve an entry in the Completion Queue. @@ -502,27 +474,17 @@ impl CompQueue { self: &Arc, sq: Arc, ) -> Option { - let old_state = self - .state - .inner - .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |state| { - let state = CompQueueState(state); - let avail = state.avail(); - // No more spots available - if avail == 0 { - // Make sure we kick the SQ's when we have space available again - Some(state.with_kick(true).0) - } else { - // Otherwise claim a spot - Some(state.with_avail(avail - 1).0) - } - }) - // Unwrap here is fine as our fetch_update never returns None. - .unwrap(); - let old_state = CompQueueState(old_state); - if old_state.avail() == 0 { + let mut state = self.state.inner.lock().unwrap(); + // No more spots available + if state.avail == 0 { + // Make sure we kick the SQ's when we have space available again + state.kick = true; + None } else { + // Otherwise claim a spot + state.avail -= 1; + Some(CompQueueEntryPermit { cq: self.clone(), sq }) } } @@ -550,7 +512,8 @@ impl CompQueue { /// /// See NVMe 1.0e Section 4.5 Completion Queue Entry - Phase Tag (P) fn phase(&self) -> u16 { - CompQueueState(self.state.inner.load(Ordering::SeqCst)).phase() as u16 + let state = self.state.inner.lock().unwrap(); + state.phase as u16 } /// Returns the corresponding [`GuestAddr`] for a given entry in @@ -635,16 +598,8 @@ impl CompQueueEntryPermit { /// /// Frees up the space for someone else to grab it via `CompQueue::reserve_entry`. fn remit(self) { - self.cq - .state - .inner - .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |state| { - let state = CompQueueState(state); - let avail = state.avail(); - Some(state.with_avail(avail + 1).0) - }) - // Unwrap here is fine as our fetch_update never returns None - .unwrap(); + let mut state = self.cq.state.inner.lock().unwrap(); + state.avail += 1; } }