From 6545d7325bea8832f322c5d4557844cb519392c9 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 5 Sep 2025 16:22:37 -0700 Subject: [PATCH 01/11] WIP: pretty good changes so far --- .../background/tasks/blueprint_execution.rs | 9 +- .../app/background/tasks/blueprint_planner.rs | 7 +- nexus/src/app/mod.rs | 7 +- nexus/src/app/quiesce.rs | 175 ++++++++++++++++-- 4 files changed, 176 insertions(+), 22 deletions(-) diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index 443ec5eec18..edce51ef35b 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -428,11 +428,16 @@ mod test { let mut task = BlueprintExecutor::new( datastore.clone(), resolver.clone(), - blueprint_rx, + blueprint_rx.clone(), OmicronZoneUuid::new_v4(), Activator::new(), dummy_tx, - NexusQuiesceHandle::new(&opctx.log, datastore.clone()), + NexusQuiesceHandle::new( + &opctx.log, + datastore.clone(), + OmicronZoneUuid::new_v4(), + blueprint_rx, + ), ); // Now we're ready. diff --git a/nexus/src/app/background/tasks/blueprint_planner.rs b/nexus/src/app/background/tasks/blueprint_planner.rs index 41531985d54..54d8434d7de 100644 --- a/nexus/src/app/background/tasks/blueprint_planner.rs +++ b/nexus/src/app/background/tasks/blueprint_planner.rs @@ -436,7 +436,12 @@ mod test { OmicronZoneUuid::new_v4(), Activator::new(), dummy_tx, - NexusQuiesceHandle::new(&opctx.log, datastore.clone()), + NexusQuiesceHandle::new( + &opctx.log, + datastore.clone(), + OmicronZoneUuid::new_v4(), + rx_loader.clone(), + ), ); let value = executor.activate(&opctx).await; let value = value.as_object().expect("response is not a JSON object"); diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index d4dab2bb687..32db5909566 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -334,7 +334,12 @@ impl Nexus { sec_store, )); - let quiesce = NexusQuiesceHandle::new(&log, db_datastore.clone()); + let quiesce = NexusQuiesceHandle::new( + &log, + db_datastore.clone(), + config.deployment.id.into(), + rx_loader, + ); // It's a bit of a red flag to use an unbounded channel. // diff --git a/nexus/src/app/quiesce.rs b/nexus/src/app/quiesce.rs index 674366d8ecc..55a654f6c97 100644 --- a/nexus/src/app/quiesce.rs +++ b/nexus/src/app/quiesce.rs @@ -6,16 +6,25 @@ use assert_matches::assert_matches; use chrono::Utc; +use nexus_db_model::DbMetadataNexus; +use nexus_db_model::DbMetadataNexusState; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; +use nexus_types::deployment::Blueprint; +use nexus_types::deployment::BlueprintTarget; +use nexus_types::deployment::BlueprintZoneDisposition; +use nexus_types::deployment::BlueprintZoneType; use nexus_types::internal_api::views::QuiesceState; use nexus_types::internal_api::views::QuiesceStatus; use nexus_types::quiesce::SagaQuiesceHandle; use omicron_common::api::external::LookupResult; use omicron_common::api::external::UpdateResult; +use omicron_uuid_kinds::BlueprintUuid; +use omicron_uuid_kinds::OmicronZoneUuid; use slog::Logger; use std::sync::Arc; +use std::time::Duration; use std::time::Instant; use tokio::sync::watch; @@ -43,17 +52,34 @@ impl super::Nexus { pub struct NexusQuiesceHandle { log: Logger, datastore: Arc, + my_nexus_id: OmicronZoneUuid, sagas: SagaQuiesceHandle, + latest_blueprint: + watch::Receiver>>, state: watch::Sender, } impl NexusQuiesceHandle { - pub fn new(log: &Logger, datastore: Arc) -> NexusQuiesceHandle { + pub fn new( + log: &Logger, + datastore: Arc, + my_nexus_id: OmicronZoneUuid, + latest_blueprint: watch::Receiver< + Option>, + >, + ) -> NexusQuiesceHandle { let my_log = log.new(o!("component" => "NexusQuiesceHandle")); let saga_quiesce_log = log.new(o!("component" => "SagaQuiesceHandle")); let sagas = SagaQuiesceHandle::new(saga_quiesce_log); let (state, _) = watch::channel(QuiesceState::Undetermined); - NexusQuiesceHandle { log: my_log, datastore, sagas, state } + NexusQuiesceHandle { + log: my_log, + datastore, + my_nexus_id, + sagas, + latest_blueprint, + state, + } } pub fn sagas(&self) -> SagaQuiesceHandle { @@ -115,28 +141,126 @@ impl NexusQuiesceHandle { async fn do_quiesce(quiesce: NexusQuiesceHandle) { let saga_quiesce = quiesce.sagas.clone(); let datastore = quiesce.datastore.clone(); + let my_nexus_id = quiesce.my_nexus_id; - // NOTE: This sequence will change as we implement RFD 588. - // We will need to use the datastore to report our saga drain status and - // also to see when other Nexus instances have finished draining their - // sagas. For now, this implementation begins quiescing its database as - // soon as its sagas are locally drained. assert_matches!( *quiesce.state.borrow(), QuiesceState::DrainingSagas { .. } ); - // TODO per RFD 588, this is where we will enter a loop, pausing either on - // timeout or when our local quiesce state changes. At each pause: if we - // need to update our db_metadata_nexus record, do so. Then load the - // current blueprint and check the records for all nexus instances. This - // work is covered by a combination of oxidecomputer/omicron#8859, - // oxidecomputer/omicron#8857, and oxidecomputer/omicron#8796. + // We've recorded that we should be quiescing. This will prevent new sagas + // from starting. However, we can still wind up running new sagas if a + // blueprint execution re-assigns us saga from an expunged Nexus. + // + // So here's the plan: we keep track of the blueprint id as of which we have + // fully drained. That means that we've processed all Nexus expungements up + // to that blueprint, _and_ we've recovered all sagas assigned to us, _and + // finished running them all. When this blueprint id changes (because we've + // processed re-assignments for a new blueprint), we'll do two things: + // + // (1) update our `db_metadata_nexus` record to reflect the change // - // For now, we skip the cross-Nexus coordination and simply wait for our own - // Nexus to finish what it's doing. - saga_quiesce.wait_for_drained().await; + // (2) check the `db_metadata_nexus` records of the other active Nexus + // instances to see if they've drained as of the same blueprint. If so, + // then all of the active Nexus instances have finished all sagas in the + // system and we can proceed to quiesce the database. + // + // If the other Nexus instances aren't drained up through the same + // blueprint, we'll check again shortly. + // + // One obvious case that's not handled here is that it's possible that other + // Nexus instances are drained as of a *subsequent* blueprint than the one + // we're looking for. That's no problem. We'll check again with the latest + // blueprint on the next lap. For this to converge, we have to assume + // blueprints aren't changing faster than we're checking. That's a pretty + // safe assumption here. They shouldn't be changing at all at this point + // unless reacting to something unrelated to the upgrade (e.g., a sled + // expungement), and we are checking awfully frequently, too. + const POLL_TIMEOUT: Duration = Duration::from_secs(1); + let mut last_recorded_blueprint_id: Option = None; + loop { + // Grab the most recently loaded blueprint. As usual, we clone to avoid + // locking the watch channel for the lifetime of this value. + let Some(_target, current_blueprint) = + quiesce.latest_blueprint.borrow().clone() + else { + // XXX-dap log + continue; + }; + + // Determine our own Nexus generation number. + let my_generation = todo!(); // XXX-dap + + // Wait up to a second for sagas to become drained as of this blueprint. + let Ok(try_wait) = tokio::time::timeout( + POLL_TIMEOUT, + saga_quiesce.wait_for_drained_blueprint(current_blueprint.id), + ) else { + // We're still not drained as of this blueprint. But the + // current target blueprint could have changed. So take another + // lap. + continue; + }; + + // We're drained up through this blueprint. First, update our record to + // reflect that, if we haven't already. + let try_update = datastore + .nexus_metadata_update_drained_blueprint( + my_nexus_id, + current_blueprint.id, + ) + .await; + if let Err(error) = try_update { + // Take another lap. + // XXX-dap log error + continue; + }; + + last_recorded_blueprint_id = Some(current_blueprint.id); + + // Now, see if everybody else is drained up to the same point, or if any + // of them is already quiesced. (That means *they* already saw that we + // were all drained up to the same point, which is good enough for us to + // proceed.) + let other_nexus_ids = current_blueprint + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter_map(|(_sled_id, zone)| { + if let BlueprintZoneType::Nexus(nexus) = zone.zone_type { + (nexus.nexus_generation == my_generation).then(zone.id) + } else { + None + } + }); + assert!(!other_nexus_ids.is_empty()); + let Ok(other_records): Result, _> = + datastore.nexus_metadata_load(other_nexus_ids).await + else { + // XXX-dap log error + continue; + }; + + if other_records + .iter() + .any(|r| r.state() == DbMetadataNexusState::Quiesced) + { + // XXX-dap log + break; + } + + if other_records.iter().all(|r| r.last_drained_blueprint_id) + == last_recorded_blueprint_id + { + // XXX-dap log + break; + } + // XXX-dap log take another lap + } + + // We're ready to hand off. + // + // XXX-dap write down that this fixes oxidecomputer/omicron#8859, + // oxidecomputer/omicron#8857, and oxidecomputer/omicron#8796 quiesce.state.send_modify(|q| { let QuiesceState::DrainingSagas { time_requested, @@ -176,8 +300,13 @@ async fn do_quiesce(quiesce: NexusQuiesceHandle) { }; }); - // TODO per RFD 588, this is where we will enter a loop trying to update our - // database record for the last time. See oxidecomputer/omicron#8971. + // XXX-dap write that this fixes omicron#8971. + while let Err(error) = + datastore.nexus_metadata_update_quiesced(my_nexus_id).await + { + // XXX-dap log and sleep + let _ = tokio::time::sleep(POLL_TIMEOUT).await; + } quiesce.state.send_modify(|q| { let QuiesceState::RecordingQuiesce { @@ -201,6 +330,16 @@ async fn do_quiesce(quiesce: NexusQuiesceHandle) { duration_total: finished - time_draining_sagas, }; }); + + // We're done! Thank you for service, Nexus `$nexus_id`. + // + // At this point, although most of the rest of Nexus is still running (e.g., + // background tasks) and it can even receive HTTP requests, everything that + // tries to use the database will fail. That's just about everything. + // + // This process will hang around until the new Nexus instances expunge this + // one. This is useful, since debugging tools and tests can still query + // this Nexus for its quiesce state. } #[cfg(test)] From cba808dcbefab2f9c6814c3b6f1a12f36b504a2e Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 5 Sep 2025 17:34:45 -0700 Subject: [PATCH 02/11] WIP: cont --- .../src/db/datastore/db_metadata.rs | 69 +++++++++++ nexus/db-queries/src/db/pool.rs | 12 ++ nexus/src/app/background/init.rs | 11 +- .../app/background/tasks/blueprint_load.rs | 9 +- .../app/background/tasks/blueprint_planner.rs | 4 +- nexus/src/app/mod.rs | 4 +- nexus/src/app/quiesce.rs | 109 +++++++++++++----- nexus/types/src/quiesce.rs | 28 ++++- 8 files changed, 208 insertions(+), 38 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index 4d679d2dfef..dd982ee1388 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -25,10 +25,12 @@ use nexus_db_model::SchemaUpgradeStep; use nexus_db_model::SchemaVersion; use nexus_types::deployment::BlueprintZoneDisposition; use omicron_common::api::external::Error; +use omicron_uuid_kinds::BlueprintUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; use semver::Version; use slog::{Logger, error, info, o}; +use std::collections::BTreeSet; use std::ops::Bound; use std::str::FromStr; @@ -683,6 +685,73 @@ impl DataStore { Ok(()) } + /// Returns information about access for all of the given Nexus ids + /// + /// This set is assumed to be pretty small. + // XXX-dap provide OpContext and do authz check + pub async fn database_nexus_access_all( + &self, + nexus_ids: &BTreeSet, + ) -> Result, Error> { + use nexus_db_schema::schema::db_metadata_nexus::dsl; + + let db_nexus_ids: BTreeSet<_> = nexus_ids + .iter() + .cloned() + .map(nexus_db_model::to_db_typed_uuid) + .collect(); + dsl::db_metadata_nexus + .filter(dsl::nexus_id.eq_any(db_nexus_ids)) + .load_async(&*self.pool_connection_unauthorized().await?) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Updates the "last_drained_blueprint_id" for the given Nexus id + pub async fn database_nexus_access_update_blueprint( + &self, + nexus_id: OmicronZoneUuid, + blueprint_id: Option, + ) -> Result { + use nexus_db_schema::schema::db_metadata_nexus::dsl; + + let nexus_id = nexus_db_model::to_db_typed_uuid(nexus_id); + let blueprint_id = blueprint_id.map(nexus_db_model::to_db_typed_uuid); + + // XXX-dap accept OpContext, do authz check + let conn = self.pool_connection_unauthorized().await?; + let count = diesel::update(dsl::db_metadata_nexus) + .filter(dsl::nexus_id.eq(nexus_id)) + // To be conservative, we'll only update this value if the record is + // currently active. There's no reason it should ever not be active + // if we're calling this function and if there were an easy way to + // return an error in that case, we'd just do that. + .filter(dsl::state.eq(DbMetadataNexusState::Active)) + .set(dsl::last_drained_blueprint_id.eq(blueprint_id)) + .execute_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + Ok(count) + } + + /// Updates the "last_drained_blueprint_id" for the given Nexus id + pub async fn database_nexus_access_update_quiesced( + &self, + nexus_id: OmicronZoneUuid, + ) -> Result { + use nexus_db_schema::schema::db_metadata_nexus::dsl; + + let nexus_id = nexus_db_model::to_db_typed_uuid(nexus_id); + let conn = self.pool.claim_quiesced().await?; + let count = diesel::update(dsl::db_metadata_nexus) + .filter(dsl::nexus_id.eq(nexus_id)) + .set(dsl::state.eq(DbMetadataNexusState::Quiesced)) + .execute_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + Ok(count) + } + // Returns the access this Nexus has to the database async fn database_nexus_access( &self, diff --git a/nexus/db-queries/src/db/pool.rs b/nexus/db-queries/src/db/pool.rs index 28a76a14965..65473aad14e 100644 --- a/nexus/db-queries/src/db/pool.rs +++ b/nexus/db-queries/src/db/pool.rs @@ -232,6 +232,18 @@ impl Pool { }) } + /// Returns a connection from the pool, bypassing the quiesce check + /// + /// This is only intended for use *during* quiesce to update our final + /// database record. + pub async fn claim_quiesced( + &self, + ) -> Result, Error> { + self.inner.claim().await.map_err(|err| { + Error::unavail(&format!("Failed to access DB connection: {err}")) + }) + } + /// Disables creation of all new database claims /// /// This is currently a one-way trip. The pool cannot be un-quiesced. diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index afaf57f82a7..17dc3e576ff 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -140,6 +140,8 @@ use nexus_config::DnsTasksConfig; use nexus_db_model::DnsGroup; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; +use nexus_types::deployment::Blueprint; +use nexus_types::deployment::BlueprintTarget; use nexus_types::deployment::PendingMgsUpdates; use omicron_uuid_kinds::OmicronZoneUuid; use oximeter::types::ProducerRegistry; @@ -426,8 +428,10 @@ impl BackgroundTasksInitializer { // Background task: blueprint loader // // Registration is below so that it can watch the planner. - let blueprint_loader = - blueprint_load::TargetBlueprintLoader::new(datastore.clone()); + let blueprint_loader = blueprint_load::TargetBlueprintLoader::new( + datastore.clone(), + args.blueprint_load_tx, + ); let rx_blueprint = blueprint_loader.watcher(); // Background task: blueprint executor @@ -1022,6 +1026,9 @@ pub struct BackgroundTasksData { pub saga_recovery: saga_recovery::SagaRecoveryHelpers>, /// Channel for TUF repository artifacts to be replicated out to sleds pub tuf_artifact_replication_rx: mpsc::Receiver, + /// Channel for exposing the latest loaded blueprint + pub blueprint_load_tx: + watch::Sender>>, /// `reqwest::Client` for webhook delivery requests. /// /// This is shared with the external API as it's also used when sending diff --git a/nexus/src/app/background/tasks/blueprint_load.rs b/nexus/src/app/background/tasks/blueprint_load.rs index 7b7f546388d..469be135fb6 100644 --- a/nexus/src/app/background/tasks/blueprint_load.rs +++ b/nexus/src/app/background/tasks/blueprint_load.rs @@ -24,8 +24,10 @@ pub struct TargetBlueprintLoader { } impl TargetBlueprintLoader { - pub fn new(datastore: Arc) -> TargetBlueprintLoader { - let (tx, _) = watch::channel(None); + pub fn new( + datastore: Arc, + tx: watch::Sender>>, + ) -> TargetBlueprintLoader { TargetBlueprintLoader { datastore, last: None, tx } } @@ -256,7 +258,8 @@ mod test { datastore.clone(), ); - let mut task = TargetBlueprintLoader::new(datastore.clone()); + let (tx, _) = watch::channel(None); + let mut task = TargetBlueprintLoader::new(datastore.clone(), tx); let mut rx = task.watcher(); // We expect to see the initial blueprint set up by nexus-test-utils diff --git a/nexus/src/app/background/tasks/blueprint_planner.rs b/nexus/src/app/background/tasks/blueprint_planner.rs index 54d8434d7de..236f0964688 100644 --- a/nexus/src/app/background/tasks/blueprint_planner.rs +++ b/nexus/src/app/background/tasks/blueprint_planner.rs @@ -313,7 +313,9 @@ mod test { ); // Spin up the blueprint loader background task. - let mut loader = TargetBlueprintLoader::new(datastore.clone()); + let (tx_loader, _) = watch::channel(None); + let mut loader = + TargetBlueprintLoader::new(datastore.clone(), tx_loader); let mut rx_loader = loader.watcher(); loader.activate(&opctx).await; let (_initial_target, initial_blueprint) = &*rx_loader diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 32db5909566..12436d6bbfa 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -334,11 +334,12 @@ impl Nexus { sec_store, )); + let (blueprint_load_tx, blueprint_load_rx) = watch::channel(None); let quiesce = NexusQuiesceHandle::new( &log, db_datastore.clone(), config.deployment.id.into(), - rx_loader, + blueprint_load_rx, ); // It's a bit of a red flag to use an unbounded channel. @@ -606,6 +607,7 @@ impl Nexus { }, tuf_artifact_replication_rx, mgs_updates_tx, + blueprint_load_tx, }, ); diff --git a/nexus/src/app/quiesce.rs b/nexus/src/app/quiesce.rs index 55a654f6c97..cca58f1f7d4 100644 --- a/nexus/src/app/quiesce.rs +++ b/nexus/src/app/quiesce.rs @@ -23,6 +23,7 @@ use omicron_common::api::external::UpdateResult; use omicron_uuid_kinds::BlueprintUuid; use omicron_uuid_kinds::OmicronZoneUuid; use slog::Logger; +use std::collections::BTreeSet; use std::sync::Arc; use std::time::Duration; use std::time::Instant; @@ -181,21 +182,45 @@ async fn do_quiesce(quiesce: NexusQuiesceHandle) { loop { // Grab the most recently loaded blueprint. As usual, we clone to avoid // locking the watch channel for the lifetime of this value. - let Some(_target, current_blueprint) = - quiesce.latest_blueprint.borrow().clone() + let Some(current_blueprint): Option = quiesce + .latest_blueprint + .borrow() + .as_deref() + .map(|(_target, blueprint)| blueprint) + .cloned() else { // XXX-dap log + // XXX-dap need to sleep here, but we won't. Could we use + // wait_until() it's non-None? continue; }; // Determine our own Nexus generation number. - let my_generation = todo!(); // XXX-dap + // + // This doesn't change across iterations of the loop. But we need a + // blueprint to figure it out, and we don't have one until we're in this + // loop. + let Some(my_generation) = current_blueprint + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .find_map(|(_sled_id, zone)| { + if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { + (zone.id == my_nexus_id).then_some(nexus.nexus_generation) + } else { + None + } + }) + else { + // XXX-dap log, and also need to sleep here + continue; + }; // Wait up to a second for sagas to become drained as of this blueprint. - let Ok(try_wait) = tokio::time::timeout( + let Ok(_) = tokio::time::timeout( POLL_TIMEOUT, saga_quiesce.wait_for_drained_blueprint(current_blueprint.id), - ) else { + ) + .await + else { // We're still not drained as of this blueprint. But the // current target blueprint could have changed. So take another // lap. @@ -204,36 +229,48 @@ async fn do_quiesce(quiesce: NexusQuiesceHandle) { // We're drained up through this blueprint. First, update our record to // reflect that, if we haven't already. - let try_update = datastore - .nexus_metadata_update_drained_blueprint( - my_nexus_id, - current_blueprint.id, - ) - .await; - if let Err(error) = try_update { - // Take another lap. - // XXX-dap log error - continue; - }; + match last_recorded_blueprint_id { + Some(blueprint_id) if blueprint_id == current_blueprint.id => (), + _ => { + let try_update = datastore + .database_nexus_access_update_blueprint( + my_nexus_id, + Some(current_blueprint.id), + ) + .await; + match try_update { + Err(error) => { + // Take another lap. + // XXX-dap log error + continue; + } + Ok(0) => (), + Ok(count) => { + // XXX-dap log + } + } - last_recorded_blueprint_id = Some(current_blueprint.id); + last_recorded_blueprint_id = Some(current_blueprint.id); + } + }; // Now, see if everybody else is drained up to the same point, or if any // of them is already quiesced. (That means *they* already saw that we // were all drained up to the same point, which is good enough for us to // proceed.) - let other_nexus_ids = current_blueprint + let other_nexus_ids: BTreeSet = current_blueprint .all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter_map(|(_sled_id, zone)| { - if let BlueprintZoneType::Nexus(nexus) = zone.zone_type { - (nexus.nexus_generation == my_generation).then(zone.id) + if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { + (nexus.nexus_generation == my_generation).then_some(zone.id) } else { None } - }); + }) + .collect(); assert!(!other_nexus_ids.is_empty()); let Ok(other_records): Result, _> = - datastore.nexus_metadata_load(other_nexus_ids).await + datastore.database_nexus_access_all(&other_nexus_ids).await else { // XXX-dap log error continue; @@ -247,9 +284,14 @@ async fn do_quiesce(quiesce: NexusQuiesceHandle) { break; } - if other_records.iter().all(|r| r.last_drained_blueprint_id) - == last_recorded_blueprint_id - { + if other_records.len() < other_nexus_ids.len() { + // XXX-dap log + continue; + } + + if other_records.iter().all(|r| { + r.last_drained_blueprint_id() == last_recorded_blueprint_id + }) { // XXX-dap log break; } @@ -301,11 +343,18 @@ async fn do_quiesce(quiesce: NexusQuiesceHandle) { }); // XXX-dap write that this fixes omicron#8971. - while let Err(error) = - datastore.nexus_metadata_update_quiesced(my_nexus_id).await - { - // XXX-dap log and sleep - let _ = tokio::time::sleep(POLL_TIMEOUT).await; + loop { + match datastore.database_nexus_access_update_quiesced(my_nexus_id).await + { + Ok(count) => { + // XXX-dap log and proceed + break; + } + Err(error) => { + // XXX-dap log, sleep and try again + tokio::time::sleep(POLL_TIMEOUT).await; + } + } } quiesce.state.send_modify(|q| { diff --git a/nexus/types/src/quiesce.rs b/nexus/types/src/quiesce.rs index b336cd037ac..4ce25f2ae0c 100644 --- a/nexus/types/src/quiesce.rs +++ b/nexus/types/src/quiesce.rs @@ -285,10 +285,36 @@ impl SagaQuiesceHandle { /// /// Note that new sagas can still be assigned to this Nexus, resulting in it /// no longer being fully drained. - pub async fn wait_for_drained(&self) { + #[cfg(test)] + async fn wait_for_drained(&self) { let _ = self.inner.subscribe().wait_for(|q| q.is_fully_drained()).await; } + /// Wait for sagas to become drained as of this blueprint id + /// + /// This is cancel-safe. + /// + /// It's possible this will never happen. This is generally to be invoked + /// with a timeout or in a select on some other conditions. + /// + /// Note that when this returns, `self.fully_drained_blueprint()` could + /// already be a different blueprint. + pub async fn wait_for_drained_blueprint( + &self, + blueprint_id: BlueprintUuid, + ) { + let _ = self + .inner + .subscribe() + .wait_for(|q| { + matches!( + q.drained_blueprint_id, + Some(id) if id == blueprint_id + ) + }) + .await; + } + /// Wait for the initial determination to be made about whether sagas are /// allowed or not. pub async fn wait_for_determination(&self) { From 8dce7c1bde001af97f48a4692ea353728481e02e Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 5 Sep 2025 20:07:05 -0700 Subject: [PATCH 03/11] pass OpContext through usefully --- .../src/db/datastore/db_metadata.rs | 16 +++++--- .../background/tasks/blueprint_execution.rs | 2 +- .../app/background/tasks/blueprint_planner.rs | 3 +- nexus/src/app/mod.rs | 9 ++++- nexus/src/app/quiesce.rs | 40 +++++++++++++------ 5 files changed, 50 insertions(+), 20 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index dd982ee1388..559533d5fcb 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -688,9 +688,9 @@ impl DataStore { /// Returns information about access for all of the given Nexus ids /// /// This set is assumed to be pretty small. - // XXX-dap provide OpContext and do authz check pub async fn database_nexus_access_all( &self, + opctx: &OpContext, nexus_ids: &BTreeSet, ) -> Result, Error> { use nexus_db_schema::schema::db_metadata_nexus::dsl; @@ -702,7 +702,7 @@ impl DataStore { .collect(); dsl::db_metadata_nexus .filter(dsl::nexus_id.eq_any(db_nexus_ids)) - .load_async(&*self.pool_connection_unauthorized().await?) + .load_async(&*self.pool_connection_authorized(opctx).await?) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } @@ -710,6 +710,7 @@ impl DataStore { /// Updates the "last_drained_blueprint_id" for the given Nexus id pub async fn database_nexus_access_update_blueprint( &self, + opctx: &OpContext, nexus_id: OmicronZoneUuid, blueprint_id: Option, ) -> Result { @@ -718,8 +719,7 @@ impl DataStore { let nexus_id = nexus_db_model::to_db_typed_uuid(nexus_id); let blueprint_id = blueprint_id.map(nexus_db_model::to_db_typed_uuid); - // XXX-dap accept OpContext, do authz check - let conn = self.pool_connection_unauthorized().await?; + let conn = self.pool_connection_authorized(opctx).await?; let count = diesel::update(dsl::db_metadata_nexus) .filter(dsl::nexus_id.eq(nexus_id)) // To be conservative, we'll only update this value if the record is @@ -739,10 +739,16 @@ impl DataStore { &self, nexus_id: OmicronZoneUuid, ) -> Result { + // A traditional authz check is not possible here because we've quiesced + // the DataStore, so no further connections are ordinarily available. + // (We use the lower-level pool interface to bypass that.) + let conn = self.pool.claim_quiesced().await?; + + // XXX-dap could use a separate user here who only has this one + // privilege. use nexus_db_schema::schema::db_metadata_nexus::dsl; let nexus_id = nexus_db_model::to_db_typed_uuid(nexus_id); - let conn = self.pool.claim_quiesced().await?; let count = diesel::update(dsl::db_metadata_nexus) .filter(dsl::nexus_id.eq(nexus_id)) .set(dsl::state.eq(DbMetadataNexusState::Quiesced)) diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index edce51ef35b..78d81082348 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -433,10 +433,10 @@ mod test { Activator::new(), dummy_tx, NexusQuiesceHandle::new( - &opctx.log, datastore.clone(), OmicronZoneUuid::new_v4(), blueprint_rx, + opctx.child(BTreeMap::new()), ), ); diff --git a/nexus/src/app/background/tasks/blueprint_planner.rs b/nexus/src/app/background/tasks/blueprint_planner.rs index 236f0964688..1717a3da4ca 100644 --- a/nexus/src/app/background/tasks/blueprint_planner.rs +++ b/nexus/src/app/background/tasks/blueprint_planner.rs @@ -298,6 +298,7 @@ mod test { ReconfiguratorChickenSwitches, ReconfiguratorChickenSwitchesView, }; use omicron_uuid_kinds::OmicronZoneUuid; + use std::collections::BTreeMap; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; @@ -439,10 +440,10 @@ mod test { Activator::new(), dummy_tx, NexusQuiesceHandle::new( - &opctx.log, datastore.clone(), OmicronZoneUuid::new_v4(), rx_loader.clone(), + opctx.child(BTreeMap::new()), ), ); let value = executor.activate(&opctx).await; diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 12436d6bbfa..3bd8f8fd8bd 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -335,11 +335,18 @@ impl Nexus { )); let (blueprint_load_tx, blueprint_load_rx) = watch::channel(None); + let quiesce_log = log.new(o!("component" => "NexusQuiesceHandle")); + let quiesce_opctx = OpContext::for_background( + quiesce_log, + Arc::clone(&authz), + authn::Context::internal_api(), + Arc::clone(&db_datastore) as Arc, + ); let quiesce = NexusQuiesceHandle::new( - &log, db_datastore.clone(), config.deployment.id.into(), blueprint_load_rx, + quiesce_opctx, ); // It's a bit of a red flag to use an unbounded channel. diff --git a/nexus/src/app/quiesce.rs b/nexus/src/app/quiesce.rs index cca58f1f7d4..69415afbb7e 100644 --- a/nexus/src/app/quiesce.rs +++ b/nexus/src/app/quiesce.rs @@ -22,7 +22,6 @@ use omicron_common::api::external::LookupResult; use omicron_common::api::external::UpdateResult; use omicron_uuid_kinds::BlueprintUuid; use omicron_uuid_kinds::OmicronZoneUuid; -use slog::Logger; use std::collections::BTreeSet; use std::sync::Arc; use std::time::Duration; @@ -51,10 +50,10 @@ impl super::Nexus { /// Describes the configuration and state around quiescing Nexus #[derive(Clone)] pub struct NexusQuiesceHandle { - log: Logger, datastore: Arc, my_nexus_id: OmicronZoneUuid, sagas: SagaQuiesceHandle, + quiesce_opctx: Arc, latest_blueprint: watch::Receiver>>, state: watch::Sender, @@ -62,22 +61,22 @@ pub struct NexusQuiesceHandle { impl NexusQuiesceHandle { pub fn new( - log: &Logger, datastore: Arc, my_nexus_id: OmicronZoneUuid, latest_blueprint: watch::Receiver< Option>, >, + quiesce_opctx: OpContext, ) -> NexusQuiesceHandle { - let my_log = log.new(o!("component" => "NexusQuiesceHandle")); - let saga_quiesce_log = log.new(o!("component" => "SagaQuiesceHandle")); + let saga_quiesce_log = + quiesce_opctx.log.new(o!("component" => "SagaQuiesceHandle")); let sagas = SagaQuiesceHandle::new(saga_quiesce_log); let (state, _) = watch::channel(QuiesceState::Undetermined); NexusQuiesceHandle { - log: my_log, datastore, my_nexus_id, sagas, + quiesce_opctx: Arc::new(quiesce_opctx), latest_blueprint, state, } @@ -100,16 +99,17 @@ impl NexusQuiesceHandle { QuiesceState::Running }; + let log = &self.quiesce_opctx.log; let changed = self.state.send_if_modified(|q| { match q { QuiesceState::Undetermined => { - info!(&self.log, "initial state"; "state" => ?new_state); + info!(log, "initial state"; "state" => ?new_state); *q = new_state; true } QuiesceState::Running => { if quiescing { - info!(&self.log, "quiesce starting"); + info!(log, "quiesce starting"); *q = new_state; true } else { @@ -143,6 +143,7 @@ async fn do_quiesce(quiesce: NexusQuiesceHandle) { let saga_quiesce = quiesce.sagas.clone(); let datastore = quiesce.datastore.clone(); let my_nexus_id = quiesce.my_nexus_id; + let quiesce_opctx = &quiesce.quiesce_opctx; assert_matches!( *quiesce.state.borrow(), @@ -234,6 +235,7 @@ async fn do_quiesce(quiesce: NexusQuiesceHandle) { _ => { let try_update = datastore .database_nexus_access_update_blueprint( + quiesce_opctx, my_nexus_id, Some(current_blueprint.id), ) @@ -269,8 +271,9 @@ async fn do_quiesce(quiesce: NexusQuiesceHandle) { }) .collect(); assert!(!other_nexus_ids.is_empty()); - let Ok(other_records): Result, _> = - datastore.database_nexus_access_all(&other_nexus_ids).await + let Ok(other_records): Result, _> = datastore + .database_nexus_access_all(&quiesce_opctx, &other_nexus_ids) + .await else { // XXX-dap log error continue; @@ -475,6 +478,8 @@ mod test { assert!(status.db_claims.is_empty()); } + /// Exercise trivial case of app-level quiesce in an environment with just + /// one Nexus #[nexus_test(server = crate::Server)] async fn test_quiesce_easy(cptestctx: &ControlPlaneTestContext) { let log = &cptestctx.logctx.log; @@ -485,8 +490,8 @@ mod test { let nexus_client = nexus_client::Client::new(&nexus_internal_url, log.clone()); - // If we quiesce Nexus while it's not doing anything, that should - // complete quickly. + // If we quiesce the only Nexus while it's not doing anything, that + // should complete quickly. let before = Utc::now(); let _ = nexus_client .quiesce_start() @@ -498,6 +503,8 @@ mod test { verify_quiesced(before, after, rv); } + /// Exercise non-trivial app-level quiesce in an environment with just one + /// Nexus #[nexus_test(server = crate::Server)] async fn test_quiesce_full(cptestctx: &ControlPlaneTestContext) { let log = &cptestctx.logctx.log; @@ -659,4 +666,13 @@ mod test { StatusCode::SERVICE_UNAVAILABLE ); } + + /// Test Nexus quiesce with multiple different Nexus instances + /// + /// Unlike the tests above, this is not an "app-level" test. There's not a + /// full Nexus here. We're testing with just a `NexusQuiesceHandle` and the + /// few things that it requires (e.g., the datastore). + #[tokio::test] + async fn test_quiesce_multi() { + } } From 7ad5d5577fb37dbfd0bd7284a8d02d8512c45ccc Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 5 Sep 2025 21:02:56 -0700 Subject: [PATCH 04/11] start fixing/writing tests --- nexus/reconfigurator/planning/src/example.rs | 3 +- nexus/src/app/quiesce.rs | 203 +++++++++++++++++++ 2 files changed, 204 insertions(+), 2 deletions(-) diff --git a/nexus/reconfigurator/planning/src/example.rs b/nexus/reconfigurator/planning/src/example.rs index ce793980b2c..7e2737348ca 100644 --- a/nexus/reconfigurator/planning/src/example.rs +++ b/nexus/reconfigurator/planning/src/example.rs @@ -157,8 +157,7 @@ pub struct ExampleSystem { pub system: SystemDescription, pub input: PlanningInput, pub collection: Collection, - /// The initial blueprint that was used to describe the system. This - /// blueprint has sleds but no zones. + /// The initial blueprint that was used to describe the system. pub initial_blueprint: Blueprint, } diff --git a/nexus/src/app/quiesce.rs b/nexus/src/app/quiesce.rs index 69415afbb7e..c595ebaa1f8 100644 --- a/nexus/src/app/quiesce.rs +++ b/nexus/src/app/quiesce.rs @@ -396,6 +396,8 @@ async fn do_quiesce(quiesce: NexusQuiesceHandle) { #[cfg(test)] mod test { + use crate::app::quiesce::NexusQuiesceHandle; + use crate::app::sagas::test_helpers::test_opctx; use assert_matches::assert_matches; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::DateTime; @@ -405,12 +407,23 @@ mod test { use http::StatusCode; use nexus_client::types::QuiesceState; use nexus_client::types::QuiesceStatus; + use nexus_db_model::DbMetadataNexusState; use nexus_test_interface::NexusServer; + use nexus_test_utils::db::TestDatabase; use nexus_test_utils_macros::nexus_test; + use nexus_types::deployment::BlueprintTarget; + use nexus_types::deployment::BlueprintTargetSet; + use nexus_types::deployment::BlueprintZoneDisposition; + use nexus_types::quiesce::SagaReassignmentDone; use omicron_test_utils::dev::poll::CondCheckError; use omicron_test_utils::dev::poll::wait_for_condition; + use omicron_test_utils::dev::test_setup_log; use slog::Logger; + use std::collections::BTreeMap; + use std::collections::BTreeSet; + use std::sync::Arc; use std::time::Duration; + use tokio::sync::watch; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; @@ -478,6 +491,25 @@ mod test { assert!(status.db_claims.is_empty()); } + async fn enable_blueprint_execution(cptestctx: &ControlPlaneTestContext) { + let opctx = test_opctx(&cptestctx); + let nexus = &cptestctx.server.server_context().nexus; + nexus + .blueprint_target_set_enabled( + &opctx, + BlueprintTargetSet { + target_id: nexus + .blueprint_target_view(&opctx) + .await + .expect("current blueprint target") + .target_id, + enabled: true, + }, + ) + .await + .expect("enable blueprint execution"); + } + /// Exercise trivial case of app-level quiesce in an environment with just /// one Nexus #[nexus_test(server = crate::Server)] @@ -490,6 +522,10 @@ mod test { let nexus_client = nexus_client::Client::new(&nexus_internal_url, log.clone()); + // We need to enable blueprint execution in order to complete a saga + // assignment pass, which is required for quiescing to work. + enable_blueprint_execution(&cptestctx).await; + // If we quiesce the only Nexus while it's not doing anything, that // should complete quickly. let before = Utc::now(); @@ -515,6 +551,10 @@ mod test { let nexus_client = nexus_client::Client::new(&nexus_internal_url, log.clone()); + // We need to enable blueprint execution in order to complete a saga + // assignment pass, which is required for quiescing to work. + enable_blueprint_execution(&cptestctx).await; + // Kick off a demo saga that will block quiescing. let demo_saga = nexus_client .saga_demo_create() @@ -674,5 +714,168 @@ mod test { /// few things that it requires (e.g., the datastore). #[tokio::test] async fn test_quiesce_multi() { + use nexus_types::internal_api::views::QuiesceState; + + let logctx = test_setup_log("test_quiesce_multi"); + let testdb = TestDatabase::new_with_datastore(&logctx.log).await; + let opctx = testdb.opctx(); + let datastore = testdb.datastore(); + + // Set up. We need: + // + // - a datastore + // - a blueprint that includes several Nexus instances + // - a watch Receiver to provide this blueprint to the + // NexusQuiesceHandle + // - the datastore needs to be initialized with the Nexus access records + // for the Nexus instances in the blueprint + + let (_collection, _input, blueprint) = + nexus_reconfigurator_planning::example::example( + &logctx.log, + "test_quiesce_multi", + ); + let nexus_ids = blueprint + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter_map(|(_sled_id, z)| z.zone_type.is_nexus().then_some(z.id)) + .collect::>(); + // The example system creates three sleds. Each sled gets a Nexus zone. + assert_eq!( + nexus_ids.len(), + 3, + "Example system configuration has been changed. \ + Please update this test." + ); + + let mut nexus_id_iter = nexus_ids.iter(); + let nexus_id1 = *nexus_id_iter.next().expect("at least one Nexus id"); + let nexus_id2 = *nexus_id_iter.next().expect("at least two Nexus ids"); + let nexus_id3 = + *nexus_id_iter.next().expect("at least three Nexus ids"); + + // Fake up enough of what the blueprint loader produces. + let bp_target = BlueprintTarget { + target_id: blueprint.id, + enabled: false, + time_made_target: Utc::now(), + }; + let blueprint_id = blueprint.id; + let (_, blueprint_rx) = + watch::channel(Some(Arc::new((bp_target, blueprint)))); + + // Insert active records for the Nexus instances. + let conn = + datastore.pool_connection_for_tests().await.expect("db conn"); + datastore + .initialize_nexus_access_from_blueprint_on_connection( + &conn, + nexus_ids.into_iter().collect(), + ) + .await + .expect("initialize Nexus database access records"); + drop(conn); + + // Initialize our quiesce handles. + let nexus_qq1 = NexusQuiesceHandle::new( + datastore.clone(), + nexus_id1, + blueprint_rx.clone(), + opctx.child(BTreeMap::new()), + ); + let nexus_qq2 = NexusQuiesceHandle::new( + datastore.clone(), + nexus_id2, + blueprint_rx.clone(), + opctx.child(BTreeMap::new()), + ); + let nexus_qq3 = NexusQuiesceHandle::new( + datastore.clone(), + nexus_id3, + blueprint_rx.clone(), + opctx.child(BTreeMap::new()), + ); + + let handles = [&nexus_qq1, &nexus_qq2, &nexus_qq3]; + + // First, they're not quiescing. + for qq in &handles { + let state = qq.state(); + assert_matches!(state, QuiesceState::Undetermined); + } + + // Mark them all not-quiescing. + for qq in &handles { + qq.set_quiescing(false); + let state = qq.state(); + assert_matches!(state, QuiesceState::Running); + } + + // XXX-dap start a saga in one of them. Then verify they stay draining + // for a while. + + for qq in &handles { + // Have them all pretend to have finished a reassignment pass for our + // blueprint. + qq.sagas() + .reassign_sagas(async || { + ( + (), + SagaReassignmentDone::ReassignedAllAsOf( + blueprint_id, + false, + ), + ) + }) + .await; + + // Have them all pretend to do saga recovery, too. + qq.sagas().recover(async |_| ((), true)).await; + } + + // Start quiescing them all. + for qq in &handles { + qq.set_quiescing(true); + let state = qq.state(); + assert_matches!(state, QuiesceState::DrainingSagas { .. }); + } + + // They should all proceed to quiesce, but asynchronously. + wait_for_condition( + || async { + for qq in &handles { + if let QuiesceState::Quiesced { .. } = qq.state() { + // XXX-dap TODO-cleanup + continue; + } + + return Err(CondCheckError::<()>::NotYet); + } + + Ok(()) + }, + &Duration::from_millis(250), + &Duration::from_secs(15), + ) + .await + .expect("did not quiesce within timeout"); + + // Their records now should all say that they're quiesced. + // XXX-dap cannot verify this because the datastore is quiesced! + // XXX-dap this highlights that all three handles are quiescing the same + // datastore, which is not right! + // let records = datastore + // .database_nexus_access_all( + // &opctx, + // &BTreeSet::from([nexus_id1, nexus_id2, nexus_id3]), + // ) + // .await + // .expect("reading access records"); + // assert_eq!(records.len(), 3); + // assert!( + // records.iter().all(|r| r.state() == DbMetadataNexusState::Quiesced) + // ); + + testdb.terminate().await; + logctx.cleanup_successful(); } } From 40f75aca807576c39e733f2a7ebb144818640b0f Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 8 Sep 2025 14:58:01 -0700 Subject: [PATCH 05/11] fix test --- nexus/db-queries/src/db/pub_test_utils/mod.rs | 5 + nexus/src/app/quiesce.rs | 115 +++++++++++------- nexus/types/src/quiesce.rs | 1 + 3 files changed, 78 insertions(+), 43 deletions(-) diff --git a/nexus/db-queries/src/db/pub_test_utils/mod.rs b/nexus/db-queries/src/db/pub_test_utils/mod.rs index db2acabd06a..49cf5a61eeb 100644 --- a/nexus/db-queries/src/db/pub_test_utils/mod.rs +++ b/nexus/db-queries/src/db/pub_test_utils/mod.rs @@ -239,6 +239,11 @@ impl TestDatabase { } } + pub async fn extra_datastore(&self, log: &Logger) -> Arc { + let pool = new_pool(log, &self.db); + Arc::new(DataStore::new(&log, pool, None).await.unwrap()) + } + pub fn opctx(&self) -> &OpContext { match &self.kind { TestKind::NoPool diff --git a/nexus/src/app/quiesce.rs b/nexus/src/app/quiesce.rs index c595ebaa1f8..315610ffefb 100644 --- a/nexus/src/app/quiesce.rs +++ b/nexus/src/app/quiesce.rs @@ -303,9 +303,6 @@ async fn do_quiesce(quiesce: NexusQuiesceHandle) { } // We're ready to hand off. - // - // XXX-dap write down that this fixes oxidecomputer/omicron#8859, - // oxidecomputer/omicron#8857, and oxidecomputer/omicron#8796 quiesce.state.send_modify(|q| { let QuiesceState::DrainingSagas { time_requested, @@ -345,7 +342,6 @@ async fn do_quiesce(quiesce: NexusQuiesceHandle) { }; }); - // XXX-dap write that this fixes omicron#8971. loop { match datastore.database_nexus_access_update_quiesced(my_nexus_id).await { @@ -424,6 +420,7 @@ mod test { use std::sync::Arc; use std::time::Duration; use tokio::sync::watch; + use uuid::Uuid; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; @@ -717,7 +714,8 @@ mod test { use nexus_types::internal_api::views::QuiesceState; let logctx = test_setup_log("test_quiesce_multi"); - let testdb = TestDatabase::new_with_datastore(&logctx.log).await; + let log = &logctx.log; + let testdb = TestDatabase::new_with_datastore(log).await; let opctx = testdb.opctx(); let datastore = testdb.datastore(); @@ -732,7 +730,7 @@ mod test { let (_collection, _input, blueprint) = nexus_reconfigurator_planning::example::example( - &logctx.log, + log, "test_quiesce_multi", ); let nexus_ids = blueprint @@ -776,20 +774,23 @@ mod test { drop(conn); // Initialize our quiesce handles. + // + // Each needs its own DataStore, backed by its own Pool, in order to + // behave like three different Nexus instances. let nexus_qq1 = NexusQuiesceHandle::new( - datastore.clone(), + testdb.extra_datastore(log).await, nexus_id1, blueprint_rx.clone(), opctx.child(BTreeMap::new()), ); let nexus_qq2 = NexusQuiesceHandle::new( - datastore.clone(), + testdb.extra_datastore(log).await, nexus_id2, blueprint_rx.clone(), opctx.child(BTreeMap::new()), ); let nexus_qq3 = NexusQuiesceHandle::new( - datastore.clone(), + testdb.extra_datastore(log).await, nexus_id3, blueprint_rx.clone(), opctx.child(BTreeMap::new()), @@ -797,26 +798,27 @@ mod test { let handles = [&nexus_qq1, &nexus_qq2, &nexus_qq3]; - // First, they're not quiescing. + // Verify that at start, each handle's quiesce state is undetermined. + // This is tested more exhaustively elsewhere (in the SagaQuiesceHandle + // tests). We're just checking our assumption. for qq in &handles { let state = qq.state(); assert_matches!(state, QuiesceState::Undetermined); } - // Mark them all not-quiescing. + // Mark each handle as not-quiescing. for qq in &handles { qq.set_quiescing(false); let state = qq.state(); assert_matches!(state, QuiesceState::Running); } - // XXX-dap start a saga in one of them. Then verify they stay draining - // for a while. - + // In order to quiesce, each handle will need to have completed a + // re-assignment pass for our blueprint and also one saga recovery pass. + // Do both of those now. for qq in &handles { - // Have them all pretend to have finished a reassignment pass for our - // blueprint. - qq.sagas() + let sagas = qq.sagas(); + sagas .reassign_sagas(async || { ( (), @@ -827,27 +829,53 @@ mod test { ) }) .await; - - // Have them all pretend to do saga recovery, too. - qq.sagas().recover(async |_| ((), true)).await; + sagas.recover(async |_| ((), true)).await; } - // Start quiescing them all. + // Before we actually start quiescing, create a saga in one of these + // handles. + let saga_ref = nexus_qq1 + .sagas() + .saga_create( + steno::SagaId(Uuid::new_v4()), + &steno::SagaName::new("test-saga"), + ) + .expect("create saga while not quiesced"); + + // Now, start quiescing them all. for qq in &handles { qq.set_quiescing(true); let state = qq.state(); assert_matches!(state, QuiesceState::DrainingSagas { .. }); } - // They should all proceed to quiesce, but asynchronously. + // Importantly, *none* of these handles should quiesce while there's a + // saga running in any of them. It's hard to verify a negative. We'll + // wait a little while and make sure the state hasn't changed. + let _ = tokio::time::sleep(Duration::from_secs(10)).await; + for qq in &handles { + assert_matches!(qq.state(), QuiesceState::DrainingSagas { .. }); + } + let records = datastore + .database_nexus_access_all( + &opctx, + &BTreeSet::from([nexus_id1, nexus_id2, nexus_id3]), + ) + .await + .expect("reading access records"); + assert_eq!(records.len(), 3); + assert!( + records.iter().all(|r| r.state() == DbMetadataNexusState::Active) + ); + + // Now finish that saga. All three handles should quiesce. + drop(saga_ref); wait_for_condition( || async { - for qq in &handles { - if let QuiesceState::Quiesced { .. } = qq.state() { - // XXX-dap TODO-cleanup - continue; - } - + if !handles + .iter() + .all(|q| matches!(q.state(), QuiesceState::Quiesced { .. })) + { return Err(CondCheckError::<()>::NotYet); } @@ -859,21 +887,22 @@ mod test { .await .expect("did not quiesce within timeout"); - // Their records now should all say that they're quiesced. - // XXX-dap cannot verify this because the datastore is quiesced! - // XXX-dap this highlights that all three handles are quiescing the same - // datastore, which is not right! - // let records = datastore - // .database_nexus_access_all( - // &opctx, - // &BTreeSet::from([nexus_id1, nexus_id2, nexus_id3]), - // ) - // .await - // .expect("reading access records"); - // assert_eq!(records.len(), 3); - // assert!( - // records.iter().all(|r| r.state() == DbMetadataNexusState::Quiesced) - // ); + // Each "Nexus" record should say that it's quiesced. + // + // Note that we're using a different datastore (backed by the same + // CockroachDB instance) than the quiesce handles are. That's important + // since their datastores are all quiesced! + let records = datastore + .database_nexus_access_all( + &opctx, + &BTreeSet::from([nexus_id1, nexus_id2, nexus_id3]), + ) + .await + .expect("reading access records"); + assert_eq!(records.len(), 3); + assert!( + records.iter().all(|r| r.state() == DbMetadataNexusState::Quiesced) + ); testdb.terminate().await; logctx.cleanup_successful(); diff --git a/nexus/types/src/quiesce.rs b/nexus/types/src/quiesce.rs index 4ce25f2ae0c..94b98a38dad 100644 --- a/nexus/types/src/quiesce.rs +++ b/nexus/types/src/quiesce.rs @@ -787,6 +787,7 @@ impl Drop for NewlyPendingSagaRef { not recorded, and the saga is not still pending" ) }); + q.latch_blueprint_if_drained(); }); } } From b57236bc4063b96a525aa37bfcf552a24632ac8f Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 12 Sep 2025 12:42:57 -0700 Subject: [PATCH 06/11] WIP: live test for handoff --- dev-tools/reconfigurator-cli/src/lib.rs | 13 +- live-tests/tests/common/reconfigurator.rs | 171 ++++++-- live-tests/tests/test_nexus_add_remove.rs | 115 +++-- live-tests/tests/test_nexus_handoff.rs | 415 ++++++++++++++++++ .../src/db/datastore/db_metadata.rs | 75 +++- nexus/db-queries/src/db/datastore/vpc.rs | 2 + .../reconfigurator/execution/src/database.rs | 36 +- nexus/reconfigurator/execution/src/dns.rs | 1 + nexus/reconfigurator/execution/src/lib.rs | 29 +- .../planning/src/blueprint_builder/builder.rs | 9 +- nexus/reconfigurator/planning/src/example.rs | 1 + nexus/reconfigurator/planning/src/planner.rs | 7 +- 12 files changed, 758 insertions(+), 116 deletions(-) create mode 100644 live-tests/tests/test_nexus_handoff.rs diff --git a/dev-tools/reconfigurator-cli/src/lib.rs b/dev-tools/reconfigurator-cli/src/lib.rs index ceb7e0feba6..93f713e5b49 100644 --- a/dev-tools/reconfigurator-cli/src/lib.rs +++ b/dev-tools/reconfigurator-cli/src/lib.rs @@ -695,6 +695,11 @@ enum BlueprintEditCommands { AddNexus { /// sled on which to deploy the new instance sled_id: SledOpt, + + // XXX-dap fix tests + /// generation of the new Nexus instance + nexus_generation: Generation, + /// image source for the new zone /// /// The image source is required if the planning input of the system @@ -2129,7 +2134,11 @@ fn cmd_blueprint_edit( } let label = match args.edit_command { - BlueprintEditCommands::AddNexus { sled_id, image_source } => { + BlueprintEditCommands::AddNexus { + sled_id, + image_source, + nexus_generation, + } => { let sled_id = sled_id.to_sled_id(system.description())?; let image_source = image_source_unwrap_or( image_source, @@ -2137,7 +2146,7 @@ fn cmd_blueprint_edit( ZoneKind::Nexus, )?; builder - .sled_add_zone_nexus(sled_id, image_source) + .sled_add_zone_nexus(sled_id, image_source, nexus_generation) .context("failed to add Nexus zone")?; format!("added Nexus zone to sled {}", sled_id) } diff --git a/live-tests/tests/common/reconfigurator.rs b/live-tests/tests/common/reconfigurator.rs index c34ebf6124f..31fff46f704 100644 --- a/live-tests/tests/common/reconfigurator.rs +++ b/live-tests/tests/common/reconfigurator.rs @@ -4,14 +4,55 @@ //! Helpers common to Reconfigurator tests -use anyhow::{Context, ensure}; -use nexus_client::types::BlueprintTargetSet; +use anyhow::{Context, anyhow, bail, ensure}; +use nexus_client::types::{BackgroundTasksActivateRequest, BlueprintTargetSet}; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; use nexus_reconfigurator_planning::planner::PlannerRng; use nexus_types::deployment::{Blueprint, PlanningInput}; +use nexus_types::external_api::views::SledState; use nexus_types::inventory::Collection; +use omicron_test_utils::dev::poll::{CondCheckError, wait_for_condition}; use omicron_uuid_kinds::GenericUuid; use slog::{debug, info}; +use slog_error_chain::InlineErrorChain; +use std::time::Duration; + +/// Return the current target blueprint +/// +/// Also validates that it's enabled. If an operator has disabled execution, we +/// don't want to proceed with tests. +pub async fn blueprint_load_target_enabled( + log: &slog::Logger, + nexus: &nexus_client::Client, +) -> Result { + // Fetch the current target configuration. + info!(log, "editing current target blueprint"); + let target_blueprint = nexus + .blueprint_target_view() + .await + .context("fetch current target config")? + .into_inner(); + + debug!(log, "found current target blueprint"; + "blueprint_id" => %target_blueprint.target_id + ); + ensure!( + target_blueprint.enabled, + "refusing to operate on a system with target blueprint disabled" + ); + + let blueprint = nexus + .blueprint_view(target_blueprint.target_id.as_untyped_uuid()) + .await + .context("fetch current target blueprint")? + .into_inner(); + debug!(log, "fetched current target blueprint"; + "blueprint_id" => %target_blueprint.target_id + ); + Ok(blueprint) +} /// Modify the system by editing the current target blueprint /// @@ -44,28 +85,7 @@ pub async fn blueprint_edit_current_target( ) -> Result<(Blueprint, Blueprint), anyhow::Error> { // Fetch the current target configuration. info!(log, "editing current target blueprint"); - let target_blueprint = nexus - .blueprint_target_view() - .await - .context("fetch current target config")? - .into_inner(); - debug!(log, "found current target blueprint"; - "blueprint_id" => %target_blueprint.target_id - ); - ensure!( - target_blueprint.enabled, - "refusing to modify a system with target blueprint disabled" - ); - - // Fetch the actual blueprint. - let blueprint1 = nexus - .blueprint_view(target_blueprint.target_id.as_untyped_uuid()) - .await - .context("fetch current target blueprint")? - .into_inner(); - debug!(log, "fetched current target blueprint"; - "blueprint_id" => %target_blueprint.target_id - ); + let blueprint1 = blueprint_load_target_enabled(log, nexus).await?; // Make a new builder based on that blueprint and use `edit_fn` to edit it. let mut builder = BlueprintBuilder::new_based_on( @@ -83,7 +103,7 @@ pub async fn blueprint_edit_current_target( // Assemble the new blueprint, import it, and make it the new target. let blueprint2 = builder.build(); info!(log, "assembled new blueprint based on target"; - "current_target_id" => %target_blueprint.target_id, + "current_target_id" => %blueprint1.id, "new_blueprint_id" => %blueprint2.id, ); nexus @@ -107,3 +127,104 @@ pub async fn blueprint_edit_current_target( Ok((blueprint1, blueprint2)) } + +/// Returns whether the given blueprint's sled configurations appear to be +/// propagated to all sleds. +/// +/// Returns the inventory collection so that the caller can check additional +/// details if wanted. +pub async fn blueprint_sled_configs_propagated( + opctx: &OpContext, + datastore: &DataStore, + blueprint: &Blueprint, +) -> Result { + let log = &opctx.log; + let latest_collection = datastore + .inventory_get_latest_collection(opctx) + .await + .context("fetching latest collection")? + .ok_or_else(|| anyhow!("have no inventory collections"))?; + debug!(log, "got inventory"; "id" => %latest_collection.id); + for (sled_id, sled_config) in &blueprint.sleds { + if sled_config.state != SledState::Active { + continue; + } + + let agent = latest_collection + .sled_agents + .get(sled_id) + .ok_or_else(|| anyhow!("sled {sled_id}: missing inventory"))?; + let reconciled_config = &agent + .last_reconciliation + .as_ref() + .ok_or_else(|| { + anyhow!("sled {sled_id}: missing last_reconciliation") + })? + .last_reconciled_config; + if reconciled_config.generation < sled_config.sled_agent_generation { + bail!( + "sled {sled_id}: last reconciled generation {}, waiting for {}", + reconciled_config.generation, + sled_config.sled_agent_generation + ); + } + } + + Ok(latest_collection) +} + +/// Waits for the given blueprint's sled configurations to appear to be +/// propagated to all sleds. +/// +/// Returns the inventory collection so that the caller can check additional +/// details if wanted. +pub async fn blueprint_wait_sled_configs_propagated( + opctx: &OpContext, + datastore: &DataStore, + blueprint: &Blueprint, + nexus: &nexus_client::Client, + timeout: Duration, +) -> Result { + wait_for_condition( + || async { + match blueprint_sled_configs_propagated(opctx, datastore, blueprint) + .await + { + Ok(collection) => Ok(collection), + Err(error) => { + debug!( + opctx.log, + "blueprint_wait_sled_configs_propagated"; + InlineErrorChain::new(&*error) + ); + + // Activate the inventory collector. + info!(opctx.log, "activating inventory collector"); + nexus + .bgtask_activate(&BackgroundTasksActivateRequest { + bgtask_names: vec![String::from( + "inventory_collection", + )], + }) + .await + .expect("activating inventory background task"); + + // We don't use the variant of `CondCheckError` that carries + // a permanent error, so we need to put a type here. We + // want the resulting error to impl `ToString`, so we need a + // type that impls that. We pick `String`. + Err(CondCheckError::::NotYet) + } + } + }, + &Duration::from_millis(3000), + &timeout, + ) + .await + .map_err(|error| { + anyhow!( + "waiting for blueprint {}'s sled configs to be propagated: {error}", + blueprint.id + ) + }) +} diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs index 8116af7321d..c4705997550 100644 --- a/live-tests/tests/test_nexus_add_remove.rs +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -2,15 +2,16 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -mod common; +pub mod common; +use crate::common::reconfigurator::blueprint_wait_sled_configs_propagated; use anyhow::Context; use common::LiveTestContext; use common::reconfigurator::blueprint_edit_current_target; use futures::TryStreamExt; use live_tests_macros::live_test; -use nexus_client::types::BackgroundTasksActivateRequest; use nexus_client::types::BlueprintTargetSet; +use nexus_client::types::QuiesceState; use nexus_client::types::Saga; use nexus_client::types::SagaState; use nexus_inventory::CollectionBuilder; @@ -20,8 +21,10 @@ use nexus_reconfigurator_planning::planner::PlannerRng; use nexus_reconfigurator_preparation::PlanningInputFromDb; use nexus_sled_agent_shared::inventory::ZoneKind; use nexus_types::deployment::BlueprintZoneDisposition; +use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::PlannerChickenSwitches; use nexus_types::deployment::SledFilter; +use nexus_types::deployment::blueprint_zone_type; use omicron_common::address::NEXUS_INTERNAL_PORT; use omicron_test_utils::dev::poll::CondCheckError; use omicron_test_utils::dev::poll::wait_for_condition; @@ -85,7 +88,7 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { // would only be wrong if there are existing Nexus zones with // different image sources, which would only be true in the middle // of an update. - let image_source = commissioned_sled_ids + let (image_source, nexus_generation) = commissioned_sled_ids .iter() .find_map(|&sled_id| { builder @@ -94,8 +97,17 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { BlueprintZoneDisposition::is_in_service, ) .find_map(|zone| { - if zone.zone_type.is_nexus() { - Some(zone.image_source.clone()) + if let BlueprintZoneType::Nexus( + blueprint_zone_type::Nexus { + nexus_generation, + .. + }, + ) = &zone.zone_type + { + Some(( + zone.image_source.clone(), + nexus_generation, + )) } else { None } @@ -106,7 +118,7 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { )?; builder - .sled_add_zone_nexus(sled_id, image_source) + .sled_add_zone_nexus(sled_id, image_source, *nexus_generation) .context("adding Nexus zone")?; Ok(()) @@ -129,15 +141,27 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { // Wait for the new Nexus zone to show up and be usable. let initial_sagas_list = wait_for_condition( || async { - list_sagas(&new_zone_client).await.map_err(|e| { + let list = list_sagas(&new_zone_client).await.map_err(|e| { debug!(log, "waiting for new Nexus to be available: listing sagas: {e:#}" ); CondCheckError::<()>::NotYet - }) + })?; + debug!(log, "new Nexus: listing sagas: ok"); + + let qq = new_zone_client + .quiesce_get() + .await + .expect("fetching quiesce state from new zone"); + debug!(log, "new Nexus: quiesce state"; "state" => ?qq); + if let QuiesceState::Undetermined = qq.state { + Err(CondCheckError::<()>::NotYet) + } else { + Ok(list) + } }, &Duration::from_millis(50), - &Duration::from_secs(60), + &Duration::from_secs(90), ) .await .expect("new Nexus to be usable"); @@ -217,57 +241,32 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { // For that to happen, inventory must first reflect that the Nexus we // expunged is really gone. Then we must run through another planning // round. - // - // First, kick one Nexus instance's inventory collector. Otherwise, it - // might take a while for the system to notice this zone is gone. Having - // activated the task, it shouldn't take too long to get an inventory - info!(log, "activating inventory collector"); - nexus - .bgtask_activate(&BackgroundTasksActivateRequest { - bgtask_names: vec![String::from("inventory_collection")], - }) - .await - .expect("activating inventory background task"); - let latest_collection = wait_for_condition( - || async { - let latest_collection = datastore - .inventory_get_latest_collection(opctx) - .await - .expect("latest inventory collection") - .expect("have a latest inventory collection"); - debug!(log, "got inventory"; "id" => %latest_collection.id); - let agent = latest_collection.sled_agents.get(&sled_id).expect( - "collection information for the sled we added a Nexus to", - ); - let ledgered_config = agent - .ledgered_sled_config - .as_ref() - .expect("sled should have ledgered config"); - if ledgered_config.zones.iter().any(|z| z.id == new_zone.id) { - debug!(log, "zone still present in ledger"); - return Err(CondCheckError::<()>::NotYet); - } - - let reconciled_config = &agent - .last_reconciliation - .as_ref() - .expect("sled should have reconciled config") - .last_reconciled_config; - if reconciled_config.zones.iter().any(|z| z.id == new_zone.id) { - debug!(log, "zone still present in inventory"); - return Err(CondCheckError::<()>::NotYet); - } - if reconciled_config.generation < expunged_generation { - debug!(log, "sled's reconciled config is too old"); - return Err(CondCheckError::<()>::NotYet); - } - return Ok(latest_collection); - }, - &Duration::from_millis(3000), - &Duration::from_secs(90), + let latest_collection = blueprint_wait_sled_configs_propagated( + opctx, + datastore, + &blueprint3, + nexus, + Duration::from_secs(90), ) .await - .expect("waiting for zone to be gone from inventory"); + .expect("waiting for blueprint3 sled configs"); + // These checks are not necessary, but document our assumptions. + let agent = latest_collection + .sled_agents + .get(&sled_id) + .expect("collection information for the sled we added a Nexus to"); + let ledgered_config = agent + .ledgered_sled_config + .as_ref() + .expect("sled should have ledgered config"); + assert!(!ledgered_config.zones.iter().any(|z| z.id == new_zone.id)); + let reconciled_config = &agent + .last_reconciliation + .as_ref() + .expect("sled should have reconciled config") + .last_reconciled_config; + assert!(!reconciled_config.zones.iter().any(|z| z.id == new_zone.id)); + assert!(reconciled_config.generation >= expunged_generation); // Now run through the planner. info!(log, "running through planner"); diff --git a/live-tests/tests/test_nexus_handoff.rs b/live-tests/tests/test_nexus_handoff.rs new file mode 100644 index 00000000000..40438719068 --- /dev/null +++ b/live-tests/tests/test_nexus_handoff.rs @@ -0,0 +1,415 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +pub mod common; + +use crate::common::reconfigurator::blueprint_load_target_enabled; +use crate::common::reconfigurator::blueprint_wait_sled_configs_propagated; +use anyhow::Context; +use common::LiveTestContext; +use common::reconfigurator::blueprint_edit_current_target; +use live_tests_macros::live_test; +use nexus_client::types::QuiesceState; +use nexus_db_model::DbMetadataNexusState; +use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; +use nexus_reconfigurator_preparation::PlanningInputFromDb; +use nexus_types::deployment::BlueprintZoneDisposition; +use nexus_types::deployment::BlueprintZoneImageSource; +use nexus_types::deployment::BlueprintZoneType; +use nexus_types::deployment::PlannerChickenSwitches; +use nexus_types::deployment::blueprint_zone_type; +use omicron_test_utils::dev::poll::CondCheckError; +use omicron_test_utils::dev::poll::wait_for_condition; +use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_uuid_kinds::SledUuid; +use slog::debug; +use slog::info; +use slog_error_chain::InlineErrorChain; +use std::collections::BTreeMap; +use std::time::Duration; + +#[live_test] +async fn test_nexus_handoff(lc: &LiveTestContext) { + // Test setup + let log = lc.log(); + let opctx = lc.opctx(); + let datastore = lc.datastore(); + let initial_nexus_clients = lc.all_internal_nexus_clients().await.unwrap(); + let nexus = initial_nexus_clients.first().expect("internal Nexus client"); + + // Make sure we're starting from a known-normal state. + // First, we have an enabled target blueprint. + let blueprint1 = blueprint_load_target_enabled(log, nexus) + .await + .expect("loading initial target blueprint"); + // That blueprint should be propagated to all sleds. We wait just a bit + // here to deal with races set up by other tests failing or other ongoing + // activity. + let collection = blueprint_wait_sled_configs_propagated( + opctx, + datastore, + &blueprint1, + nexus, + Duration::from_secs(15), + ) + .await + .expect("verifying initial blueprints' sled configs propagated"); + // Check that there's no Nexus handoff already pending. That means that + // there exist no Nexus zones with a generation newer than the blueprint's + // `nexus_generation`. + let new_zones = blueprint1 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter_map(|(_sled_id, z)| { + let BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { + nexus_generation, + .. + }) = &z.zone_type + else { + return None; + }; + (*nexus_generation > blueprint1.nexus_generation).then_some(z.id) + }) + .collect::>(); + if !new_zones.is_empty() { + panic!( + "handoff in progress! found zones with generation newer than \ + current blueprint generation ({}): {}", + blueprint1.nexus_generation, + new_zones + .into_iter() + .map(|s| s.to_string()) + .collect::>() + .join(", ") + ); + } + + // XXX-dap make sure autoplanner is off + + // Identify the current generation of Nexus zones. + struct CurrentNexusZone<'a> { + sled_id: SledUuid, + image_source: &'a BlueprintZoneImageSource, + cfg: &'a blueprint_zone_type::Nexus, + } + let current_nexus_zones: BTreeMap = blueprint1 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter_map(|(sled_id, z)| { + let BlueprintZoneType::Nexus( + cfg @ blueprint_zone_type::Nexus { nexus_generation, .. }, + ) = &z.zone_type + else { + return None; + }; + (*nexus_generation == blueprint1.nexus_generation).then(|| { + ( + z.id, + CurrentNexusZone { + sled_id: sled_id, + image_source: &z.image_source, + cfg, + }, + ) + }) + }) + .collect(); + assert!( + !current_nexus_zones.is_empty(), + "found no Nexus instances at current nexus generation" + ); + let nexus_clients: BTreeMap<_, _> = current_nexus_zones + .iter() + .map(|(id, current)| { + ( + id, + lc.specific_internal_nexus_client(current.cfg.internal_address), + ) + }) + .collect(); + + // All Nexus instances ought to be running normally, not quiesced. + info!(log, "checking current Nexus status"); + for (id, nexus) in &nexus_clients { + let qq = nexus + .quiesce_get() + .await + .expect("fetching quiesce state") + .into_inner(); + let QuiesceState::Running = qq.state else { + panic!("Nexus {id} is in unexpected quiesce state: {qq:?}"); + }; + } + info!(log, "all test prerequisites complete"); + + // We're finally ready to start the test! + // + // For each zone in the current generation, create a replacement at the next + // generation. + let next_generation = blueprint1.nexus_generation.next(); + let chicken_switches = datastore + .reconfigurator_chicken_switches_get_latest(opctx) + .await + .expect("obtained latest chicken switches") + .map_or_else(PlannerChickenSwitches::default, |cs| { + cs.switches.planner_switches + }); + let planning_input = + PlanningInputFromDb::assemble(opctx, datastore, chicken_switches) + .await + .expect("planning input"); + let (_blueprint1, blueprint2) = blueprint_edit_current_target( + log, + &planning_input, + &collection, + &nexus, + &|builder: &mut BlueprintBuilder| { + for current_nexus in current_nexus_zones.values() { + builder + .sled_add_zone_nexus( + current_nexus.sled_id, + current_nexus.image_source.clone(), + next_generation, + ) + .context("adding Nexus zone")?; + } + Ok(()) + }, + ) + .await + .expect("editing blueprint to add zones"); + info!( + log, + "wrote new target blueprint with new Nexus zones"; + "blueprint_id" => %blueprint2.id + ); + + // Find the new Nexus zones and make clients for them. + let new_nexus_clients = blueprint2 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter_map(|(_sled_id, z)| { + let BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { + nexus_generation, + internal_address, + .. + }) = &z.zone_type + else { + return None; + }; + (*nexus_generation == next_generation).then(|| { + (z.id, lc.specific_internal_nexus_client(*internal_address)) + }) + }) + .collect::>(); + assert_eq!(new_nexus_clients.len(), current_nexus_zones.len()); + + // Wait for the zones to be running. + // (This does not mean that their Nexus instances are running.) + let collection = blueprint_wait_sled_configs_propagated( + opctx, + datastore, + &blueprint2, + nexus, + Duration::from_secs(180), + ) + .await + .expect("three new Nexus zones running"); + info!(log, "blueprint configs propagated"; "blueprint_id" => %blueprint2.id); + + // Check that the db_metadata_nexus records for the new Nexus instances + // exist. + let new_records = datastore + .database_nexus_access_all( + opctx, + &new_nexus_clients.keys().cloned().collect(), + ) + .await + .expect("fetching db_metadata_nexus records"); + assert_eq!(new_records.len(), new_nexus_clients.len()); + assert!( + new_records.iter().all(|r| r.state() == DbMetadataNexusState::NotYet) + ); + + // Create a demo saga that will hold up quiescing. + let demo_nexus = + nexus_clients.values().next().expect("at least one Nexus client"); + let demo_saga = + demo_nexus.saga_demo_create().await.expect("demo saga create"); + info!(log, "created demo saga"; "demo_saga" => ?demo_saga); + + // Now update the target blueprint to trigger a handoff. + let planning_input = + PlanningInputFromDb::assemble(opctx, datastore, chicken_switches) + .await + .expect("planning input"); + let (_blueprint2, blueprint3) = blueprint_edit_current_target( + log, + &planning_input, + &collection, + &nexus, + &|builder: &mut BlueprintBuilder| { + builder.set_nexus_generation(next_generation); + Ok(()) + }, + ) + .await + .expect("editing blueprint to bump nexus_generation"); + info!( + log, + "wrote new target blueprint with new nexus generation"; + "blueprint_id" => %blueprint3.id + ); + + // The old Nexus zones should pretty soon report that they're quiescing, but + // not quiesced because of the one outstanding saga. + for (id, nexus) in &nexus_clients { + wait_for_condition( + || async { + let qq = nexus + .quiesce_get() + .await + .expect("fetching quiesce status") + .into_inner(); + debug!( + log, + "Nexus quiesce state"; + "nexus_id" => %id, + "state" => ?qq.state + ); + match qq.state { + QuiesceState::DrainingSagas { .. } => Ok(()), + _ => Err(CondCheckError::<()>::NotYet), + } + }, + &Duration::from_secs(1), + &Duration::from_secs(90), + ) + .await + .expect("waiting for Nexus to report quiescing"); + + info!(log, "Nexus reports quiescing"; "nexus_id" => %id); + } + + // At this point, we should not yet be able to reach the new Nexus instances + // because they're sitting waiting for the database to be ready for handoff. + for (id, nexus) in &new_nexus_clients { + if let Ok(_) = nexus.quiesce_get().await { + panic!("unexpectedly reached Nexus {id}"); + } + } + info!(log, "new Nexus instances are still not reachable (good)"); + + // Complete the demo saga to unblock quiescing. + demo_nexus + .saga_demo_complete(&demo_saga.demo_saga_id) + .await + .expect("complete demos aga"); + + // Now wait for the old Nexus zones to report being fully quiesced. + for (id, nexus) in &nexus_clients { + wait_for_condition( + || async { + let qq = nexus + .quiesce_get() + .await + .expect("fetching quiesce status") + .into_inner(); + debug!( + log, + "Nexus quiesce state"; + "nexus_id" => %id, + "state" => ?qq.state + ); + match qq.state { + QuiesceState::Quiesced { .. } => Ok(()), + _ => Err(CondCheckError::<()>::NotYet), + } + }, + &Duration::from_secs(1), + &Duration::from_secs(90), + ) + .await + .expect("waiting for old Nexus to report quiesced"); + + info!(log, "Nexus reports quiesced"; "nexus_id" => %id); + } + info!(log, "all old Nexus instances report quiescing!"); + + // Now wait for the new Nexus zones to report being online. + for (id, nexus) in &new_nexus_clients { + wait_for_condition( + || async { + match nexus.quiesce_get().await { + Ok(qq) => { + debug!( + log, + "new Nexus quiesce state"; + "state" => ?qq.state + ); + match qq.state { + QuiesceState::Undetermined => { + Err(CondCheckError::<()>::NotYet) + } + QuiesceState::Running => Ok(()), + _ => panic!("unexpected new Nexus quiesce state"), + } + } + Err(error) => { + debug!( + log, + "error fetching new Nexus quiesce state"; + InlineErrorChain::new(&error), + ); + Err(CondCheckError::NotYet) + } + } + }, + &Duration::from_secs(1), + &Duration::from_secs(90), + ) + .await + .expect("waiting for old Nexus to report quiesced"); + info!(log, "new Nexus reports running"; "id" => %id); + } + info!(log, "all new Nexus instances report running!"); + + // Clean up: expunge the old Nexus instances. + let planning_input = + PlanningInputFromDb::assemble(opctx, datastore, chicken_switches) + .await + .expect("planning input"); + let new_nexus = + new_nexus_clients.values().next().expect("one new Nexus client"); + let (_blueprint3, blueprint4) = blueprint_edit_current_target( + log, + &planning_input, + &collection, + new_nexus, + &|builder: &mut BlueprintBuilder| { + for (id, current_zone) in ¤t_nexus_zones { + builder + .sled_expunge_zone(current_zone.sled_id, *id) + .context("expunging zone")?; + } + + Ok(()) + }, + ) + .await + .expect("editing blueprint to expunge old Nexus zones"); + info!( + log, + "wrote new target blueprint with new nexus generation"; + "blueprint_id" => %blueprint4.id + ); + + // Wait for this to get propagated everywhere. + let _latest_collection = blueprint_wait_sled_configs_propagated( + opctx, + datastore, + &blueprint4, + new_nexus, + Duration::from_secs(90), + ) + .await + .expect("waiting for blueprint4 sled configs"); +} diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index 559533d5fcb..df502f26c14 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -24,7 +24,10 @@ use nexus_db_model::EARLIEST_SUPPORTED_VERSION; use nexus_db_model::SchemaUpgradeStep; use nexus_db_model::SchemaVersion; use nexus_types::deployment::BlueprintZoneDisposition; +use nexus_types::deployment::BlueprintZoneType; +use nexus_types::deployment::blueprint_zone_type; use omicron_common::api::external::Error; +use omicron_common::api::external::Generation; use omicron_uuid_kinds::BlueprintUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; @@ -830,24 +833,36 @@ impl DataStore { &self, opctx: &OpContext, blueprint: &nexus_types::deployment::Blueprint, + active_generation: Generation, ) -> Result<(), Error> { opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; - // TODO: Without https://github.com/oxidecomputer/omicron/pull/8863, we - // treat all Nexuses as active. Some will become "not_yet", depending on - // the Nexus Generation, once it exists. - let active_nexus_zones = blueprint + let new_nexuses: Vec<_> = blueprint .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .filter_map(|(_sled, zone_cfg)| { - if zone_cfg.zone_type.is_nexus() { - Some(zone_cfg) - } else { + .filter_map(|(_sled, z)| { + let BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { + nexus_generation, + .. + }) = &z.zone_type + else { + return None; + }; + + if *nexus_generation < active_generation { None + } else if *nexus_generation == active_generation { + Some(DbMetadataNexus::new( + z.id, + DbMetadataNexusState::Active, + )) + } else { + Some(DbMetadataNexus::new( + z.id, + DbMetadataNexusState::NotYet, + )) } - }); - let new_nexuses = active_nexus_zones - .map(|z| DbMetadataNexus::new(z.id, DbMetadataNexusState::Active)) - .collect::>(); + }) + .collect(); let conn = &*self.pool_connection_authorized(&opctx).await?; self.transaction_if_current_blueprint_is( @@ -2192,7 +2207,11 @@ mod test { // Create nexus access records datastore - .database_nexus_access_create(&opctx, &blueprint) + .database_nexus_access_create( + &opctx, + &blueprint, + blueprint.nexus_generation, + ) .await .expect("Failed to create nexus access"); @@ -2260,7 +2279,11 @@ mod test { // Create nexus access records (first time) datastore - .database_nexus_access_create(&opctx, &blueprint) + .database_nexus_access_create( + &opctx, + &blueprint, + blueprint.nexus_generation, + ) .await .expect("Failed to create nexus access (first time)"); @@ -2282,7 +2305,11 @@ mod test { // Creating the record again: not an error. datastore - .database_nexus_access_create(&opctx, &blueprint) + .database_nexus_access_create( + &opctx, + &blueprint, + blueprint.nexus_generation, + ) .await .expect("Failed to create nexus access (first time)"); confirm_state(datastore, nexus_id, DbMetadataNexusState::Active).await; @@ -2303,7 +2330,11 @@ mod test { // Create nexus access records another time - should be idempotent, // but should be "on-conflict, ignore". datastore - .database_nexus_access_create(&opctx, &blueprint) + .database_nexus_access_create( + &opctx, + &blueprint, + blueprint.nexus_generation, + ) .await .expect("Failed to create nexus access (second time)"); confirm_state(datastore, nexus_id, DbMetadataNexusState::Quiesced) @@ -2360,7 +2391,11 @@ mod test { // This should fail because the transaction should check if the // blueprint is the current target let result = datastore - .database_nexus_access_create(&opctx, &non_target_blueprint) + .database_nexus_access_create( + &opctx, + &non_target_blueprint, + non_target_blueprint.nexus_generation, + ) .await; assert!( result.is_err(), @@ -2379,7 +2414,11 @@ mod test { // Verify that using the correct target blueprint works datastore - .database_nexus_access_create(&opctx, &target_blueprint) + .database_nexus_access_create( + &opctx, + &target_blueprint, + target_blueprint.nexus_generation, + ) .await .expect( "Creating nexus access with correct blueprint should succeed", diff --git a/nexus/db-queries/src/db/datastore/vpc.rs b/nexus/db-queries/src/db/datastore/vpc.rs index 31f2111a0c8..ca4c0b24d0d 100644 --- a/nexus/db-queries/src/db/datastore/vpc.rs +++ b/nexus/db-queries/src/db/datastore/vpc.rs @@ -3336,6 +3336,7 @@ mod tests { false, Vec::new(), BlueprintZoneImageSource::InstallDataset, + bp0.nexus_generation, ) .expect("added nexus to third sled"); builder.build() @@ -3411,6 +3412,7 @@ mod tests { false, Vec::new(), BlueprintZoneImageSource::InstallDataset, + bp2.nexus_generation, ) .expect("added nexus to third sled"); } diff --git a/nexus/reconfigurator/execution/src/database.rs b/nexus/reconfigurator/execution/src/database.rs index 9652e53ec1a..41691256cf4 100644 --- a/nexus/reconfigurator/execution/src/database.rs +++ b/nexus/reconfigurator/execution/src/database.rs @@ -8,6 +8,10 @@ use anyhow::anyhow; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::deployment::Blueprint; +use nexus_types::deployment::BlueprintZoneDisposition; +use nexus_types::deployment::blueprint_zone_type; +use omicron_uuid_kinds::OmicronZoneUuid; +use nexus_types::deployment::BlueprintZoneType; /// Idempotently ensure that the Nexus records for the zones are populated /// in the database. @@ -15,9 +19,39 @@ pub(crate) async fn deploy_db_metadata_nexus_records( opctx: &OpContext, datastore: &DataStore, blueprint: &Blueprint, + nexus_id: OmicronZoneUuid, ) -> Result<(), anyhow::Error> { + // To determine what state to use for new records, we need to know which is + // the currently active Nexus generation. That is necessarily the + // generation number of the Nexus instance that's doing the execution. + let nexus_generation = blueprint + // XXX-dap add helper for iterating over in-service Nexus zones + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .find_map(|(_sled_id, z)| { + if z.id != nexus_id { + return None; + } + + let BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { + nexus_generation, + .. + }) = &z.zone_type + else { + // This should be impossible. + return None; + }; + + Some(*nexus_generation) + }) + .ok_or_else(|| { + anyhow!( + "did not find nexus generation for current \ + Nexus zone ({nexus_id})" + ) + })?; + datastore - .database_nexus_access_create(opctx, blueprint) + .database_nexus_access_create(opctx, blueprint, nexus_generation) .await .map_err(|err| anyhow!(err))?; Ok(()) diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 1e3cc26ea68..876b6b93e67 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -1557,6 +1557,7 @@ mod test { .sled_add_zone_nexus( sled_id, BlueprintZoneImageSource::InstallDataset, + blueprint.nexus_generation, ) .unwrap(); let blueprint2 = builder.build(); diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index 46070623181..413c651401e 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -204,6 +204,7 @@ pub async fn realize_blueprint( &opctx, datastore, blueprint, + nexus_id, ); register_deploy_sled_configs_step( @@ -405,22 +406,30 @@ fn register_deploy_db_metadata_nexus_records_step<'a>( opctx: &'a OpContext, datastore: &'a DataStore, blueprint: &'a Blueprint, + nexus_id: Option, ) { registrar .new_step( ExecutionStepId::Ensure, "Ensure db_metadata_nexus_state records exist", - async move |_cx| match database::deploy_db_metadata_nexus_records( - opctx, &datastore, &blueprint, - ) - .await - { - Ok(()) => StepSuccess::new(()).into(), - Err(err) => StepWarning::new( - (), - err.context("ensuring db_metadata_nexus_state").to_string(), + async move |_cx| { + let Some(nexus_id) = nexus_id else { + return StepSkipped::new((), "not running as Nexus").into(); + }; + + match database::deploy_db_metadata_nexus_records( + opctx, &datastore, &blueprint, nexus_id, ) - .into(), + .await + { + Ok(()) => StepSuccess::new(()).into(), + Err(err) => StepWarning::new( + (), + err.context("ensuring db_metadata_nexus_state") + .to_string(), + ) + .into(), + } }, ) .register(); diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 90bf09fe1ba..14c6ddb2da8 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -1549,6 +1549,7 @@ impl<'a> BlueprintBuilder<'a> { &mut self, sled_id: SledUuid, image_source: BlueprintZoneImageSource, + nexus_generation: Generation, ) -> Result<(), Error> { // Whether Nexus should use TLS and what the external DNS servers it // should use are currently provided at rack-setup time, and should be @@ -1577,6 +1578,7 @@ impl<'a> BlueprintBuilder<'a> { external_tls, external_dns_servers, image_source, + nexus_generation, ) } @@ -1586,6 +1588,7 @@ impl<'a> BlueprintBuilder<'a> { external_tls: bool, external_dns_servers: Vec, image_source: BlueprintZoneImageSource, + nexus_generation: Generation, ) -> Result<(), Error> { let nexus_id = self.rng.sled_rng(sled_id).next_zone(); let ExternalNetworkingChoice { @@ -1623,7 +1626,7 @@ impl<'a> BlueprintBuilder<'a> { nic, external_tls, external_dns_servers: external_dns_servers.clone(), - nexus_generation: Generation::new(), + nexus_generation, }); let filesystem_pool = self.sled_select_zpool(sled_id, zone_type.kind())?; @@ -3342,6 +3345,7 @@ pub mod test { .map(|sa| sa.sled_id) .expect("no sleds present"), BlueprintZoneImageSource::InstallDataset, + Generation::new(), ) .unwrap_err(); @@ -3444,6 +3448,7 @@ pub mod test { .sled_add_zone_nexus( sled_id, BlueprintZoneImageSource::InstallDataset, + parent.nexus_generation, ) .expect("added nexus zone"); } @@ -3466,6 +3471,7 @@ pub mod test { .sled_add_zone_nexus( sled_id, BlueprintZoneImageSource::InstallDataset, + parent.nexus_generation, ) .expect("added nexus zone"); } @@ -3505,6 +3511,7 @@ pub mod test { .sled_add_zone_nexus( sled_id, BlueprintZoneImageSource::InstallDataset, + parent.nexus_generation, ) .unwrap_err(); diff --git a/nexus/reconfigurator/planning/src/example.rs b/nexus/reconfigurator/planning/src/example.rs index 7e2737348ca..919ab594ad1 100644 --- a/nexus/reconfigurator/planning/src/example.rs +++ b/nexus/reconfigurator/planning/src/example.rs @@ -485,6 +485,7 @@ impl ExampleSystemBuilder { false, vec![], image_source.clone(), + initial_blueprint.nexus_generation, ) .unwrap(); } diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 0731ba5350e..b4a3a19287f 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -1172,7 +1172,12 @@ impl<'a> Planner<'a> { .blueprint .sled_add_zone_external_dns(sled_id, image_source)?, DiscretionaryOmicronZone::Nexus => { - self.blueprint.sled_add_zone_nexus(sled_id, image_source)? + self.blueprint.sled_add_zone_nexus( + sled_id, + image_source, + // XXX-dap + self.blueprint.nexus_generation(), + )? } DiscretionaryOmicronZone::Oximeter => self .blueprint From f7c92b318bed5b56feb2337a386de0609684cc48 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 15 Sep 2025 11:29:01 -0700 Subject: [PATCH 07/11] final timeout was too short due to transient Cockroach issue on a4x2 --- live-tests/tests/test_nexus_handoff.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/live-tests/tests/test_nexus_handoff.rs b/live-tests/tests/test_nexus_handoff.rs index bb63dcee320..0ac7db1c140 100644 --- a/live-tests/tests/test_nexus_handoff.rs +++ b/live-tests/tests/test_nexus_handoff.rs @@ -396,7 +396,7 @@ async fn test_nexus_handoff(lc: &LiveTestContext) { .expect("editing blueprint to expunge old Nexus zones"); info!( log, - "wrote new target blueprint with new nexus generation"; + "wrote new target blueprint with expunged zones"; "blueprint_id" => %blueprint4.id ); @@ -406,7 +406,7 @@ async fn test_nexus_handoff(lc: &LiveTestContext) { datastore, &blueprint4, new_nexus, - Duration::from_secs(90), + Duration::from_secs(120), ) .await .expect("waiting for blueprint4 sled configs"); From d45a9958df75ba2ab71168f88e1333e952b1b489 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 15 Sep 2025 14:04:15 -0700 Subject: [PATCH 08/11] remove stuff that will go into separate PRs --- dev-tools/reconfigurator-cli/src/lib.rs | 13 +- live-tests/tests/common/reconfigurator.rs | 171 ++------ live-tests/tests/test_nexus_add_remove.rs | 115 ++--- live-tests/tests/test_nexus_handoff.rs | 413 ------------------ .../src/db/datastore/db_metadata.rs | 75 +--- nexus/db-queries/src/db/datastore/vpc.rs | 2 - .../reconfigurator/execution/src/database.rs | 36 +- nexus/reconfigurator/execution/src/dns.rs | 1 - nexus/reconfigurator/execution/src/lib.rs | 29 +- .../planning/src/blueprint_builder/builder.rs | 9 +- nexus/reconfigurator/planning/src/example.rs | 4 +- nexus/reconfigurator/planning/src/planner.rs | 7 +- 12 files changed, 118 insertions(+), 757 deletions(-) delete mode 100644 live-tests/tests/test_nexus_handoff.rs diff --git a/dev-tools/reconfigurator-cli/src/lib.rs b/dev-tools/reconfigurator-cli/src/lib.rs index b35a1a72e37..ff472002c58 100644 --- a/dev-tools/reconfigurator-cli/src/lib.rs +++ b/dev-tools/reconfigurator-cli/src/lib.rs @@ -695,11 +695,6 @@ enum BlueprintEditCommands { AddNexus { /// sled on which to deploy the new instance sled_id: SledOpt, - - // XXX-dap fix tests - /// generation of the new Nexus instance - nexus_generation: Generation, - /// image source for the new zone /// /// The image source is required if the planning input of the system @@ -2134,11 +2129,7 @@ fn cmd_blueprint_edit( } let label = match args.edit_command { - BlueprintEditCommands::AddNexus { - sled_id, - image_source, - nexus_generation, - } => { + BlueprintEditCommands::AddNexus { sled_id, image_source } => { let sled_id = sled_id.to_sled_id(system.description())?; let image_source = image_source_unwrap_or( image_source, @@ -2146,7 +2137,7 @@ fn cmd_blueprint_edit( ZoneKind::Nexus, )?; builder - .sled_add_zone_nexus(sled_id, image_source, nexus_generation) + .sled_add_zone_nexus(sled_id, image_source) .context("failed to add Nexus zone")?; format!("added Nexus zone to sled {}", sled_id) } diff --git a/live-tests/tests/common/reconfigurator.rs b/live-tests/tests/common/reconfigurator.rs index 31fff46f704..c34ebf6124f 100644 --- a/live-tests/tests/common/reconfigurator.rs +++ b/live-tests/tests/common/reconfigurator.rs @@ -4,55 +4,14 @@ //! Helpers common to Reconfigurator tests -use anyhow::{Context, anyhow, bail, ensure}; -use nexus_client::types::{BackgroundTasksActivateRequest, BlueprintTargetSet}; -use nexus_db_queries::context::OpContext; -use nexus_db_queries::db::DataStore; +use anyhow::{Context, ensure}; +use nexus_client::types::BlueprintTargetSet; use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; use nexus_reconfigurator_planning::planner::PlannerRng; use nexus_types::deployment::{Blueprint, PlanningInput}; -use nexus_types::external_api::views::SledState; use nexus_types::inventory::Collection; -use omicron_test_utils::dev::poll::{CondCheckError, wait_for_condition}; use omicron_uuid_kinds::GenericUuid; use slog::{debug, info}; -use slog_error_chain::InlineErrorChain; -use std::time::Duration; - -/// Return the current target blueprint -/// -/// Also validates that it's enabled. If an operator has disabled execution, we -/// don't want to proceed with tests. -pub async fn blueprint_load_target_enabled( - log: &slog::Logger, - nexus: &nexus_client::Client, -) -> Result { - // Fetch the current target configuration. - info!(log, "editing current target blueprint"); - let target_blueprint = nexus - .blueprint_target_view() - .await - .context("fetch current target config")? - .into_inner(); - - debug!(log, "found current target blueprint"; - "blueprint_id" => %target_blueprint.target_id - ); - ensure!( - target_blueprint.enabled, - "refusing to operate on a system with target blueprint disabled" - ); - - let blueprint = nexus - .blueprint_view(target_blueprint.target_id.as_untyped_uuid()) - .await - .context("fetch current target blueprint")? - .into_inner(); - debug!(log, "fetched current target blueprint"; - "blueprint_id" => %target_blueprint.target_id - ); - Ok(blueprint) -} /// Modify the system by editing the current target blueprint /// @@ -85,7 +44,28 @@ pub async fn blueprint_edit_current_target( ) -> Result<(Blueprint, Blueprint), anyhow::Error> { // Fetch the current target configuration. info!(log, "editing current target blueprint"); - let blueprint1 = blueprint_load_target_enabled(log, nexus).await?; + let target_blueprint = nexus + .blueprint_target_view() + .await + .context("fetch current target config")? + .into_inner(); + debug!(log, "found current target blueprint"; + "blueprint_id" => %target_blueprint.target_id + ); + ensure!( + target_blueprint.enabled, + "refusing to modify a system with target blueprint disabled" + ); + + // Fetch the actual blueprint. + let blueprint1 = nexus + .blueprint_view(target_blueprint.target_id.as_untyped_uuid()) + .await + .context("fetch current target blueprint")? + .into_inner(); + debug!(log, "fetched current target blueprint"; + "blueprint_id" => %target_blueprint.target_id + ); // Make a new builder based on that blueprint and use `edit_fn` to edit it. let mut builder = BlueprintBuilder::new_based_on( @@ -103,7 +83,7 @@ pub async fn blueprint_edit_current_target( // Assemble the new blueprint, import it, and make it the new target. let blueprint2 = builder.build(); info!(log, "assembled new blueprint based on target"; - "current_target_id" => %blueprint1.id, + "current_target_id" => %target_blueprint.target_id, "new_blueprint_id" => %blueprint2.id, ); nexus @@ -127,104 +107,3 @@ pub async fn blueprint_edit_current_target( Ok((blueprint1, blueprint2)) } - -/// Returns whether the given blueprint's sled configurations appear to be -/// propagated to all sleds. -/// -/// Returns the inventory collection so that the caller can check additional -/// details if wanted. -pub async fn blueprint_sled_configs_propagated( - opctx: &OpContext, - datastore: &DataStore, - blueprint: &Blueprint, -) -> Result { - let log = &opctx.log; - let latest_collection = datastore - .inventory_get_latest_collection(opctx) - .await - .context("fetching latest collection")? - .ok_or_else(|| anyhow!("have no inventory collections"))?; - debug!(log, "got inventory"; "id" => %latest_collection.id); - for (sled_id, sled_config) in &blueprint.sleds { - if sled_config.state != SledState::Active { - continue; - } - - let agent = latest_collection - .sled_agents - .get(sled_id) - .ok_or_else(|| anyhow!("sled {sled_id}: missing inventory"))?; - let reconciled_config = &agent - .last_reconciliation - .as_ref() - .ok_or_else(|| { - anyhow!("sled {sled_id}: missing last_reconciliation") - })? - .last_reconciled_config; - if reconciled_config.generation < sled_config.sled_agent_generation { - bail!( - "sled {sled_id}: last reconciled generation {}, waiting for {}", - reconciled_config.generation, - sled_config.sled_agent_generation - ); - } - } - - Ok(latest_collection) -} - -/// Waits for the given blueprint's sled configurations to appear to be -/// propagated to all sleds. -/// -/// Returns the inventory collection so that the caller can check additional -/// details if wanted. -pub async fn blueprint_wait_sled_configs_propagated( - opctx: &OpContext, - datastore: &DataStore, - blueprint: &Blueprint, - nexus: &nexus_client::Client, - timeout: Duration, -) -> Result { - wait_for_condition( - || async { - match blueprint_sled_configs_propagated(opctx, datastore, blueprint) - .await - { - Ok(collection) => Ok(collection), - Err(error) => { - debug!( - opctx.log, - "blueprint_wait_sled_configs_propagated"; - InlineErrorChain::new(&*error) - ); - - // Activate the inventory collector. - info!(opctx.log, "activating inventory collector"); - nexus - .bgtask_activate(&BackgroundTasksActivateRequest { - bgtask_names: vec![String::from( - "inventory_collection", - )], - }) - .await - .expect("activating inventory background task"); - - // We don't use the variant of `CondCheckError` that carries - // a permanent error, so we need to put a type here. We - // want the resulting error to impl `ToString`, so we need a - // type that impls that. We pick `String`. - Err(CondCheckError::::NotYet) - } - } - }, - &Duration::from_millis(3000), - &timeout, - ) - .await - .map_err(|error| { - anyhow!( - "waiting for blueprint {}'s sled configs to be propagated: {error}", - blueprint.id - ) - }) -} diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs index 2cf7acfb8d1..dd1e97c9bdc 100644 --- a/live-tests/tests/test_nexus_add_remove.rs +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -2,16 +2,15 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -pub mod common; +mod common; -use crate::common::reconfigurator::blueprint_wait_sled_configs_propagated; use anyhow::Context; use common::LiveTestContext; use common::reconfigurator::blueprint_edit_current_target; use futures::TryStreamExt; use live_tests_macros::live_test; +use nexus_client::types::BackgroundTasksActivateRequest; use nexus_client::types::BlueprintTargetSet; -use nexus_client::types::QuiesceState; use nexus_client::types::Saga; use nexus_client::types::SagaState; use nexus_inventory::CollectionBuilder; @@ -21,10 +20,8 @@ use nexus_reconfigurator_planning::planner::PlannerRng; use nexus_reconfigurator_preparation::PlanningInputFromDb; use nexus_sled_agent_shared::inventory::ZoneKind; use nexus_types::deployment::BlueprintZoneDisposition; -use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::PlannerConfig; use nexus_types::deployment::SledFilter; -use nexus_types::deployment::blueprint_zone_type; use omicron_common::address::NEXUS_INTERNAL_PORT; use omicron_test_utils::dev::poll::CondCheckError; use omicron_test_utils::dev::poll::wait_for_condition; @@ -86,7 +83,7 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { // would only be wrong if there are existing Nexus zones with // different image sources, which would only be true in the middle // of an update. - let (image_source, nexus_generation) = commissioned_sled_ids + let image_source = commissioned_sled_ids .iter() .find_map(|&sled_id| { builder @@ -95,17 +92,8 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { BlueprintZoneDisposition::is_in_service, ) .find_map(|zone| { - if let BlueprintZoneType::Nexus( - blueprint_zone_type::Nexus { - nexus_generation, - .. - }, - ) = &zone.zone_type - { - Some(( - zone.image_source.clone(), - nexus_generation, - )) + if zone.zone_type.is_nexus() { + Some(zone.image_source.clone()) } else { None } @@ -116,7 +104,7 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { )?; builder - .sled_add_zone_nexus(sled_id, image_source, *nexus_generation) + .sled_add_zone_nexus(sled_id, image_source) .context("adding Nexus zone")?; Ok(()) @@ -139,27 +127,15 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { // Wait for the new Nexus zone to show up and be usable. let initial_sagas_list = wait_for_condition( || async { - let list = list_sagas(&new_zone_client).await.map_err(|e| { + list_sagas(&new_zone_client).await.map_err(|e| { debug!(log, "waiting for new Nexus to be available: listing sagas: {e:#}" ); CondCheckError::<()>::NotYet - })?; - debug!(log, "new Nexus: listing sagas: ok"); - - let qq = new_zone_client - .quiesce_get() - .await - .expect("fetching quiesce state from new zone"); - debug!(log, "new Nexus: quiesce state"; "state" => ?qq); - if let QuiesceState::Undetermined = qq.state { - Err(CondCheckError::<()>::NotYet) - } else { - Ok(list) - } + }) }, &Duration::from_millis(50), - &Duration::from_secs(90), + &Duration::from_secs(60), ) .await .expect("new Nexus to be usable"); @@ -239,32 +215,57 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { // For that to happen, inventory must first reflect that the Nexus we // expunged is really gone. Then we must run through another planning // round. - let latest_collection = blueprint_wait_sled_configs_propagated( - opctx, - datastore, - &blueprint3, - nexus, - Duration::from_secs(90), + // + // First, kick one Nexus instance's inventory collector. Otherwise, it + // might take a while for the system to notice this zone is gone. Having + // activated the task, it shouldn't take too long to get an inventory + info!(log, "activating inventory collector"); + nexus + .bgtask_activate(&BackgroundTasksActivateRequest { + bgtask_names: vec![String::from("inventory_collection")], + }) + .await + .expect("activating inventory background task"); + let latest_collection = wait_for_condition( + || async { + let latest_collection = datastore + .inventory_get_latest_collection(opctx) + .await + .expect("latest inventory collection") + .expect("have a latest inventory collection"); + debug!(log, "got inventory"; "id" => %latest_collection.id); + let agent = latest_collection.sled_agents.get(&sled_id).expect( + "collection information for the sled we added a Nexus to", + ); + let ledgered_config = agent + .ledgered_sled_config + .as_ref() + .expect("sled should have ledgered config"); + if ledgered_config.zones.iter().any(|z| z.id == new_zone.id) { + debug!(log, "zone still present in ledger"); + return Err(CondCheckError::<()>::NotYet); + } + + let reconciled_config = &agent + .last_reconciliation + .as_ref() + .expect("sled should have reconciled config") + .last_reconciled_config; + if reconciled_config.zones.iter().any(|z| z.id == new_zone.id) { + debug!(log, "zone still present in inventory"); + return Err(CondCheckError::<()>::NotYet); + } + if reconciled_config.generation < expunged_generation { + debug!(log, "sled's reconciled config is too old"); + return Err(CondCheckError::<()>::NotYet); + } + return Ok(latest_collection); + }, + &Duration::from_millis(3000), + &Duration::from_secs(90), ) .await - .expect("waiting for blueprint3 sled configs"); - // These checks are not necessary, but document our assumptions. - let agent = latest_collection - .sled_agents - .get(&sled_id) - .expect("collection information for the sled we added a Nexus to"); - let ledgered_config = agent - .ledgered_sled_config - .as_ref() - .expect("sled should have ledgered config"); - assert!(!ledgered_config.zones.iter().any(|z| z.id == new_zone.id)); - let reconciled_config = &agent - .last_reconciliation - .as_ref() - .expect("sled should have reconciled config") - .last_reconciled_config; - assert!(!reconciled_config.zones.iter().any(|z| z.id == new_zone.id)); - assert!(reconciled_config.generation >= expunged_generation); + .expect("waiting for zone to be gone from inventory"); // Now run through the planner. info!(log, "running through planner"); diff --git a/live-tests/tests/test_nexus_handoff.rs b/live-tests/tests/test_nexus_handoff.rs deleted file mode 100644 index 0ac7db1c140..00000000000 --- a/live-tests/tests/test_nexus_handoff.rs +++ /dev/null @@ -1,413 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -pub mod common; - -use crate::common::reconfigurator::blueprint_load_target_enabled; -use crate::common::reconfigurator::blueprint_wait_sled_configs_propagated; -use anyhow::Context; -use common::LiveTestContext; -use common::reconfigurator::blueprint_edit_current_target; -use live_tests_macros::live_test; -use nexus_client::types::QuiesceState; -use nexus_db_model::DbMetadataNexusState; -use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; -use nexus_reconfigurator_preparation::PlanningInputFromDb; -use nexus_types::deployment::BlueprintZoneDisposition; -use nexus_types::deployment::BlueprintZoneImageSource; -use nexus_types::deployment::BlueprintZoneType; -use nexus_types::deployment::PlannerConfig; -use nexus_types::deployment::blueprint_zone_type; -use omicron_test_utils::dev::poll::CondCheckError; -use omicron_test_utils::dev::poll::wait_for_condition; -use omicron_uuid_kinds::OmicronZoneUuid; -use omicron_uuid_kinds::SledUuid; -use slog::debug; -use slog::info; -use slog_error_chain::InlineErrorChain; -use std::collections::BTreeMap; -use std::time::Duration; - -#[live_test] -async fn test_nexus_handoff(lc: &LiveTestContext) { - // Test setup - let log = lc.log(); - let opctx = lc.opctx(); - let datastore = lc.datastore(); - let initial_nexus_clients = lc.all_internal_nexus_clients().await.unwrap(); - let nexus = initial_nexus_clients.first().expect("internal Nexus client"); - - // Make sure we're starting from a known-normal state. - // First, we have an enabled target blueprint. - let blueprint1 = blueprint_load_target_enabled(log, nexus) - .await - .expect("loading initial target blueprint"); - // That blueprint should be propagated to all sleds. We wait just a bit - // here to deal with races set up by other tests failing or other ongoing - // activity. - let collection = blueprint_wait_sled_configs_propagated( - opctx, - datastore, - &blueprint1, - nexus, - Duration::from_secs(15), - ) - .await - .expect("verifying initial blueprints' sled configs propagated"); - // Check that there's no Nexus handoff already pending. That means that - // there exist no Nexus zones with a generation newer than the blueprint's - // `nexus_generation`. - let new_zones = blueprint1 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .filter_map(|(_sled_id, z)| { - let BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { - nexus_generation, - .. - }) = &z.zone_type - else { - return None; - }; - (*nexus_generation > blueprint1.nexus_generation).then_some(z.id) - }) - .collect::>(); - if !new_zones.is_empty() { - panic!( - "handoff in progress! found zones with generation newer than \ - current blueprint generation ({}): {}", - blueprint1.nexus_generation, - new_zones - .into_iter() - .map(|s| s.to_string()) - .collect::>() - .join(", ") - ); - } - - // XXX-dap make sure autoplanner is off - - // Identify the current generation of Nexus zones. - struct CurrentNexusZone<'a> { - sled_id: SledUuid, - image_source: &'a BlueprintZoneImageSource, - cfg: &'a blueprint_zone_type::Nexus, - } - let current_nexus_zones: BTreeMap = blueprint1 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .filter_map(|(sled_id, z)| { - let BlueprintZoneType::Nexus( - cfg @ blueprint_zone_type::Nexus { nexus_generation, .. }, - ) = &z.zone_type - else { - return None; - }; - (*nexus_generation == blueprint1.nexus_generation).then(|| { - ( - z.id, - CurrentNexusZone { - sled_id: sled_id, - image_source: &z.image_source, - cfg, - }, - ) - }) - }) - .collect(); - assert!( - !current_nexus_zones.is_empty(), - "found no Nexus instances at current nexus generation" - ); - let nexus_clients: BTreeMap<_, _> = current_nexus_zones - .iter() - .map(|(id, current)| { - ( - id, - lc.specific_internal_nexus_client(current.cfg.internal_address), - ) - }) - .collect(); - - // All Nexus instances ought to be running normally, not quiesced. - info!(log, "checking current Nexus status"); - for (id, nexus) in &nexus_clients { - let qq = nexus - .quiesce_get() - .await - .expect("fetching quiesce state") - .into_inner(); - let QuiesceState::Running = qq.state else { - panic!("Nexus {id} is in unexpected quiesce state: {qq:?}"); - }; - } - info!(log, "all test prerequisites complete"); - - // We're finally ready to start the test! - // - // For each zone in the current generation, create a replacement at the next - // generation. - let next_generation = blueprint1.nexus_generation.next(); - let planner_config = datastore - .reconfigurator_config_get_latest(opctx) - .await - .expect("obtained latest reconfigurator config") - .map_or_else(PlannerConfig::default, |cs| cs.config.planner_config); - let planning_input = - PlanningInputFromDb::assemble(opctx, datastore, planner_config) - .await - .expect("planning input"); - let (_blueprint1, blueprint2) = blueprint_edit_current_target( - log, - &planning_input, - &collection, - &nexus, - &|builder: &mut BlueprintBuilder| { - for current_nexus in current_nexus_zones.values() { - builder - .sled_add_zone_nexus( - current_nexus.sled_id, - current_nexus.image_source.clone(), - next_generation, - ) - .context("adding Nexus zone")?; - } - Ok(()) - }, - ) - .await - .expect("editing blueprint to add zones"); - info!( - log, - "wrote new target blueprint with new Nexus zones"; - "blueprint_id" => %blueprint2.id - ); - - // Find the new Nexus zones and make clients for them. - let new_nexus_clients = blueprint2 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .filter_map(|(_sled_id, z)| { - let BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { - nexus_generation, - internal_address, - .. - }) = &z.zone_type - else { - return None; - }; - (*nexus_generation == next_generation).then(|| { - (z.id, lc.specific_internal_nexus_client(*internal_address)) - }) - }) - .collect::>(); - assert_eq!(new_nexus_clients.len(), current_nexus_zones.len()); - - // Wait for the zones to be running. - // (This does not mean that their Nexus instances are running.) - let collection = blueprint_wait_sled_configs_propagated( - opctx, - datastore, - &blueprint2, - nexus, - Duration::from_secs(180), - ) - .await - .expect("three new Nexus zones running"); - info!(log, "blueprint configs propagated"; "blueprint_id" => %blueprint2.id); - - // Check that the db_metadata_nexus records for the new Nexus instances - // exist. - let new_records = datastore - .database_nexus_access_all( - opctx, - &new_nexus_clients.keys().cloned().collect(), - ) - .await - .expect("fetching db_metadata_nexus records"); - assert_eq!(new_records.len(), new_nexus_clients.len()); - assert!( - new_records.iter().all(|r| r.state() == DbMetadataNexusState::NotYet) - ); - - // Create a demo saga that will hold up quiescing. - let demo_nexus = - nexus_clients.values().next().expect("at least one Nexus client"); - let demo_saga = - demo_nexus.saga_demo_create().await.expect("demo saga create"); - info!(log, "created demo saga"; "demo_saga" => ?demo_saga); - - // Now update the target blueprint to trigger a handoff. - let planning_input = - PlanningInputFromDb::assemble(opctx, datastore, planner_config) - .await - .expect("planning input"); - let (_blueprint2, blueprint3) = blueprint_edit_current_target( - log, - &planning_input, - &collection, - &nexus, - &|builder: &mut BlueprintBuilder| { - builder.set_nexus_generation(next_generation); - Ok(()) - }, - ) - .await - .expect("editing blueprint to bump nexus_generation"); - info!( - log, - "wrote new target blueprint with new nexus generation"; - "blueprint_id" => %blueprint3.id - ); - - // The old Nexus zones should pretty soon report that they're quiescing, but - // not quiesced because of the one outstanding saga. - for (id, nexus) in &nexus_clients { - wait_for_condition( - || async { - let qq = nexus - .quiesce_get() - .await - .expect("fetching quiesce status") - .into_inner(); - debug!( - log, - "Nexus quiesce state"; - "nexus_id" => %id, - "state" => ?qq.state - ); - match qq.state { - QuiesceState::DrainingSagas { .. } => Ok(()), - _ => Err(CondCheckError::<()>::NotYet), - } - }, - &Duration::from_secs(1), - &Duration::from_secs(90), - ) - .await - .expect("waiting for Nexus to report quiescing"); - - info!(log, "Nexus reports quiescing"; "nexus_id" => %id); - } - - // At this point, we should not yet be able to reach the new Nexus instances - // because they're sitting waiting for the database to be ready for handoff. - for (id, nexus) in &new_nexus_clients { - if let Ok(_) = nexus.quiesce_get().await { - panic!("unexpectedly reached Nexus {id}"); - } - } - info!(log, "new Nexus instances are still not reachable (good)"); - - // Complete the demo saga to unblock quiescing. - demo_nexus - .saga_demo_complete(&demo_saga.demo_saga_id) - .await - .expect("complete demos aga"); - - // Now wait for the old Nexus zones to report being fully quiesced. - for (id, nexus) in &nexus_clients { - wait_for_condition( - || async { - let qq = nexus - .quiesce_get() - .await - .expect("fetching quiesce status") - .into_inner(); - debug!( - log, - "Nexus quiesce state"; - "nexus_id" => %id, - "state" => ?qq.state - ); - match qq.state { - QuiesceState::Quiesced { .. } => Ok(()), - _ => Err(CondCheckError::<()>::NotYet), - } - }, - &Duration::from_secs(1), - &Duration::from_secs(90), - ) - .await - .expect("waiting for old Nexus to report quiesced"); - - info!(log, "Nexus reports quiesced"; "nexus_id" => %id); - } - info!(log, "all old Nexus instances report quiescing!"); - - // Now wait for the new Nexus zones to report being online. - for (id, nexus) in &new_nexus_clients { - wait_for_condition( - || async { - match nexus.quiesce_get().await { - Ok(qq) => { - debug!( - log, - "new Nexus quiesce state"; - "state" => ?qq.state - ); - match qq.state { - QuiesceState::Undetermined => { - Err(CondCheckError::<()>::NotYet) - } - QuiesceState::Running => Ok(()), - _ => panic!("unexpected new Nexus quiesce state"), - } - } - Err(error) => { - debug!( - log, - "error fetching new Nexus quiesce state"; - InlineErrorChain::new(&error), - ); - Err(CondCheckError::NotYet) - } - } - }, - &Duration::from_secs(1), - &Duration::from_secs(90), - ) - .await - .expect("waiting for old Nexus to report quiesced"); - info!(log, "new Nexus reports running"; "id" => %id); - } - info!(log, "all new Nexus instances report running!"); - - // Clean up: expunge the old Nexus instances. - let planning_input = - PlanningInputFromDb::assemble(opctx, datastore, planner_config) - .await - .expect("planning input"); - let new_nexus = - new_nexus_clients.values().next().expect("one new Nexus client"); - let (_blueprint3, blueprint4) = blueprint_edit_current_target( - log, - &planning_input, - &collection, - new_nexus, - &|builder: &mut BlueprintBuilder| { - for (id, current_zone) in ¤t_nexus_zones { - builder - .sled_expunge_zone(current_zone.sled_id, *id) - .context("expunging zone")?; - } - - Ok(()) - }, - ) - .await - .expect("editing blueprint to expunge old Nexus zones"); - info!( - log, - "wrote new target blueprint with expunged zones"; - "blueprint_id" => %blueprint4.id - ); - - // Wait for this to get propagated everywhere. - let _latest_collection = blueprint_wait_sled_configs_propagated( - opctx, - datastore, - &blueprint4, - new_nexus, - Duration::from_secs(120), - ) - .await - .expect("waiting for blueprint4 sled configs"); -} diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index 50d88960f8e..e7fdf98777a 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -24,10 +24,7 @@ use nexus_db_model::EARLIEST_SUPPORTED_VERSION; use nexus_db_model::SchemaUpgradeStep; use nexus_db_model::SchemaVersion; use nexus_types::deployment::BlueprintZoneDisposition; -use nexus_types::deployment::BlueprintZoneType; -use nexus_types::deployment::blueprint_zone_type; use omicron_common::api::external::Error; -use omicron_common::api::external::Generation; use omicron_uuid_kinds::BlueprintUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; @@ -836,36 +833,24 @@ impl DataStore { &self, opctx: &OpContext, blueprint: &nexus_types::deployment::Blueprint, - active_generation: Generation, ) -> Result<(), Error> { opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; - let new_nexuses: Vec<_> = blueprint + // TODO: Without https://github.com/oxidecomputer/omicron/pull/8863, we + // treat all Nexuses as active. Some will become "not_yet", depending on + // the Nexus Generation, once it exists. + let active_nexus_zones = blueprint .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .filter_map(|(_sled, z)| { - let BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { - nexus_generation, - .. - }) = &z.zone_type - else { - return None; - }; - - if *nexus_generation < active_generation { - None - } else if *nexus_generation == active_generation { - Some(DbMetadataNexus::new( - z.id, - DbMetadataNexusState::Active, - )) + .filter_map(|(_sled, zone_cfg)| { + if zone_cfg.zone_type.is_nexus() { + Some(zone_cfg) } else { - Some(DbMetadataNexus::new( - z.id, - DbMetadataNexusState::NotYet, - )) + None } - }) - .collect(); + }); + let new_nexuses = active_nexus_zones + .map(|z| DbMetadataNexus::new(z.id, DbMetadataNexusState::Active)) + .collect::>(); let conn = &*self.pool_connection_authorized(&opctx).await?; self.transaction_if_current_blueprint_is( @@ -2248,11 +2233,7 @@ mod test { // Create nexus access records datastore - .database_nexus_access_create( - &opctx, - &blueprint, - blueprint.nexus_generation, - ) + .database_nexus_access_create(&opctx, &blueprint) .await .expect("Failed to create nexus access"); @@ -2320,11 +2301,7 @@ mod test { // Create nexus access records (first time) datastore - .database_nexus_access_create( - &opctx, - &blueprint, - blueprint.nexus_generation, - ) + .database_nexus_access_create(&opctx, &blueprint) .await .expect("Failed to create nexus access (first time)"); @@ -2346,11 +2323,7 @@ mod test { // Creating the record again: not an error. datastore - .database_nexus_access_create( - &opctx, - &blueprint, - blueprint.nexus_generation, - ) + .database_nexus_access_create(&opctx, &blueprint) .await .expect("Failed to create nexus access (first time)"); confirm_state(datastore, nexus_id, DbMetadataNexusState::Active).await; @@ -2371,11 +2344,7 @@ mod test { // Create nexus access records another time - should be idempotent, // but should be "on-conflict, ignore". datastore - .database_nexus_access_create( - &opctx, - &blueprint, - blueprint.nexus_generation, - ) + .database_nexus_access_create(&opctx, &blueprint) .await .expect("Failed to create nexus access (second time)"); confirm_state(datastore, nexus_id, DbMetadataNexusState::Quiesced) @@ -2432,11 +2401,7 @@ mod test { // This should fail because the transaction should check if the // blueprint is the current target let result = datastore - .database_nexus_access_create( - &opctx, - &non_target_blueprint, - non_target_blueprint.nexus_generation, - ) + .database_nexus_access_create(&opctx, &non_target_blueprint) .await; assert!( result.is_err(), @@ -2455,11 +2420,7 @@ mod test { // Verify that using the correct target blueprint works datastore - .database_nexus_access_create( - &opctx, - &target_blueprint, - target_blueprint.nexus_generation, - ) + .database_nexus_access_create(&opctx, &target_blueprint) .await .expect( "Creating nexus access with correct blueprint should succeed", diff --git a/nexus/db-queries/src/db/datastore/vpc.rs b/nexus/db-queries/src/db/datastore/vpc.rs index 6ee6058698c..c832d54a4d7 100644 --- a/nexus/db-queries/src/db/datastore/vpc.rs +++ b/nexus/db-queries/src/db/datastore/vpc.rs @@ -3345,7 +3345,6 @@ mod tests { false, Vec::new(), BlueprintZoneImageSource::InstallDataset, - bp0.nexus_generation, ) .expect("added nexus to third sled"); builder.build() @@ -3421,7 +3420,6 @@ mod tests { false, Vec::new(), BlueprintZoneImageSource::InstallDataset, - bp2.nexus_generation, ) .expect("added nexus to third sled"); } diff --git a/nexus/reconfigurator/execution/src/database.rs b/nexus/reconfigurator/execution/src/database.rs index 41691256cf4..9652e53ec1a 100644 --- a/nexus/reconfigurator/execution/src/database.rs +++ b/nexus/reconfigurator/execution/src/database.rs @@ -8,10 +8,6 @@ use anyhow::anyhow; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::deployment::Blueprint; -use nexus_types::deployment::BlueprintZoneDisposition; -use nexus_types::deployment::blueprint_zone_type; -use omicron_uuid_kinds::OmicronZoneUuid; -use nexus_types::deployment::BlueprintZoneType; /// Idempotently ensure that the Nexus records for the zones are populated /// in the database. @@ -19,39 +15,9 @@ pub(crate) async fn deploy_db_metadata_nexus_records( opctx: &OpContext, datastore: &DataStore, blueprint: &Blueprint, - nexus_id: OmicronZoneUuid, ) -> Result<(), anyhow::Error> { - // To determine what state to use for new records, we need to know which is - // the currently active Nexus generation. That is necessarily the - // generation number of the Nexus instance that's doing the execution. - let nexus_generation = blueprint - // XXX-dap add helper for iterating over in-service Nexus zones - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .find_map(|(_sled_id, z)| { - if z.id != nexus_id { - return None; - } - - let BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { - nexus_generation, - .. - }) = &z.zone_type - else { - // This should be impossible. - return None; - }; - - Some(*nexus_generation) - }) - .ok_or_else(|| { - anyhow!( - "did not find nexus generation for current \ - Nexus zone ({nexus_id})" - ) - })?; - datastore - .database_nexus_access_create(opctx, blueprint, nexus_generation) + .database_nexus_access_create(opctx, blueprint) .await .map_err(|err| anyhow!(err))?; Ok(()) diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 9fc69bdee9e..4a259ce3873 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -1557,7 +1557,6 @@ mod test { .sled_add_zone_nexus( sled_id, BlueprintZoneImageSource::InstallDataset, - blueprint.nexus_generation, ) .unwrap(); let blueprint2 = builder.build(); diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index 413c651401e..46070623181 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -204,7 +204,6 @@ pub async fn realize_blueprint( &opctx, datastore, blueprint, - nexus_id, ); register_deploy_sled_configs_step( @@ -406,30 +405,22 @@ fn register_deploy_db_metadata_nexus_records_step<'a>( opctx: &'a OpContext, datastore: &'a DataStore, blueprint: &'a Blueprint, - nexus_id: Option, ) { registrar .new_step( ExecutionStepId::Ensure, "Ensure db_metadata_nexus_state records exist", - async move |_cx| { - let Some(nexus_id) = nexus_id else { - return StepSkipped::new((), "not running as Nexus").into(); - }; - - match database::deploy_db_metadata_nexus_records( - opctx, &datastore, &blueprint, nexus_id, + async move |_cx| match database::deploy_db_metadata_nexus_records( + opctx, &datastore, &blueprint, + ) + .await + { + Ok(()) => StepSuccess::new(()).into(), + Err(err) => StepWarning::new( + (), + err.context("ensuring db_metadata_nexus_state").to_string(), ) - .await - { - Ok(()) => StepSuccess::new(()).into(), - Err(err) => StepWarning::new( - (), - err.context("ensuring db_metadata_nexus_state") - .to_string(), - ) - .into(), - } + .into(), }, ) .register(); diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 14c6ddb2da8..90bf09fe1ba 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -1549,7 +1549,6 @@ impl<'a> BlueprintBuilder<'a> { &mut self, sled_id: SledUuid, image_source: BlueprintZoneImageSource, - nexus_generation: Generation, ) -> Result<(), Error> { // Whether Nexus should use TLS and what the external DNS servers it // should use are currently provided at rack-setup time, and should be @@ -1578,7 +1577,6 @@ impl<'a> BlueprintBuilder<'a> { external_tls, external_dns_servers, image_source, - nexus_generation, ) } @@ -1588,7 +1586,6 @@ impl<'a> BlueprintBuilder<'a> { external_tls: bool, external_dns_servers: Vec, image_source: BlueprintZoneImageSource, - nexus_generation: Generation, ) -> Result<(), Error> { let nexus_id = self.rng.sled_rng(sled_id).next_zone(); let ExternalNetworkingChoice { @@ -1626,7 +1623,7 @@ impl<'a> BlueprintBuilder<'a> { nic, external_tls, external_dns_servers: external_dns_servers.clone(), - nexus_generation, + nexus_generation: Generation::new(), }); let filesystem_pool = self.sled_select_zpool(sled_id, zone_type.kind())?; @@ -3345,7 +3342,6 @@ pub mod test { .map(|sa| sa.sled_id) .expect("no sleds present"), BlueprintZoneImageSource::InstallDataset, - Generation::new(), ) .unwrap_err(); @@ -3448,7 +3444,6 @@ pub mod test { .sled_add_zone_nexus( sled_id, BlueprintZoneImageSource::InstallDataset, - parent.nexus_generation, ) .expect("added nexus zone"); } @@ -3471,7 +3466,6 @@ pub mod test { .sled_add_zone_nexus( sled_id, BlueprintZoneImageSource::InstallDataset, - parent.nexus_generation, ) .expect("added nexus zone"); } @@ -3511,7 +3505,6 @@ pub mod test { .sled_add_zone_nexus( sled_id, BlueprintZoneImageSource::InstallDataset, - parent.nexus_generation, ) .unwrap_err(); diff --git a/nexus/reconfigurator/planning/src/example.rs b/nexus/reconfigurator/planning/src/example.rs index 919ab594ad1..ce793980b2c 100644 --- a/nexus/reconfigurator/planning/src/example.rs +++ b/nexus/reconfigurator/planning/src/example.rs @@ -157,7 +157,8 @@ pub struct ExampleSystem { pub system: SystemDescription, pub input: PlanningInput, pub collection: Collection, - /// The initial blueprint that was used to describe the system. + /// The initial blueprint that was used to describe the system. This + /// blueprint has sleds but no zones. pub initial_blueprint: Blueprint, } @@ -485,7 +486,6 @@ impl ExampleSystemBuilder { false, vec![], image_source.clone(), - initial_blueprint.nexus_generation, ) .unwrap(); } diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 20b896d0c31..a2c526b31e9 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -1172,12 +1172,7 @@ impl<'a> Planner<'a> { self.blueprint.sled_add_zone_external_dns(sled_id, image)? } DiscretionaryOmicronZone::Nexus => { - self.blueprint.sled_add_zone_nexus( - sled_id, - image, - // XXX-dap - self.blueprint.nexus_generation(), - )? + self.blueprint.sled_add_zone_nexus(sled_id, image)? } DiscretionaryOmicronZone::Oximeter => { self.blueprint.sled_add_zone_oximeter(sled_id, image)? From bdf36a797d401c3c7936e45fd46b1df2158e0f40 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 15 Sep 2025 15:12:05 -0700 Subject: [PATCH 09/11] clean up / refactor --- .../src/db/datastore/db_metadata.rs | 2 - nexus/src/app/mod.rs | 2 +- nexus/src/app/quiesce.rs | 363 ++++++++++++------ 3 files changed, 241 insertions(+), 126 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index e7fdf98777a..04404021ae2 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -747,8 +747,6 @@ impl DataStore { // (We use the lower-level pool interface to bypass that.) let conn = self.pool.claim_quiesced().await?; - // XXX-dap could use a separate user here who only has this one - // privilege. use nexus_db_schema::schema::db_metadata_nexus::dsl; let nexus_id = nexus_db_model::to_db_typed_uuid(nexus_id); diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 2e751e0fca6..04b09c5934a 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -347,7 +347,7 @@ impl Nexus { ); let quiesce = NexusQuiesceHandle::new( db_datastore.clone(), - config.deployment.id.into(), + config.deployment.id, blueprint_load_rx, quiesce_opctx, ); diff --git a/nexus/src/app/quiesce.rs b/nexus/src/app/quiesce.rs index 315610ffefb..bd2ad0e15ea 100644 --- a/nexus/src/app/quiesce.rs +++ b/nexus/src/app/quiesce.rs @@ -4,9 +4,9 @@ //! Manage Nexus quiesce state +use anyhow::{Context, anyhow, bail}; use assert_matches::assert_matches; use chrono::Utc; -use nexus_db_model::DbMetadataNexus; use nexus_db_model::DbMetadataNexusState; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; @@ -22,6 +22,8 @@ use omicron_common::api::external::LookupResult; use omicron_common::api::external::UpdateResult; use omicron_uuid_kinds::BlueprintUuid; use omicron_uuid_kinds::OmicronZoneUuid; +use slog::{error, info}; +use slog_error_chain::InlineErrorChain; use std::collections::BTreeSet; use std::sync::Arc; use std::time::Duration; @@ -139,11 +141,13 @@ impl NexusQuiesceHandle { } } -async fn do_quiesce(quiesce: NexusQuiesceHandle) { - let saga_quiesce = quiesce.sagas.clone(); +async fn do_quiesce(mut quiesce: NexusQuiesceHandle) { let datastore = quiesce.datastore.clone(); let my_nexus_id = quiesce.my_nexus_id; - let quiesce_opctx = &quiesce.quiesce_opctx; + + /// minimal timeout used just to avoid tight loops when encountering + /// transient errors + const PAUSE_TIMEOUT: Duration = Duration::from_secs(5); assert_matches!( *quiesce.state.borrow(), @@ -178,131 +182,35 @@ async fn do_quiesce(quiesce: NexusQuiesceHandle) { // safe assumption here. They shouldn't be changing at all at this point // unless reacting to something unrelated to the upgrade (e.g., a sled // expungement), and we are checking awfully frequently, too. - const POLL_TIMEOUT: Duration = Duration::from_secs(1); let mut last_recorded_blueprint_id: Option = None; loop { - // Grab the most recently loaded blueprint. As usual, we clone to avoid - // locking the watch channel for the lifetime of this value. - let Some(current_blueprint): Option = quiesce - .latest_blueprint - .borrow() - .as_deref() - .map(|(_target, blueprint)| blueprint) - .cloned() - else { - // XXX-dap log - // XXX-dap need to sleep here, but we won't. Could we use - // wait_until() it's non-None? - continue; - }; - - // Determine our own Nexus generation number. - // - // This doesn't change across iterations of the loop. But we need a - // blueprint to figure it out, and we don't have one until we're in this - // loop. - let Some(my_generation) = current_blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .find_map(|(_sled_id, zone)| { - if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { - (zone.id == my_nexus_id).then_some(nexus.nexus_generation) - } else { - None - } - }) - else { - // XXX-dap log, and also need to sleep here - continue; - }; - - // Wait up to a second for sagas to become drained as of this blueprint. - let Ok(_) = tokio::time::timeout( - POLL_TIMEOUT, - saga_quiesce.wait_for_drained_blueprint(current_blueprint.id), + match check_all_sagas_drained( + &mut quiesce, + &mut last_recorded_blueprint_id, + PAUSE_TIMEOUT, ) .await - else { - // We're still not drained as of this blueprint. But the - // current target blueprint could have changed. So take another - // lap. - continue; - }; - - // We're drained up through this blueprint. First, update our record to - // reflect that, if we haven't already. - match last_recorded_blueprint_id { - Some(blueprint_id) if blueprint_id == current_blueprint.id => (), - _ => { - let try_update = datastore - .database_nexus_access_update_blueprint( - quiesce_opctx, - my_nexus_id, - Some(current_blueprint.id), - ) - .await; - match try_update { - Err(error) => { - // Take another lap. - // XXX-dap log error - continue; - } - Ok(0) => (), - Ok(count) => { - // XXX-dap log - } - } - - last_recorded_blueprint_id = Some(current_blueprint.id); - } - }; - - // Now, see if everybody else is drained up to the same point, or if any - // of them is already quiesced. (That means *they* already saw that we - // were all drained up to the same point, which is good enough for us to - // proceed.) - let other_nexus_ids: BTreeSet = current_blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .filter_map(|(_sled_id, zone)| { - if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { - (nexus.nexus_generation == my_generation).then_some(zone.id) - } else { - None - } - }) - .collect(); - assert!(!other_nexus_ids.is_empty()); - let Ok(other_records): Result, _> = datastore - .database_nexus_access_all(&quiesce_opctx, &other_nexus_ids) - .await - else { - // XXX-dap log error - continue; - }; - - if other_records - .iter() - .any(|r| r.state() == DbMetadataNexusState::Quiesced) { - // XXX-dap log - break; - } - - if other_records.len() < other_nexus_ids.len() { - // XXX-dap log - continue; - } - - if other_records.iter().all(|r| { - r.last_drained_blueprint_id() == last_recorded_blueprint_id - }) { - // XXX-dap log - break; + Err(error) => { + // Log the error, sleep a bit to avoid spinning rapidly when + // conditions haven't changed, then take another lap. + let log = &quiesce.quiesce_opctx.log; + warn!(log, "not yet quiesced"; InlineErrorChain::new(&*error)); + tokio::time::sleep(PAUSE_TIMEOUT).await; + continue; + } + Ok(_) => { + // We're done with this stage. + let log = &quiesce.quiesce_opctx.log; + info!(log, "sagas quiesced -- moving on"); + break; + } } - - // XXX-dap log take another lap } // We're ready to hand off. + let quiesce_opctx = &quiesce.quiesce_opctx; + let log = &quiesce_opctx.log; quiesce.state.send_modify(|q| { let QuiesceState::DrainingSagas { time_requested, @@ -346,12 +254,20 @@ async fn do_quiesce(quiesce: NexusQuiesceHandle) { match datastore.database_nexus_access_update_quiesced(my_nexus_id).await { Ok(count) => { - // XXX-dap log and proceed + info!( + log, + "updated Nexus record to 'quiesced'"; + "count" => count, + ); break; } Err(error) => { - // XXX-dap log, sleep and try again - tokio::time::sleep(POLL_TIMEOUT).await; + warn!( + log, + "failed to update Nexus record to 'quiesced'"; + InlineErrorChain::new(&error) + ); + tokio::time::sleep(PAUSE_TIMEOUT).await; } } } @@ -390,6 +306,207 @@ async fn do_quiesce(quiesce: NexusQuiesceHandle) { // this Nexus for its quiesce state. } +/// Determines whether the fleet of currently-active Nexus instances have +/// all become drained as of the same blueprint. If some, returns `Ok(())`. +/// Otherwise, returns an error describing why we're not drained. +/// +/// Invoked by the caller in a loop until this function returns that we're +/// quiesced. This function sleeps as necessary to avoid spinning when called +/// in a loop. +async fn check_all_sagas_drained( + quiesce: &mut NexusQuiesceHandle, + last_recorded_blueprint_id: &mut Option, + pause_timeout: Duration, +) -> Result<(), anyhow::Error> { + // See the comment in our caller for an explanation of the big picture here. + // This is factored into a function so that we can more easily bail out with + // an error and handle those uniformly (in the caller). + + let saga_quiesce = quiesce.sagas.clone(); + let datastore = quiesce.datastore.clone(); + let my_nexus_id = quiesce.my_nexus_id; + let quiesce_opctx = &quiesce.quiesce_opctx; + let log = &quiesce_opctx.log; + debug!(log, "try_saga_quiesce(): enter"); + + // Grab the most recently loaded blueprint. + let current_blueprint = quiesce + .latest_blueprint + // Wait for a blueprint to be loaded, if necessary. + .wait_for(|value| value.is_some()) + .await + // This should be impossible + .context("latest_blueprint rx channel closed")? + .as_deref() + // unwrap(): wait_for() returns a value for which the closure + // returns true, and we checked that this is `Some` + .unwrap() + // extract just the blueprint part + .1 + // As usual, we clone to avoid locking the watch channel for the + // lifetime of this value. + .clone(); + debug!( + log, + "try_saga_quiesce(): blueprint"; + "blueprint_id" => %current_blueprint.id + ); + + // Determine our own Nexus generation number. + // + // This doesn't ever change once we've determined it once. But we don't + // know what the value is until we see our first blueprint. + let Some(my_generation) = current_blueprint + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .find_map(|(_sled_id, zone)| { + if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { + (zone.id == my_nexus_id).then_some(nexus.nexus_generation) + } else { + None + } + }) + else { + // This case should generally be impossible. We *are* a working Nexus + // zone, so how do we not exist in the blueprint? Anyway, there's + // really not much we can do here. Sleep a little bit just to avoid + // spinning rapidly, then try again in hopes that this was some + // transient thing. + // + // (Panicking would not be better. That would likely put us into a + // restart loop until SMF gives up. Then nothing would clear that + // condition.) + let error = anyhow!( + "could not find self ({my_nexus_id}) in blueprint {}", + current_blueprint.id + ); + error!( + log, + "try_saga_quiesce(): impossible condition"; + InlineErrorChain::new(&*error) + ); + return Err(error); + }; + + // Wait a bounded amount of time for sagas to become drained as of this + // blueprint. + let Ok(_) = tokio::time::timeout( + pause_timeout, + saga_quiesce.wait_for_drained_blueprint(current_blueprint.id), + ) + .await + else { + // We're still not drained as of this blueprint (or we're drained as of + // a newer one). It's possible that the current target blueprint could + // have changed, so bail out and let the caller invoke us again. + bail!("not locally drained as of blueprint {}", current_blueprint.id); + }; + + // We're drained up through this blueprint. First, update our record to + // reflect that, if we haven't already. + match last_recorded_blueprint_id { + Some(blueprint_id) if *blueprint_id == current_blueprint.id => (), + _ => { + info!( + log, + "locally drained as of blueprint"; + "blueprint_id" => %current_blueprint.id + ); + let try_update = datastore + .database_nexus_access_update_blueprint( + quiesce_opctx, + my_nexus_id, + Some(current_blueprint.id), + ) + .await; + match try_update { + Err(error) => { + return Err(anyhow!(error).context( + "updating our db_metadata_nexus record's drained \ + blueprint", + )); + } + Ok(0) => (), + Ok(count) => { + info!( + log, + "updated our db_metadata_nexus record blueprint id"; + "nexus_id" => %my_nexus_id, + "blueprint_id" => %current_blueprint.id, + "count" => count, + ); + } + } + + *last_recorded_blueprint_id = Some(current_blueprint.id); + } + }; + + // Now, see if everybody else is drained up to the same point, or if any + // of them is already quiesced. That would mean *they* already saw that we + // were all drained up to the same point, which is good enough for us to + // proceed, too. + let other_nexus_ids: BTreeSet = current_blueprint + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter_map(|(_sled_id, zone)| { + if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { + (nexus.nexus_generation == my_generation).then_some(zone.id) + } else { + None + } + }) + .collect(); + assert!(!other_nexus_ids.is_empty()); + let other_records = datastore + .database_nexus_access_all(&quiesce_opctx, &other_nexus_ids) + .await + .context( + "loading db_metadata_nexus records for other Nexus instances", + )?; + + if let Some(other) = other_records + .iter() + .find(|r| r.state() == DbMetadataNexusState::Quiesced) + { + info!( + log, + "found other Nexus instance with 'quiesced' record"; + "other_nexus" => %other.nexus_id(), + ); + return Ok(()); + } + + if other_records.len() < other_nexus_ids.len() { + // This shouldn't ever happen. But if it does, we don't want to + // proceed because we don't know if the missing Nexus zone really is + // drained. + bail!( + "found too few other Nexus records (expected {:?}, found {:?})", + other_nexus_ids, + other_records, + ); + } + + match other_records + .iter() + .find(|r| r.last_drained_blueprint_id() != *last_recorded_blueprint_id) + { + Some(undrained) => Err(anyhow!( + "at least one Nexus instance is not drained as of blueprint \ + {:?}: Nexus {}", + last_recorded_blueprint_id, + undrained.nexus_id() + )), + None => { + info!( + log, + "all Nexus instances are drained as of blueprint {:?}", + last_recorded_blueprint_id + ); + Ok(()) + } + } +} + #[cfg(test)] mod test { use crate::app::quiesce::NexusQuiesceHandle; From 0830d574789018964cc82969402a1a6b0714e325 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 15 Sep 2025 15:31:48 -0700 Subject: [PATCH 10/11] fix wrong comment --- nexus/src/app/quiesce.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nexus/src/app/quiesce.rs b/nexus/src/app/quiesce.rs index bd2ad0e15ea..72d83a12c23 100644 --- a/nexus/src/app/quiesce.rs +++ b/nexus/src/app/quiesce.rs @@ -311,8 +311,7 @@ async fn do_quiesce(mut quiesce: NexusQuiesceHandle) { /// Otherwise, returns an error describing why we're not drained. /// /// Invoked by the caller in a loop until this function returns that we're -/// quiesced. This function sleeps as necessary to avoid spinning when called -/// in a loop. +/// quiesced. async fn check_all_sagas_drained( quiesce: &mut NexusQuiesceHandle, last_recorded_blueprint_id: &mut Option, From e8eb204f96309d21e84e43feee2b09ad635d7ddd Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 16 Sep 2025 10:57:15 -0700 Subject: [PATCH 11/11] review feedback --- nexus/db-queries/src/db/datastore/db_metadata.rs | 4 ++-- nexus/db-queries/src/db/pub_test_utils/mod.rs | 6 ++++++ nexus/src/app/quiesce.rs | 12 ++++++------ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index 04404021ae2..2fc349f2d83 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -700,7 +700,7 @@ impl DataStore { let db_nexus_ids: BTreeSet<_> = nexus_ids .iter() - .cloned() + .copied() .map(nexus_db_model::to_db_typed_uuid) .collect(); dsl::db_metadata_nexus @@ -737,7 +737,7 @@ impl DataStore { Ok(count) } - /// Updates the "last_drained_blueprint_id" for the given Nexus id + /// Updates the state for the given Nexus id to "quiesced" pub async fn database_nexus_access_update_quiesced( &self, nexus_id: OmicronZoneUuid, diff --git a/nexus/db-queries/src/db/pub_test_utils/mod.rs b/nexus/db-queries/src/db/pub_test_utils/mod.rs index f4f48d380f6..6662fe8cc06 100644 --- a/nexus/db-queries/src/db/pub_test_utils/mod.rs +++ b/nexus/db-queries/src/db/pub_test_utils/mod.rs @@ -247,6 +247,12 @@ impl TestDatabase { } } + /// Returns a new independent datastore atop a new pool atop the same + /// database + /// + /// This is normally not necessary. You can clone the `Arc` + /// returned by `datastore()`. However, this is important for tests that + /// need separate datastores to test their separate quiesce behaviors. pub async fn extra_datastore(&self, log: &Logger) -> Arc { let pool = new_pool(log, &self.db); Arc::new( diff --git a/nexus/src/app/quiesce.rs b/nexus/src/app/quiesce.rs index 72d83a12c23..b5c97354f1c 100644 --- a/nexus/src/app/quiesce.rs +++ b/nexus/src/app/quiesce.rs @@ -160,7 +160,7 @@ async fn do_quiesce(mut quiesce: NexusQuiesceHandle) { // // So here's the plan: we keep track of the blueprint id as of which we have // fully drained. That means that we've processed all Nexus expungements up - // to that blueprint, _and_ we've recovered all sagas assigned to us, _and + // to that blueprint, _and_ we've recovered all sagas assigned to us, _and_ // finished running them all. When this blueprint id changes (because we've // processed re-assignments for a new blueprint), we'll do two things: // @@ -444,7 +444,7 @@ async fn check_all_sagas_drained( // of them is already quiesced. That would mean *they* already saw that we // were all drained up to the same point, which is good enough for us to // proceed, too. - let other_nexus_ids: BTreeSet = current_blueprint + let our_gen_nexus_ids: BTreeSet = current_blueprint .all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter_map(|(_sled_id, zone)| { if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { @@ -454,9 +454,9 @@ async fn check_all_sagas_drained( } }) .collect(); - assert!(!other_nexus_ids.is_empty()); + assert!(!our_gen_nexus_ids.is_empty()); let other_records = datastore - .database_nexus_access_all(&quiesce_opctx, &other_nexus_ids) + .database_nexus_access_all(&quiesce_opctx, &our_gen_nexus_ids) .await .context( "loading db_metadata_nexus records for other Nexus instances", @@ -474,13 +474,13 @@ async fn check_all_sagas_drained( return Ok(()); } - if other_records.len() < other_nexus_ids.len() { + if other_records.len() < our_gen_nexus_ids.len() { // This shouldn't ever happen. But if it does, we don't want to // proceed because we don't know if the missing Nexus zone really is // drained. bail!( "found too few other Nexus records (expected {:?}, found {:?})", - other_nexus_ids, + our_gen_nexus_ids, other_records, ); }