-
Notifications
You must be signed in to change notification settings - Fork 87
coordinated Nexus quiesce #9010
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6545d73
cba808d
8dce7c1
7ad5d55
40f75ac
b57236b
ee285d7
f7c92b3
d45a995
bdf36a7
0830d57
fa6d78f
e8eb204
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,10 +25,12 @@ use nexus_db_model::SchemaUpgradeStep; | |
| use nexus_db_model::SchemaVersion; | ||
| use nexus_types::deployment::BlueprintZoneDisposition; | ||
| use omicron_common::api::external::Error; | ||
| use omicron_uuid_kinds::BlueprintUuid; | ||
| use omicron_uuid_kinds::GenericUuid; | ||
| use omicron_uuid_kinds::OmicronZoneUuid; | ||
| use semver::Version; | ||
| use slog::{Logger, error, info, o}; | ||
| use std::collections::BTreeSet; | ||
| use std::ops::Bound; | ||
| use std::str::FromStr; | ||
|
|
||
|
|
@@ -686,6 +688,77 @@ impl DataStore { | |
| Ok(()) | ||
| } | ||
|
|
||
| /// Returns information about access for all of the given Nexus ids | ||
| /// | ||
| /// This set is assumed to be pretty small. | ||
| pub async fn database_nexus_access_all( | ||
| &self, | ||
| opctx: &OpContext, | ||
| nexus_ids: &BTreeSet<OmicronZoneUuid>, | ||
| ) -> Result<Vec<DbMetadataNexus>, Error> { | ||
| use nexus_db_schema::schema::db_metadata_nexus::dsl; | ||
|
|
||
| let db_nexus_ids: BTreeSet<_> = nexus_ids | ||
| .iter() | ||
| .copied() | ||
| .map(nexus_db_model::to_db_typed_uuid) | ||
| .collect(); | ||
| dsl::db_metadata_nexus | ||
| .filter(dsl::nexus_id.eq_any(db_nexus_ids)) | ||
| .load_async(&*self.pool_connection_authorized(opctx).await?) | ||
| .await | ||
| .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) | ||
| } | ||
|
|
||
| /// Updates the "last_drained_blueprint_id" for the given Nexus id | ||
| pub async fn database_nexus_access_update_blueprint( | ||
| &self, | ||
| opctx: &OpContext, | ||
| nexus_id: OmicronZoneUuid, | ||
| blueprint_id: Option<BlueprintUuid>, | ||
| ) -> Result<usize, Error> { | ||
| use nexus_db_schema::schema::db_metadata_nexus::dsl; | ||
|
|
||
| let nexus_id = nexus_db_model::to_db_typed_uuid(nexus_id); | ||
| let blueprint_id = blueprint_id.map(nexus_db_model::to_db_typed_uuid); | ||
|
|
||
| let conn = self.pool_connection_authorized(opctx).await?; | ||
| let count = diesel::update(dsl::db_metadata_nexus) | ||
| .filter(dsl::nexus_id.eq(nexus_id)) | ||
| // To be conservative, we'll only update this value if the record is | ||
| // currently active. There's no reason it should ever not be active | ||
| // if we're calling this function and if there were an easy way to | ||
| // return an error in that case, we'd just do that. | ||
| .filter(dsl::state.eq(DbMetadataNexusState::Active)) | ||
| .set(dsl::last_drained_blueprint_id.eq(blueprint_id)) | ||
| .execute_async(&*conn) | ||
| .await | ||
| .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; | ||
| Ok(count) | ||
| } | ||
|
|
||
| /// Updates the state for the given Nexus id to "quiesced" | ||
| pub async fn database_nexus_access_update_quiesced( | ||
| &self, | ||
| nexus_id: OmicronZoneUuid, | ||
| ) -> Result<usize, Error> { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same comment here about returning a usize - since we're indexing on |
||
| // A traditional authz check is not possible here because we've quiesced | ||
| // the DataStore, so no further connections are ordinarily available. | ||
| // (We use the lower-level pool interface to bypass that.) | ||
| let conn = self.pool.claim_quiesced().await?; | ||
|
|
||
| use nexus_db_schema::schema::db_metadata_nexus::dsl; | ||
|
|
||
| let nexus_id = nexus_db_model::to_db_typed_uuid(nexus_id); | ||
| let count = diesel::update(dsl::db_metadata_nexus) | ||
| .filter(dsl::nexus_id.eq(nexus_id)) | ||
| .set(dsl::state.eq(DbMetadataNexusState::Quiesced)) | ||
| .execute_async(&*conn) | ||
| .await | ||
| .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; | ||
| Ok(count) | ||
| } | ||
|
|
||
| // Returns the access this Nexus has to the database | ||
| async fn database_nexus_access( | ||
| &self, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -247,6 +247,21 @@ impl TestDatabase { | |
| } | ||
| } | ||
|
|
||
| /// Returns a new independent datastore atop a new pool atop the same | ||
| /// database | ||
| /// | ||
| /// This is normally not necessary. You can clone the `Arc<DataStore>` | ||
| /// returned by `datastore()`. However, this is important for tests that | ||
| /// need separate datastores to test their separate quiesce behaviors. | ||
| pub async fn extra_datastore(&self, log: &Logger) -> Arc<DataStore> { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need this, as opposed to cloning the return value of
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The datastore (really, the underlying pool) has in-memory state associated with being quiesced. In our test, we want independent instances of this so that they can be quiesced independently. edit: I'll add a comment about this.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, fair enough. Probably worth at least a doc comment noting how this is different? I'm tempted to suggest a more descriptive name too, but I'm not sure what.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comment added in e8eb204. |
||
| let pool = new_pool(log, &self.db); | ||
| Arc::new( | ||
| DataStore::new(&log, pool, None, IdentityCheckPolicy::DontCare) | ||
| .await | ||
| .unwrap(), | ||
| ) | ||
| } | ||
|
|
||
| pub fn opctx(&self) -> &OpContext { | ||
| match &self.kind { | ||
| TestKind::NoPool | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -140,6 +140,8 @@ use nexus_config::DnsTasksConfig; | |
| use nexus_db_model::DnsGroup; | ||
| use nexus_db_queries::context::OpContext; | ||
| use nexus_db_queries::db::DataStore; | ||
| use nexus_types::deployment::Blueprint; | ||
| use nexus_types::deployment::BlueprintTarget; | ||
| use nexus_types::deployment::PendingMgsUpdates; | ||
| use omicron_uuid_kinds::OmicronZoneUuid; | ||
| use oximeter::types::ProducerRegistry; | ||
|
|
@@ -426,8 +428,10 @@ impl BackgroundTasksInitializer { | |
| // Background task: blueprint loader | ||
| // | ||
| // Registration is below so that it can watch the planner. | ||
| let blueprint_loader = | ||
| blueprint_load::TargetBlueprintLoader::new(datastore.clone()); | ||
| let blueprint_loader = blueprint_load::TargetBlueprintLoader::new( | ||
| datastore.clone(), | ||
| args.blueprint_load_tx, | ||
| ); | ||
| let rx_blueprint = blueprint_loader.watcher(); | ||
|
|
||
| // Background task: blueprint executor | ||
|
|
@@ -1023,6 +1027,9 @@ pub struct BackgroundTasksData { | |
| pub saga_recovery: saga_recovery::SagaRecoveryHelpers<Arc<Nexus>>, | ||
| /// Channel for TUF repository artifacts to be replicated out to sleds | ||
| pub tuf_artifact_replication_rx: mpsc::Receiver<ArtifactsWithPlan>, | ||
| /// Channel for exposing the latest loaded blueprint | ||
| pub blueprint_load_tx: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just making sure I understand: we have this field now because in
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yup. |
||
| watch::Sender<Option<Arc<(BlueprintTarget, Blueprint)>>>, | ||
| /// `reqwest::Client` for webhook delivery requests. | ||
| /// | ||
| /// This is shared with the external API as it's also used when sending | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit, should we update this to return a bool? It's always going to be 0 or 1 rows updated, right?
Alternatively - we could just return an error if the "count == 0", right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think we want to return an error for the
0case because that could just mean we've already updated it to this blueprint id and that's fine. The caller needs to retry errors, but not that case.I can see the appeal of the bool. I want to log the value either way, which feels slightly more idiomatic at the caller level, so I'm going to leave it.