diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index c4088d41c8f..d4833c6d187 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -45,6 +45,7 @@ use clap::builder::PossibleValuesParser; use clap::builder::TypedValueParser; use db_metadata::DbMetadataArgs; use db_metadata::DbMetadataCommands; +use db_metadata::cmd_db_metadata_force_mark_nexus_quiesced; use db_metadata::cmd_db_metadata_list_nexus; use diesel::BoolExpressionMethods; use diesel::ExpressionMethods; @@ -1148,6 +1149,12 @@ impl DbArgs { }) => { cmd_db_metadata_list_nexus(&opctx, &datastore).await } + DbCommands::DbMetadata(DbMetadataArgs { + command: DbMetadataCommands::ForceMarkNexusQuiesced(args), + }) => { + let token = omdb.check_allow_destructive()?; + cmd_db_metadata_force_mark_nexus_quiesced(&opctx, &datastore, args, token).await + } DbCommands::CrucibleDataset(CrucibleDatasetArgs { command: CrucibleDatasetCommands::List, }) => { diff --git a/dev-tools/omdb/src/bin/omdb/db/db_metadata.rs b/dev-tools/omdb/src/bin/omdb/db/db_metadata.rs index 57c6973c8c0..bf2fb50dbe9 100644 --- a/dev-tools/omdb/src/bin/omdb/db/db_metadata.rs +++ b/dev-tools/omdb/src/bin/omdb/db/db_metadata.rs @@ -5,7 +5,12 @@ //! `omdb db db_metadata` subcommands use super::display_option_blank; + +use crate::check_allow_destructive::DestructiveOperationToken; +use crate::helpers::ConfirmationPrompt; use anyhow::Context; +use anyhow::bail; +use clap::ArgAction; use clap::Args; use clap::Subcommand; use nexus_db_model::DbMetadataNexusState; @@ -27,8 +32,42 @@ pub struct DbMetadataArgs { #[derive(Debug, Subcommand, Clone)] pub enum DbMetadataCommands { + /// Lists the `db_metadata_nexus` records for all Nexuses. #[clap(alias = "ls-nexus")] ListNexus, + + /// !!! DANGEROUS !!! Updates a `db_metadata_nexus` record to 'Quiesced' + /// + /// THIS OPERATION IS DANGEROUS. 
It is the responsibility of the caller + /// to ensure that the specified Nexus zone is not running. + /// + /// If the Nexus being updated is actually running, this operation + /// may cause arbitrary data corruption, as it can allow multiple Nexuses + /// at distinct database versions to inadvertently be running concurrently. + /// + /// This operation is intended to assist in the explicit case where a Nexus + /// is unable to finish marking itself quiesced during the handoff process, + /// and cannot be expunged. + ForceMarkNexusQuiesced(ForceMarkNexusQuiescedArgs), +} + +#[derive(Debug, Args, Clone)] +pub struct ForceMarkNexusQuiescedArgs { + /// The UUID of the Nexus zone to be marked quiesced + id: OmicronZoneUuid, + + /// Skip checking the target blueprint to determine whether Nexus zone `id` + /// is from the generation of Nexus zones that could be active or handing + /// off. + /// + /// Manually marking Nexus quiesced is already an unsafe operation; this + /// makes it even less safe. Use with caution. + #[arg(long, action=ArgAction::SetTrue)] + skip_blueprint_validation: bool, + + /// Skip confirmation prompt to verify that this operation is intended. 
+ #[arg(long, action=ArgAction::SetTrue)] + skip_confirmation: bool, } // DB Metadata @@ -152,3 +191,51 @@ pub async fn cmd_db_metadata_list_nexus( Ok(()) } + +pub async fn cmd_db_metadata_force_mark_nexus_quiesced( + opctx: &OpContext, + datastore: &DataStore, + args: &ForceMarkNexusQuiescedArgs, + _destruction_token: DestructiveOperationToken, +) -> Result<(), anyhow::Error> { + if !args.skip_confirmation { + println!( + "\nDo you want to mark Nexus {} as quiesced in the database?", + args.id + ); + let mut prompt = ConfirmationPrompt::new(); + prompt.read_and_validate("y/N", "y")?; + } + + if !args.skip_blueprint_validation { + let (_, current_target_blueprint) = datastore + .blueprint_target_get_current_full(opctx) + .await + .context("loading current target blueprint")?; + let nexus_generation = current_target_blueprint + .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .find_map(|(_, zone, nexus_zone)| { + if zone.id == args.id { + Some(nexus_zone.nexus_generation) + } else { + None + } + }); + + let Some(gen) = nexus_generation else { + bail!("Nexus {} not found in blueprint", args.id); + }; + let bp_gen = current_target_blueprint.nexus_generation; + if bp_gen <= gen { + bail!( + "Nexus {} not ready to quiesce (nexus generation {gen} >= blueprint gen {bp_gen})", + args.id + ); + } + } + + datastore.database_nexus_access_update_quiesced(args.id).await?; + println!("Marked {} quiesced", args.id); + + Ok(()) +} diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index fc0fd5ccfde..3d0550ab324 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -11,6 +11,16 @@ stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable note: database schema version matches expected () ============================================= +EXECUTING COMMAND: omdb ["--destructive", "db", "db-metadata", "force-mark-nexus-quiesced", "--skip-confirmation", 
"....................."] +termination: Exited(1) +--------------------------------------------- +stdout: +--------------------------------------------- +stderr: +note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable +note: database schema version matches expected () +Error: Nexus ..................... not ready to quiesce (nexus generation 1 >= blueprint gen 1) +============================================= EXECUTING COMMAND: omdb ["db", "disks", "list"] termination: Exited(0) --------------------------------------------- @@ -1756,3 +1766,13 @@ note: database schema version matches expected () assembling reconfigurator state ... done wrote ============================================= +EXECUTING COMMAND: omdb ["--destructive", "db", "db-metadata", "force-mark-nexus-quiesced", "--skip-confirmation", "--skip-blueprint-validation", "....................."] +termination: Exited(0) +--------------------------------------------- +stdout: +Marked ..................... quiesced +--------------------------------------------- +stderr: +note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable +note: database schema version matches expected () +============================================= diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index 3aaffbf98ec..e8807753032 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -178,6 +178,20 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { let invocations: &[&[&str]] = &[ &["db", "db-metadata", "ls-nexus"], + // We expect this operation to fail (the nexus generation is the same + // as the one in the target blueprint - it shouldn't be trying to + // quiesce yet). + // + // We test a version of this command which sets this record to quiesced + // anyway as the final invocation. 
+ &[ + "--destructive", + "db", + "db-metadata", + "force-mark-nexus-quiesced", + "--skip-confirmation", + &cptestctx.server.server_context().nexus.id().to_string(), + ], &["db", "disks", "list"], &["db", "dns", "show"], &["db", "dns", "diff", "external", "2"], @@ -274,6 +288,21 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { // We can't easily test the sled agent output because that's only // provided by a real sled agent, which is not available in the // ControlPlaneTestContext. + + // This operation will set the "db_metadata_nexus" state to quiesced. + // + // This would normally only be set by a Nexus as it shuts itself down; + // save it for last to avoid causing a weird state while testing other + // commands. + &[ + "--destructive", + "db", + "db-metadata", + "force-mark-nexus-quiesced", + "--skip-confirmation", + "--skip-blueprint-validation", + &cptestctx.server.server_context().nexus.id().to_string(), + ], ]; let mut redactor = Redactor::default();