diff --git a/sled-agent/src/bin/sled-agent.rs b/sled-agent/src/bin/sled-agent.rs index 6c24ed541a4..c2989ee0280 100644 --- a/sled-agent/src/bin/sled-agent.rs +++ b/sled-agent/src/bin/sled-agent.rs @@ -13,9 +13,9 @@ use omicron_common::api::external::Error; use omicron_common::cmd::fatal; use omicron_common::cmd::CmdError; use omicron_sled_agent::bootstrap::{ - config::Config as BootstrapConfig, config::SetupServiceConfig as RssConfig, - server as bootstrap_server, + config::Config as BootstrapConfig, server as bootstrap_server, }; +use omicron_sled_agent::rack_setup::config::SetupServiceConfig as RssConfig; use omicron_sled_agent::{config::Config as SledConfig, server as sled_server}; use std::path::PathBuf; use structopt::StructOpt; diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index c0576b3788c..9f85cf6d1c6 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -10,6 +10,7 @@ use super::trust_quorum::{ self, RackSecret, ShareDistribution, TrustQuorumError, }; use super::views::ShareResponse; +use crate::rack_setup::service::Service as RackSetupService; use omicron_common::api::external::Error as ExternalError; use omicron_common::backoff::{ internal_service_policy, retry_notify, BackoffError, @@ -19,13 +20,11 @@ use slog::Logger; use std::io; use std::path::Path; use thiserror::Error; +use tokio::sync::Mutex; /// Describes errors which may occur while operating the bootstrap service. #[derive(Error, Debug)] pub enum BootstrapError { - #[error("Cannot deserialize TOML file")] - Toml(#[from] toml::de::Error), - #[error("Error accessing filesystem: {0}")] Io(#[from] std::io::Error), @@ -35,17 +34,8 @@ pub enum BootstrapError { #[error("Error modifying SMF service: {0}")] SmfAdm(#[from] smf::AdmError), - #[error("Error making HTTP request to Sled Agent: {0}")] - SledApi(#[from] sled_agent_client::Error), - - #[error("Error making HTTP request to Nexus: {0}")] - NexusApi(#[from] nexus_client::Error), - #[error(transparent)] TrustQuorum(#[from] TrustQuorumError), - - #[error("Configuration changed")] - Configuration, } impl From for ExternalError { @@ -82,13 +72,15 @@ pub(crate) struct Agent { log: Logger, peer_monitor: discovery::PeerMonitor, share: Option, + + rss: Mutex>, } impl Agent { pub fn new(log: Logger) -> Result { let peer_monitor = discovery::PeerMonitor::new(&log)?; let share = read_key_share()?; - Ok(Agent { log, peer_monitor, share }) + Ok(Agent { log, peer_monitor, share, rss: Mutex::new(None) }) } /// Implements the "request share" API. @@ -207,169 +199,14 @@ impl Agent { Ok(()) } - // In lieu of having an operator send requests to all sleds via an - // initialization service, the sled-agent configuration may allow for the - // automated injection of setup requests from a sled. - async fn inject_rack_setup_service_requests( - &self, - config: &Config, - ) -> Result<(), BootstrapError> { + // Initializes the Rack Setup Service. + async fn start_rss(&self, config: &Config) -> Result<(), BootstrapError> { if let Some(rss_config) = &config.rss_config { - info!(self.log, "Injecting RSS configuration: {:#?}", rss_config); - - let serialized_config = toml::Value::try_from(&config) - .expect("Cannot serialize configuration"); - let config_str = toml::to_string(&serialized_config) - .expect("Cannot turn config to string"); - - // First, check if this request has previously been made. - // - // Normally, the rack setup service is run with a human-in-the-loop, - // but with this automated injection, we need a way to determine the - // (destructive) initialization has occurred. - // - // We do this by storing the configuration at "rss_config_path" - // after successfully performing initialization. - let rss_config_path = - std::path::Path::new(crate::OMICRON_CONFIG_PATH) - .join("config-rss.toml"); - if rss_config_path.exists() { - info!( - self.log, - "RSS configuration already exists at {}", - rss_config_path.to_string_lossy() - ); - let old_config: Config = toml::from_str( - &tokio::fs::read_to_string(&rss_config_path).await?, - )?; - if &old_config == config { - info!( - self.log, - "RSS config already applied from: {}", - rss_config_path.to_string_lossy() - ); - return Ok(()); - } - - // TODO(https://github.com/oxidecomputer/omicron/issues/724): - // We could potentially handle this case by deleting all - // partitions (in preparation for applying the new - // configuration), but at the moment it's an error. - warn!( - self.log, - "Rack Setup Service Config was already applied, but has changed. - This means that you may have partitions set up on this sled, but they - may not match the ones requested by the supplied configuration.\n - To re-initialize this sled: - - Disable all Oxide services - - Delete all partitions within the attached zpool - - Delete the configuration file ({}) - - Restart the sled agent", - rss_config_path.to_string_lossy() - ); - return Err(BootstrapError::Configuration); - } else { - info!( - self.log, - "No RSS configuration found at {}", - rss_config_path.to_string_lossy() - ); - } - - // Issue the dataset initialization requests to all sleds. - futures::future::join_all( - rss_config.requests.iter().map(|request| async move { - info!(self.log, "observing request: {:#?}", request); - let dur = std::time::Duration::from_secs(60); - let client = reqwest::ClientBuilder::new() - .connect_timeout(dur) - .timeout(dur) - .build() - .map_err(|e| nexus_client::Error::::from(e))?; - let client = sled_agent_client::Client::new_with_client( - &format!("http://{}", request.sled_address), - client, - self.log.new(o!("SledAgentClient" => request.sled_address)), - ); - - info!(self.log, "sending partition requests..."); - for partition in &request.partitions { - let filesystem_put = || async { - info!(self.log, "creating new filesystem: {:?}", partition); - client.filesystem_put(&partition.clone().into()) - .await - .map_err(BackoffError::transient)?; - Ok::< - (), - BackoffError< - sled_agent_client::Error, - >, - >(()) - }; - let log_failure = |error, _| { - warn!(self.log, "failed to create filesystem"; "error" => ?error); - }; - retry_notify( - internal_service_policy(), - filesystem_put, - log_failure, - ).await?; - } - Ok(()) - }) - ).await.into_iter().collect::, BootstrapError>>()?; - - // Issue service initialization requests. - // - // Note that this must happen *after* the partition initialization, - // to ensure that CockroachDB has been initialized before Nexus - // starts. - futures::future::join_all( - rss_config.requests.iter().map(|request| async move { - info!(self.log, "observing request: {:#?}", request); - let dur = std::time::Duration::from_secs(60); - let client = reqwest::ClientBuilder::new() - .connect_timeout(dur) - .timeout(dur) - .build() - .map_err(|e| nexus_client::Error::::from(e))?; - let client = sled_agent_client::Client::new_with_client( - &format!("http://{}", request.sled_address), - client, - self.log.new(o!("SledAgentClient" => request.sled_address)), - ); - - info!(self.log, "sending service requests..."); - let services_put = || async { - info!(self.log, "initializing sled services: {:?}", request.services); - client.services_put( - &sled_agent_client::types::ServiceEnsureBody { - services: request.services.iter().map(|s| s.clone().into()).collect() - }) - .await - .map_err(BackoffError::transient)?; - Ok::< - (), - BackoffError< - sled_agent_client::Error, - >, - >(()) - }; - let log_failure = |error, _| { - warn!(self.log, "failed to initialize services"; "error" => ?error); - }; - retry_notify( - internal_service_policy(), - services_put, - log_failure, - ).await?; - Ok::<(), BootstrapError>(()) - }) - ).await.into_iter().collect::, BootstrapError>>()?; - - // Finally, make sure the configuration is saved so we don't inject - // the requests on the next iteration. - tokio::fs::write(rss_config_path, config_str).await?; + let rss = RackSetupService::new( + self.log.new(o!("component" => "RSS")), + rss_config.clone(), + ); + self.rss.lock().await.replace(rss); } Ok(()) } @@ -391,7 +228,7 @@ impl Agent { self.establish_sled_quorum().await?; } - self.inject_rack_setup_service_requests(config).await?; + self.start_rss(config).await?; Ok(()) } diff --git a/sled-agent/src/bootstrap/config.rs b/sled-agent/src/bootstrap/config.rs index 15ab42f8246..1fec659b5b3 100644 --- a/sled-agent/src/bootstrap/config.rs +++ b/sled-agent/src/bootstrap/config.rs @@ -4,14 +4,10 @@ //! Interfaces for working with bootstrap agent configuration -use crate::config::ConfigError; -use crate::params::{DatasetEnsureBody, ServiceRequest}; use dropshot::ConfigDropshot; use dropshot::ConfigLogging; use serde::Deserialize; use serde::Serialize; -use std::net::SocketAddr; -use std::path::Path; use uuid::Uuid; /// Configuration for a bootstrap agent @@ -21,45 +17,5 @@ pub struct Config { pub dropshot: ConfigDropshot, pub log: ConfigLogging, - pub rss_config: Option, -} - -/// Configuration for the "rack setup service", which is controlled during -/// bootstrap. -/// -/// The Rack Setup Service should be responsible for one-time setup actions, -/// such as CockroachDB placement and initialization. Without operator -/// intervention, however, these actions need a way to be automated in our -/// deployment. -/// -/// By injecting this (optional) configuration into the bootstrap agent, it -/// can act as a stand-in initialization service. -#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] -pub struct SetupServiceConfig { - #[serde(default, rename = "request")] - pub requests: Vec, -} - -/// A request to initialize a sled. -#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] -pub struct SledRequest { - /// The Sled Agent address receiving these requests. - pub sled_address: SocketAddr, - - /// Partitions to be created. - #[serde(default, rename = "partition")] - pub partitions: Vec, - - /// Services to be instantiated. - #[serde(default, rename = "service")] - pub services: Vec, -} - -impl SetupServiceConfig { - pub fn from_file>(path: P) -> Result { - let path = path.as_ref(); - let contents = std::fs::read_to_string(path)?; - let config = toml::from_str(&contents)?; - Ok(config) - } + pub rss_config: Option, } diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index 245af13ab21..fb93dfc5fe1 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -27,6 +27,7 @@ mod instance; mod instance_manager; mod nexus; mod params; +pub mod rack_setup; pub mod server; mod services; mod sled_agent; diff --git a/sled-agent/src/rack_setup/config.rs b/sled-agent/src/rack_setup/config.rs new file mode 100644 index 00000000000..4d284cfed7b --- /dev/null +++ b/sled-agent/src/rack_setup/config.rs @@ -0,0 +1,52 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Interfaces for working with RSS config. + +use crate::config::ConfigError; +use crate::params::{DatasetEnsureBody, ServiceRequest}; +use serde::Deserialize; +use serde::Serialize; +use std::net::SocketAddr; +use std::path::Path; + +/// Configuration for the "rack setup service", which is controlled during +/// bootstrap. +/// +/// The Rack Setup Service should be responsible for one-time setup actions, +/// such as CockroachDB placement and initialization. Without operator +/// intervention, however, these actions need a way to be automated in our +/// deployment. +/// +/// By injecting this (optional) configuration into the bootstrap agent, it +/// can act as a stand-in initialization service. +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] +pub struct SetupServiceConfig { + #[serde(default, rename = "request")] + pub requests: Vec, +} + +/// A request to initialize a sled. +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] +pub struct SledRequest { + /// The Sled Agent address receiving these requests. + pub sled_address: SocketAddr, + + /// Partitions to be created. + #[serde(default, rename = "partition")] + pub partitions: Vec, + + /// Services to be instantiated. + #[serde(default, rename = "service")] + pub services: Vec, +} + +impl SetupServiceConfig { + pub fn from_file>(path: P) -> Result { + let path = path.as_ref(); + let contents = std::fs::read_to_string(path)?; + let config = toml::from_str(&contents)?; + Ok(config) + } +} diff --git a/sled-agent/src/rack_setup/mod.rs b/sled-agent/src/rack_setup/mod.rs new file mode 100644 index 00000000000..e947ff99ef0 --- /dev/null +++ b/sled-agent/src/rack_setup/mod.rs @@ -0,0 +1,8 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Rack Setup Service + +pub mod config; +pub mod service; diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs new file mode 100644 index 00000000000..470725b961d --- /dev/null +++ b/sled-agent/src/rack_setup/service.rs @@ -0,0 +1,225 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Rack Setup Service implementation + +use super::config::SetupServiceConfig as Config; +use omicron_common::backoff::{ + internal_service_policy, retry_notify, BackoffError, +}; +use slog::Logger; +use thiserror::Error; + +/// Describes errors which may occur while operating the setup service. +#[derive(Error, Debug)] +pub enum SetupServiceError { + #[error("Error accessing filesystem: {0}")] + Io(#[from] std::io::Error), + + #[error("Error making HTTP request to Sled Agent: {0}")] + SledApi(#[from] sled_agent_client::Error), + + #[error("Cannot deserialize TOML file")] + Toml(#[from] toml::de::Error), + + #[error(transparent)] + Http(#[from] reqwest::Error), + + #[error("Configuration changed")] + Configuration, +} + +/// The interface to the Rack Setup Service. +pub struct Service { + handle: tokio::task::JoinHandle>, +} + +impl Service { + pub fn new(log: Logger, config: Config) -> Self { + let handle = tokio::task::spawn(async move { + let svc = ServiceInner::new(log); + svc.inject_rack_setup_requests(&config).await + }); + + Service { handle } + } + + /// Awaits the completion of the RSS service. + pub async fn join(self) -> Result<(), SetupServiceError> { + self.handle.await.expect("Rack Setup Service Task panicked") + } +} + +/// The implementation of the Rack Setup Service. +struct ServiceInner { + log: Logger, +} + +impl ServiceInner { + pub fn new(log: Logger) -> Self { + ServiceInner { log } + } + + // In lieu of having an operator send requests to all sleds via an + // initialization service, the sled-agent configuration may allow for the + // automated injection of setup requests from a sled. + async fn inject_rack_setup_requests( + &self, + config: &Config, + ) -> Result<(), SetupServiceError> { + info!(self.log, "Injecting RSS configuration: {:#?}", config); + + let serialized_config = toml::Value::try_from(&config) + .expect("Cannot serialize configuration"); + let config_str = toml::to_string(&serialized_config) + .expect("Cannot turn config to string"); + + // First, check if this request has previously been made. + // + // Normally, the rack setup service is run with a human-in-the-loop, + // but with this automated injection, we need a way to determine the + // (destructive) initialization has occurred. + // + // We do this by storing the configuration at "rss_config_path" + // after successfully performing initialization. + let rss_config_path = std::path::Path::new(crate::OMICRON_CONFIG_PATH) + .join("config-rss.toml"); + if rss_config_path.exists() { + info!( + self.log, + "RSS configuration already exists at {}", + rss_config_path.to_string_lossy() + ); + let old_config: Config = toml::from_str( + &tokio::fs::read_to_string(&rss_config_path).await?, + )?; + if &old_config == config { + info!( + self.log, + "RSS config already applied from: {}", + rss_config_path.to_string_lossy() + ); + return Ok(()); + } + + // TODO(https://github.com/oxidecomputer/omicron/issues/724): + // We could potentially handle this case by deleting all + // partitions (in preparation for applying the new + // configuration), but at the moment it's an error. + warn!( + self.log, + "Rack Setup Service Config was already applied, but has changed. + This means that you may have partitions set up on this sled, but they + may not match the ones requested by the supplied configuration.\n + To re-initialize this sled: + - Disable all Oxide services + - Delete all partitions within the attached zpool + - Delete the configuration file ({}) + - Restart the sled agent", + rss_config_path.to_string_lossy() + ); + return Err(SetupServiceError::Configuration); + } else { + info!( + self.log, + "No RSS configuration found at {}", + rss_config_path.to_string_lossy() + ); + } + + // Issue the dataset initialization requests to all sleds. + futures::future::join_all( + config.requests.iter().map(|request| async move { + info!(self.log, "observing request: {:#?}", request); + let dur = std::time::Duration::from_secs(60); + let client = reqwest::ClientBuilder::new() + .connect_timeout(dur) + .timeout(dur) + .build()?; + let client = sled_agent_client::Client::new_with_client( + &format!("http://{}", request.sled_address), + client, + self.log.new(o!("SledAgentClient" => request.sled_address)), + ); + + info!(self.log, "sending partition requests..."); + for partition in &request.partitions { + let filesystem_put = || async { + info!(self.log, "creating new filesystem: {:?}", partition); + client.filesystem_put(&partition.clone().into()) + .await + .map_err(BackoffError::transient)?; + Ok::< + (), + BackoffError< + sled_agent_client::Error, + >, + >(()) + }; + let log_failure = |error, _| { + warn!(self.log, "failed to create filesystem"; "error" => ?error); + }; + retry_notify( + internal_service_policy(), + filesystem_put, + log_failure, + ).await?; + } + Ok(()) + }) + ).await.into_iter().collect::, SetupServiceError>>()?; + + // Issue service initialization requests. + // + // Note that this must happen *after* the partition initialization, + // to ensure that CockroachDB has been initialized before Nexus + // starts. + futures::future::join_all( + config.requests.iter().map(|request| async move { + info!(self.log, "observing request: {:#?}", request); + let dur = std::time::Duration::from_secs(60); + let client = reqwest::ClientBuilder::new() + .connect_timeout(dur) + .timeout(dur) + .build()?; + let client = sled_agent_client::Client::new_with_client( + &format!("http://{}", request.sled_address), + client, + self.log.new(o!("SledAgentClient" => request.sled_address)), + ); + + info!(self.log, "sending service requests..."); + let services_put = || async { + info!(self.log, "initializing sled services: {:?}", request.services); + client.services_put( + &sled_agent_client::types::ServiceEnsureBody { + services: request.services.iter().map(|s| s.clone().into()).collect() + }) + .await + .map_err(BackoffError::transient)?; + Ok::< + (), + BackoffError< + sled_agent_client::Error, + >, + >(()) + }; + let log_failure = |error, _| { + warn!(self.log, "failed to initialize services"; "error" => ?error); + }; + retry_notify( + internal_service_policy(), + services_put, + log_failure, + ).await?; + Ok::<(), SetupServiceError>(()) + }) + ).await.into_iter().collect::, SetupServiceError>>()?; + + // Finally, make sure the configuration is saved so we don't inject + // the requests on the next iteration. + tokio::fs::write(rss_config_path, config_str).await?; + Ok(()) + } +}