Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions sled-agent/src/bin/sled-agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ use omicron_common::api::external::Error;
use omicron_common::cmd::fatal;
use omicron_common::cmd::CmdError;
use omicron_sled_agent::bootstrap::{
config::Config as BootstrapConfig, config::SetupServiceConfig as RssConfig,
server as bootstrap_server,
config::Config as BootstrapConfig, server as bootstrap_server,
};
use omicron_sled_agent::rack_setup::config::SetupServiceConfig as RssConfig;
use omicron_sled_agent::{config::Config as SledConfig, server as sled_server};
use std::path::PathBuf;
use structopt::StructOpt;
Expand Down
189 changes: 13 additions & 176 deletions sled-agent/src/bootstrap/agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use super::trust_quorum::{
self, RackSecret, ShareDistribution, TrustQuorumError,
};
use super::views::ShareResponse;
use crate::rack_setup::service::Service as RackSetupService;
use omicron_common::api::external::Error as ExternalError;
use omicron_common::backoff::{
internal_service_policy, retry_notify, BackoffError,
Expand All @@ -19,13 +20,11 @@ use slog::Logger;
use std::io;
use std::path::Path;
use thiserror::Error;
use tokio::sync::Mutex;

/// Describes errors which may occur while operating the bootstrap service.
#[derive(Error, Debug)]
pub enum BootstrapError {
#[error("Cannot deserialize TOML file")]
Toml(#[from] toml::de::Error),

#[error("Error accessing filesystem: {0}")]
Io(#[from] std::io::Error),

Expand All @@ -35,17 +34,8 @@ pub enum BootstrapError {
#[error("Error modifying SMF service: {0}")]
SmfAdm(#[from] smf::AdmError),

#[error("Error making HTTP request to Sled Agent: {0}")]
SledApi(#[from] sled_agent_client::Error<sled_agent_client::types::Error>),

#[error("Error making HTTP request to Nexus: {0}")]
NexusApi(#[from] nexus_client::Error<nexus_client::types::Error>),

#[error(transparent)]
TrustQuorum(#[from] TrustQuorumError),

#[error("Configuration changed")]
Configuration,
}

impl From<BootstrapError> for ExternalError {
Expand Down Expand Up @@ -82,13 +72,15 @@ pub(crate) struct Agent {
log: Logger,
peer_monitor: discovery::PeerMonitor,
share: Option<ShareDistribution>,

rss: Mutex<Option<RackSetupService>>,
}

impl Agent {
pub fn new(log: Logger) -> Result<Self, BootstrapError> {
let peer_monitor = discovery::PeerMonitor::new(&log)?;
let share = read_key_share()?;
Ok(Agent { log, peer_monitor, share })
Ok(Agent { log, peer_monitor, share, rss: Mutex::new(None) })
}

/// Implements the "request share" API.
Expand Down Expand Up @@ -207,169 +199,14 @@ impl Agent {
Ok(())
}

// In lieu of having an operator send requests to all sleds via an
// initialization service, the sled-agent configuration may allow for the
// automated injection of setup requests from a sled.
async fn inject_rack_setup_service_requests(
&self,
config: &Config,
) -> Result<(), BootstrapError> {
// Initializes the Rack Setup Service.
async fn start_rss(&self, config: &Config) -> Result<(), BootstrapError> {
if let Some(rss_config) = &config.rss_config {
info!(self.log, "Injecting RSS configuration: {:#?}", rss_config);

let serialized_config = toml::Value::try_from(&config)
.expect("Cannot serialize configuration");
let config_str = toml::to_string(&serialized_config)
.expect("Cannot turn config to string");

// First, check if this request has previously been made.
//
// Normally, the rack setup service is run with a human-in-the-loop,
// but with this automated injection, we need a way to determine the
// (destructive) initialization has occurred.
//
// We do this by storing the configuration at "rss_config_path"
// after successfully performing initialization.
let rss_config_path =
std::path::Path::new(crate::OMICRON_CONFIG_PATH)
.join("config-rss.toml");
if rss_config_path.exists() {
info!(
self.log,
"RSS configuration already exists at {}",
rss_config_path.to_string_lossy()
);
let old_config: Config = toml::from_str(
&tokio::fs::read_to_string(&rss_config_path).await?,
)?;
if &old_config == config {
info!(
self.log,
"RSS config already applied from: {}",
rss_config_path.to_string_lossy()
);
return Ok(());
}

// TODO(https://github.com/oxidecomputer/omicron/issues/724):
// We could potentially handle this case by deleting all
// partitions (in preparation for applying the new
// configuration), but at the moment it's an error.
warn!(
self.log,
"Rack Setup Service Config was already applied, but has changed.
This means that you may have partitions set up on this sled, but they
may not match the ones requested by the supplied configuration.\n
To re-initialize this sled:
- Disable all Oxide services
- Delete all partitions within the attached zpool
- Delete the configuration file ({})
- Restart the sled agent",
rss_config_path.to_string_lossy()
);
return Err(BootstrapError::Configuration);
} else {
info!(
self.log,
"No RSS configuration found at {}",
rss_config_path.to_string_lossy()
);
}

// Issue the dataset initialization requests to all sleds.
futures::future::join_all(
rss_config.requests.iter().map(|request| async move {
info!(self.log, "observing request: {:#?}", request);
let dur = std::time::Duration::from_secs(60);
let client = reqwest::ClientBuilder::new()
.connect_timeout(dur)
.timeout(dur)
.build()
.map_err(|e| nexus_client::Error::<nexus_client::types::Error>::from(e))?;
let client = sled_agent_client::Client::new_with_client(
&format!("http://{}", request.sled_address),
client,
self.log.new(o!("SledAgentClient" => request.sled_address)),
);

info!(self.log, "sending partition requests...");
for partition in &request.partitions {
let filesystem_put = || async {
info!(self.log, "creating new filesystem: {:?}", partition);
client.filesystem_put(&partition.clone().into())
.await
.map_err(BackoffError::transient)?;
Ok::<
(),
BackoffError<
sled_agent_client::Error<sled_agent_client::types::Error>,
>,
>(())
};
let log_failure = |error, _| {
warn!(self.log, "failed to create filesystem"; "error" => ?error);
};
retry_notify(
internal_service_policy(),
filesystem_put,
log_failure,
).await?;
}
Ok(())
})
).await.into_iter().collect::<Result<Vec<()>, BootstrapError>>()?;

// Issue service initialization requests.
//
// Note that this must happen *after* the partition initialization,
// to ensure that CockroachDB has been initialized before Nexus
// starts.
futures::future::join_all(
rss_config.requests.iter().map(|request| async move {
info!(self.log, "observing request: {:#?}", request);
let dur = std::time::Duration::from_secs(60);
let client = reqwest::ClientBuilder::new()
.connect_timeout(dur)
.timeout(dur)
.build()
.map_err(|e| nexus_client::Error::<nexus_client::types::Error>::from(e))?;
let client = sled_agent_client::Client::new_with_client(
&format!("http://{}", request.sled_address),
client,
self.log.new(o!("SledAgentClient" => request.sled_address)),
);

info!(self.log, "sending service requests...");
let services_put = || async {
info!(self.log, "initializing sled services: {:?}", request.services);
client.services_put(
&sled_agent_client::types::ServiceEnsureBody {
services: request.services.iter().map(|s| s.clone().into()).collect()
})
.await
.map_err(BackoffError::transient)?;
Ok::<
(),
BackoffError<
sled_agent_client::Error<sled_agent_client::types::Error>,
>,
>(())
};
let log_failure = |error, _| {
warn!(self.log, "failed to initialize services"; "error" => ?error);
};
retry_notify(
internal_service_policy(),
services_put,
log_failure,
).await?;
Ok::<(), BootstrapError>(())
})
).await.into_iter().collect::<Result<Vec<()>, BootstrapError>>()?;

// Finally, make sure the configuration is saved so we don't inject
// the requests on the next iteration.
tokio::fs::write(rss_config_path, config_str).await?;
let rss = RackSetupService::new(
self.log.new(o!("component" => "RSS")),
rss_config.clone(),
);
self.rss.lock().await.replace(rss);
}
Ok(())
}
Expand All @@ -391,7 +228,7 @@ impl Agent {
self.establish_sled_quorum().await?;
}

self.inject_rack_setup_service_requests(config).await?;
self.start_rss(config).await?;

Ok(())
}
Expand Down
46 changes: 1 addition & 45 deletions sled-agent/src/bootstrap/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,10 @@

//! Interfaces for working with bootstrap agent configuration

use crate::config::ConfigError;
use crate::params::{DatasetEnsureBody, ServiceRequest};
use dropshot::ConfigDropshot;
use dropshot::ConfigLogging;
use serde::Deserialize;
use serde::Serialize;
use std::net::SocketAddr;
use std::path::Path;
use uuid::Uuid;

/// Configuration for a bootstrap agent
Expand All @@ -21,45 +17,5 @@ pub struct Config {
pub dropshot: ConfigDropshot,
pub log: ConfigLogging,

pub rss_config: Option<SetupServiceConfig>,
}

/// Configuration for the "rack setup service", which is controlled during
/// bootstrap.
///
/// The Rack Setup Service should be responsible for one-time setup actions,
/// such as CockroachDB placement and initialization. Without operator
/// intervention, however, these actions need a way to be automated in our
/// deployment.
///
/// By injecting this (optional) configuration into the bootstrap agent, it
/// can act as a stand-in initialization service.
#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)]
pub struct SetupServiceConfig {
#[serde(default, rename = "request")]
pub requests: Vec<SledRequest>,
}

/// A request to initialize a sled.
#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)]
pub struct SledRequest {
/// The Sled Agent address receiving these requests.
pub sled_address: SocketAddr,

/// Partitions to be created.
#[serde(default, rename = "partition")]
pub partitions: Vec<DatasetEnsureBody>,

/// Services to be instantiated.
#[serde(default, rename = "service")]
pub services: Vec<ServiceRequest>,
}

impl SetupServiceConfig {
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self, ConfigError> {
let path = path.as_ref();
let contents = std::fs::read_to_string(path)?;
let config = toml::from_str(&contents)?;
Ok(config)
}
pub rss_config: Option<crate::rack_setup::config::SetupServiceConfig>,
}
1 change: 1 addition & 0 deletions sled-agent/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ mod instance;
mod instance_manager;
mod nexus;
mod params;
pub mod rack_setup;
pub mod server;
mod services;
mod sled_agent;
Expand Down
52 changes: 52 additions & 0 deletions sled-agent/src/rack_setup/config.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Interfaces for working with RSS config.

use crate::config::ConfigError;
use crate::params::{DatasetEnsureBody, ServiceRequest};
use serde::Deserialize;
use serde::Serialize;
use std::net::SocketAddr;
use std::path::Path;

/// Configuration for the "rack setup service", which is controlled during
/// bootstrap.
///
/// The Rack Setup Service should be responsible for one-time setup actions,
/// such as CockroachDB placement and initialization. Without operator
/// intervention, however, these actions need a way to be automated in our
/// deployment.
///
/// By injecting this (optional) configuration into the bootstrap agent, it
/// can act as a stand-in initialization service.
#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)]
pub struct SetupServiceConfig {
#[serde(default, rename = "request")]
pub requests: Vec<SledRequest>,
}

/// A request to initialize a sled.
#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)]
pub struct SledRequest {
/// The Sled Agent address receiving these requests.
pub sled_address: SocketAddr,

/// Partitions to be created.
#[serde(default, rename = "partition")]
pub partitions: Vec<DatasetEnsureBody>,
Comment thread
bnaecker marked this conversation as resolved.

/// Services to be instantiated.
#[serde(default, rename = "service")]
pub services: Vec<ServiceRequest>,
}

impl SetupServiceConfig {
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self, ConfigError> {
let path = path.as_ref();
let contents = std::fs::read_to_string(path)?;
let config = toml::from_str(&contents)?;
Ok(config)
}
}
8 changes: 8 additions & 0 deletions sled-agent/src/rack_setup/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Rack Setup Service

pub mod config;
pub mod service;
Loading