diff --git a/Cargo.lock b/Cargo.lock index 1bd1014166d..f266bf4172e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7160,13 +7160,16 @@ dependencies = [ "chrono", "ereport-types", "iddqd", + "nexus-db-model", "nexus-inventory", "nexus-reconfigurator-planning", "nexus-types", "omicron-common", + "omicron-rpaths", "omicron-test-utils", "omicron-uuid-kinds", "omicron-workspace-hack", + "pq-sys", "rand 0.9.2", "serde", "serde_json", diff --git a/common/src/lib.rs b/common/src/lib.rs index 162c5fd2c6d..2ce4140b728 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -60,8 +60,22 @@ impl slog::KV for FileKv { /// This exists because the database doesn't store nanosecond-precision, so if /// we store nanosecond-precision timestamps, then DateTime conversion is lossy /// when round-tripping through the database. That's rather inconvenient. +/// +/// To truncate an arbitrary timestamp to database precision, use +/// [`timestamp_db_precision`]. pub fn now_db_precision() -> chrono::DateTime { - let ts = chrono::Utc::now(); + timestamp_db_precision(chrono::Utc::now()) +} + +/// Truncates a [`chrono::DateTime`]`<`[`chrono::Utc`]`>` to the previous +/// microsecond. +/// +/// This exists because the database doesn't store nanosecond-precision, so if +/// we store nanosecond-precision timestamps, then `DateTime`conversion is lossy +/// when round-tripping through the database. That's rather inconvenient. +pub fn timestamp_db_precision( + ts: chrono::DateTime, +) -> chrono::DateTime { let nanosecs = ts.timestamp_subsec_nanos(); let micros = ts.timestamp_subsec_micros(); let only_nanos = nanosecs - micros * 1000; diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/bin/omdb/db/ereport.rs index d9653772928..f6ef520bb8c 100644 --- a/dev-tools/omdb/src/bin/omdb/db/ereport.rs +++ b/dev-tools/omdb/src/bin/omdb/db/ereport.rs @@ -9,7 +9,6 @@ use super::check_limit; use crate::Omdb; use crate::helpers::CONNECTION_OPTIONS_HEADING; use crate::helpers::const_max_len; -use crate::helpers::datetime_opt_rfc3339_concise; use crate::helpers::datetime_rfc3339_concise; use crate::helpers::display_option_blank; use crate::helpers::display_option_invalid; @@ -23,20 +22,24 @@ use chrono::Utc; use clap::Args; use clap::Subcommand; use diesel::AggregateExpressionMethods; -use diesel::dsl::{count, min}; +use diesel::dsl::{count, max}; use diesel::prelude::*; use internal_dns_types::names::ServiceName; use nexus_db_lookup::DbConnection; +use nexus_db_model::EreporterRestart; use nexus_db_model::ereport as model; use nexus_db_model::ereport::DbEna; use nexus_db_queries::db; use nexus_db_queries::db::DataStore; use nexus_db_queries::db::queries::ALLOW_FULL_TABLE_SCAN_SQL; use nexus_db_schema::schema::ereport::dsl; +use nexus_db_schema::schema::ereporter_restart::dsl as restart_dsl; use nexus_types::fm::ereport::Ena; use nexus_types::fm::ereport::Reporter; use omicron_uuid_kinds::EreporterRestartUuid; use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::SledUuid; +use std::collections::HashMap; use tabled::Tabled; use uuid::Uuid; @@ -145,6 +148,7 @@ struct ReportersArgs { #[clap(long = "slot", short = 's', requires = "slot_type")] slot: Option, + #[clap(long = "serial")] serial: Option, } @@ -508,69 +512,74 @@ async fn cmd_db_ereporters( datastore: &DataStore, args: &ReportersArgs, ) -> anyhow::Result<()> { - let &ReportersArgs { slot, slot_type, ref serial } = args; + let &ReportersArgs { slot_type, slot, serial: ref want_serial } = args; let slot_type = slot_type.map(nexus_db_model::SpType::from); let conn = datastore.pool_connection_for_tests().await?; - let reporters = (*conn).transaction_async({ - let serial = serial.clone(); - async move |conn| { - // Selecting all reporters may require a full table scan, depending - // on filters. + let (restarts, ereport_info) = (*conn) + .transaction_async(async move |conn| { conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await?; - let mut query = dsl::ereport - .group_by(( - dsl::restart_id, - dsl::reporter, - dsl::sled_id, - dsl::slot_type, - dsl::slot, - dsl::serial_number, - dsl::part_number - )) - .select(( - dsl::restart_id, - model::Reporter::as_select(), - dsl::serial_number, - dsl::part_number, - min(dsl::time_collected), - count(dsl::ena).aggregate_distinct(), - )) - .into_boxed(); - - if let Some(slot) = slot { - if slot_type.is_some() { - query = query - .filter(dsl::slot.eq(db::model::SqlU16::new(slot))); - } else { - anyhow::bail!( - "cannot filter reporters by slot without a value for `--type`" - ) - } - } + // The canonical list of reporter restarts comes from the + // `ereporter_restart` table. + let mut query = restart_dsl::ereporter_restart + .select(EreporterRestart::as_select()) + .into_boxed(); if let Some(slot_type) = slot_type { + query = query.filter(restart_dsl::slot_type.eq(slot_type)); + } + if let Some(slot) = slot { query = query - .filter(dsl::slot_type.eq(slot_type)); + .filter(restart_dsl::slot.eq(db::model::SqlU16::new(slot))); } + let restarts = query + .load_async::(&conn) + .await + .context("listing ereporter restarts")?; + + // The `ereporter_restart` table doesn't record the reporter's sled + // ID, VPD identity, or ereport count, so gather those from the + // `ereport` table, grouped by restart ID. The sled ID is part of + // the `GROUP BY` (rather than aggregated) since it is constant for + // a given restart ID; the VPD identity may start out NULL and be + // filled in later, so we take its `MAX` (ignoring NULLs) as we do + // for the slot number. This may require a full table scan. + let ereport_info = dsl::ereport + .group_by((dsl::restart_id, dsl::sled_id)) + .select(( + dsl::restart_id, + dsl::sled_id, + max(dsl::serial_number), + max(dsl::part_number), + count(dsl::ena).aggregate_distinct(), + )) + .load_async::<( + Uuid, + Option, + Option, + Option, + i64, + )>(&conn) + .await + .context("querying ereport metadata by restart ID")?; - if let Some(serial) = serial { - query = query.filter(dsl::serial_number.eq(serial.clone())); - } + Ok::<_, anyhow::Error>((restarts, ereport_info)) + }) + .await?; - query - .load_async::<(Uuid, model::Reporter, Option, Option, Option>, i64)>( - &conn, - ) - .await.context("listing reporter entries") - } - }).await?; + // Index the per-restart `ereport` metadata by restart ID. + let mut ereport_info = ereport_info + .into_iter() + .map(|(id, sled_id, serial, part_number, ereports)| { + (id, (sled_id, serial, part_number, ereports)) + }) + .collect::>(); #[derive(Tabled)] #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] struct ReporterRow { - #[tabled(display_with = "datetime_opt_rfc3339_concise")] - first_seen: Option>, + #[tabled(display_with = "datetime_rfc3339_concise")] + first_seen: DateTime, id: Uuid, #[tabled(display_with = "display_option_invalid")] identity: Option, @@ -581,27 +590,45 @@ async fn cmd_db_ereporters( ereports: i64, } - let mut rows = reporters + let mut rows = restarts .into_iter() - .map(|(id, reporter, serial, part_number, first_seen, ereports)| { - let identity = match reporter.try_into() { + .filter_map(|restart| { + let id = restart.id.into_untyped_uuid(); + let (sled_id, serial, part_number, ereports) = + ereport_info.remove(&id).unwrap_or((None, None, None, 0)); + // If we are filtering by serial number, skip any that don't match. + if want_serial.is_some() && serial.as_ref() != want_serial.as_ref() + { + return None; + } + // The reporter identity combines the restart's location (from + // `ereporter_restart`) with the sled ID (from `ereport`). + let identity = match (model::Reporter { + reporter: restart.reporter, + sled_id: sled_id + .map(|sled| SledUuid::from_untyped_uuid(sled).into()), + slot_type: restart.slot_type, + slot: restart.slot, + }) + .try_into() + { Ok(reporter) => Some(reporter), Err(err) => { eprintln!( - "error: encounted an invalid reporter entry for \ + "error: encountered an invalid reporter entry for \ reporter ID {id}: {err}", ); None } }; - ReporterRow { - first_seen, + Some(ReporterRow { + first_seen: restart.time_first_seen, id, identity, serial, part_number, ereports, - } + }) }) .collect::>(); rows.sort_by_key(|row| row.first_seen); diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index d0b05f5b1b4..6aeac9f1049 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -711,6 +711,7 @@ task: "fm_analysis" parent sitrep: ..................... inventory collection: ..................... + total known ereport restart IDs: 0 no new ereports since the parent sitrep no cases copied forward @@ -1409,6 +1410,7 @@ task: "fm_analysis" parent sitrep: ..................... inventory collection: ..................... + total known ereport restart IDs: 0 no new ereports since the parent sitrep no cases copied forward diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index c17b9c83901..96b9fcc4801 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -650,15 +650,13 @@ termination: Exited(0) stdout: List ereport reporters -Usage: omdb db ereport reporters [OPTIONS] [SERIAL] - -Arguments: - [SERIAL] +Usage: omdb db ereport reporters [OPTIONS] Options: --log-level log level filter [env: LOG_LEVEL=] [default: warn] -t, --type -s, --slot + --serial --color Color output [default: auto] [possible values: auto, always, never] -h, --help Print help diff --git a/nexus/db-model/src/ereporter_restart.rs b/nexus/db-model/src/ereporter_restart.rs new file mode 100644 index 00000000000..60b8edd7107 --- /dev/null +++ b/nexus/db-model/src/ereporter_restart.rs @@ -0,0 +1,43 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use crate::DbTypedUuid; +use crate::EreporterType; +use crate::SpMgsSlot; +use crate::SpType; +use chrono::DateTime; +use chrono::Utc; +use nexus_db_schema::schema::ereporter_restart; +use omicron_uuid_kinds::EreporterRestartKind; +use omicron_uuid_kinds::EreporterRestartUuid; + +#[derive(Clone, Debug, Insertable, Queryable, Selectable)] +#[diesel(table_name = ereporter_restart)] +pub struct EreporterRestart { + pub id: DbTypedUuid, + pub time_first_seen: DateTime, + pub reporter: EreporterType, + pub slot_type: SpType, + pub slot: Option, +} + +impl EreporterRestart { + pub fn slot_number(&self) -> Option { + self.slot.map(|slot| (*slot).0) + } + + pub fn id(&self) -> &EreporterRestartUuid { + &self.id.0 + } +} + +impl iddqd::IdOrdItem for EreporterRestart { + type Key<'a> = &'a EreporterRestartUuid; + + fn key(&self) -> Self::Key<'_> { + self.id() + } + + iddqd::id_upcast!(); +} diff --git a/nexus/db-model/src/ereporter_type.rs b/nexus/db-model/src/ereporter_type.rs index 933675bc5fd..8bd142a95b4 100644 --- a/nexus/db-model/src/ereporter_type.rs +++ b/nexus/db-model/src/ereporter_type.rs @@ -14,6 +14,7 @@ impl_enum_type!( Clone, Debug, PartialEq, + Eq, Serialize, Deserialize, AsExpression, diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index ecd3402e45b..3705e2839aa 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -148,6 +148,7 @@ mod disk_type_local_storage; mod dns; mod downstairs; pub mod ereport; +mod ereporter_restart; mod ereporter_type; mod external_ip; mod external_subnet; @@ -301,6 +302,7 @@ pub use disk_type_local_storage::*; pub use dns::*; pub use downstairs::*; pub use ereport::Ereport; +pub use ereporter_restart::*; pub use ereporter_type::*; pub use external_ip::*; pub use external_subnet::*; diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 88fc1a6bb63..9e54b5d1957 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(271, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(272, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ pub static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(272, "ereporter-restart-order-v2"), KnownVersion::new(271, "inv-fmd"), KnownVersion::new(270, "fm-alert-resource-deletion"), KnownVersion::new(269, "fm-disk-de-and-facts"), diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index 166eaf47467..a23b89d7519 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -9,11 +9,11 @@ use crate::authz; use crate::context::OpContext; use crate::db::datastore::RunnableQuery; use crate::db::model::Ereport; +use crate::db::model::EreporterRestart; +use crate::db::model::EreporterType; use crate::db::model::SpMgsSlot; use crate::db::model::SpType; use crate::db::model::SqlU16; -use crate::db::model::SqlU32; -use crate::db::model::ereport as model; use crate::db::model::ereport::DbEna; use crate::db::pagination::{paginated, paginated_multicolumn}; use crate::db::raw_query_builder::QueryBuilder; @@ -21,14 +21,15 @@ use crate::db::raw_query_builder::TypedSqlQuery; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::DateTime; use chrono::Utc; -use diesel::AggregateExpressionMethods; -use diesel::dsl::{count, min}; +use diesel::pg::Pg; use diesel::prelude::*; +use diesel::query_builder::*; use diesel::sql_types; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use nexus_db_lookup::DbConnection; use nexus_db_schema::schema::ereport::dsl; +use nexus_db_schema::schema::ereporter_restart::dsl as restart_dsl; use nexus_types::fm::ereport as fm; use nexus_types::fm::ereport::EreportFilters; use nexus_types::fm::ereport::EreportId; @@ -45,14 +46,6 @@ use uuid::Uuid; type EreportIdTuple = (Uuid, DbEna); -#[derive(Clone, Debug)] -pub struct EreporterRestartBySerial { - pub id: EreporterRestartUuid, - pub first_seen_at: DateTime, - pub reporter_kind: fm::Reporter, - pub ereports: u32, -} - impl DataStore { /// Fetch an ereport by its restart ID and ENA. /// @@ -158,60 +151,18 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } - /// List unique ereporter restarts for the given serial number. - pub async fn ereporter_restart_list_by_serial( + /// Lists ereporter restarts, paginated by restart ID. + pub async fn ereporter_restart_list( &self, opctx: &OpContext, - serial: String, - ) -> ListResultVec { + pagparams: &DataPageParams<'_, Uuid>, + ) -> ListResultVec { opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; - - let conn = &*self.pool_connection_authorized(opctx).await?; - let rows = Self::restart_list_by_serial_query(serial.clone()) - .load_async::<(Uuid, model::Reporter, Option>, SqlU32)>( - conn, - ) + paginated(restart_dsl::ereporter_restart, restart_dsl::id, pagparams) + .select(EreporterRestart::as_select()) + .load_async(&*self.pool_connection_authorized(opctx).await?) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - - let restarts = rows.into_iter().map(|(restart_id, reporter, first_seen, ereports)| { - let first_seen_at = first_seen.expect( - "`min(time_collected)` should never return `NULL`, since the \ - `time_collected` column is not nullable, and the `SELECT` clause \ - should return nothing if the result set is empty" - ); - EreporterRestartBySerial { - id: EreporterRestartUuid::from_untyped_uuid(restart_id), - reporter_kind: reporter.try_into().unwrap(), - first_seen_at, - ereports: ereports.into(), - } - }).collect(); - - Ok(restarts) - } - - fn restart_list_by_serial_query( - serial: String, - ) -> impl RunnableQuery<(Uuid, model::Reporter, Option>, SqlU32)> - { - dsl::ereport - .filter(dsl::serial_number.eq(serial.clone())) - .filter(dsl::time_deleted.is_null()) - .group_by(( - dsl::restart_id, - dsl::reporter, - dsl::slot_type, - dsl::slot, - dsl::sled_id, - )) - .select(( - dsl::restart_id, - model::Reporter::as_select(), - min(dsl::time_collected), - count(dsl::ena).aggregate_distinct(), - )) - .order_by(dsl::restart_id) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } pub async fn latest_ereport_id( @@ -286,39 +237,187 @@ impl DataStore { .select((dsl::restart_id, dsl::ena)) } + /// Inserts the provided tranche of `ereports` into the `ereport` table, and + /// potentially updates the `ereporter_restart` table if the restart ID of + /// the inserted ereports has not been seen before. + /// + /// # Returns + /// + /// This function returns a tuple containing the number of new `ereport` + /// rows that were created, along with the newest [`EreportId`] for the same + /// reporter index as the inserted ereports. + /// + /// The returned ereport ID is intended to provide the caller with the + /// latest ENA to use in a subsequent request to ingest ereports from the + /// same reporter. In some cases, it may: + /// + /// - be a newer ENA than the highest ENA in the inserted ereports, if + /// additional ereports were inserted concurrently + /// - have a different restart ID than the one provided, if ereports from + /// a newer restart of that reporter were inserted concurrently pub async fn ereports_insert( &self, opctx: &OpContext, + restart_id: EreporterRestartUuid, + time_collected: DateTime, reporter: fm::Reporter, ereports: impl IntoIterator, ) -> CreateResult<(usize, Option)> { opctx.authorize(authz::Action::CreateChild, &authz::FLEET).await?; let conn = self.pool_connection_authorized(opctx).await?; + let ereports = ereports .into_iter() .map(|data| Ereport::new(data, reporter)) .collect::>(); - let created = diesel::insert_into(dsl::ereport) - .values(ereports) - .on_conflict((dsl::restart_id, dsl::ena)) - .do_nothing() - .execute_async(&*conn) - .await - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - .internal_context("failed to insert ereports") - })?; + let n_ereports = ereports.len(); + let created = Self::ereports_insert_query( + restart_id, + time_collected, + reporter, + ereports, + ) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server).internal_context( + format!( + "failed to insert {n_ereports} ereports from {reporter} \ + (restart {restart_id})", + ), + ) + })?; + let latest = self .latest_ereport_id_on_conn(&conn, reporter) .await .map_err(|e| { e.internal_context(format!( - "failed to refresh latest ereport ID for {reporter}" + "failed to refresh latest ereport ID for {reporter}", )) })?; Ok((created, latest)) } + /// Returns a query for inserting ereports into the database and updating + /// the restart history table if the restart ID of the provided ereports is + /// not already present. + /// + /// The two operations of inserting the ereports into the `ereport` table + /// and inserting an entry into the `ereporter_restart` table for the + /// restart ID are performed in a single atomic CTE. It's necessary for + /// these two operations to be done atomically, because if they are + /// non-atomic, it is possible for a Nexus to die, or go out to lunch for a + /// bit, between inserting the ereport records and creating the + /// `ereporter_restart` record for a tranche of ereports from a not-yet-seen + /// restart ID. If this happens, another Nexus could insert some more + /// ereports from that restart ID, and try to put something in the restart + /// ID table with a timestamp that is later than the actual first time + /// ereports from that ID were observed. + /// + /// The SQL generated here is output to + /// `tests/output/ereports_insert_host.sql` and + /// `tests/output/ereports_insert_sp.sql`, for host-OS and SP ereports, + /// respectively. + fn ereports_insert_query( + restart_id: EreporterRestartUuid, + time_collected: chrono::DateTime, + reporter: fm::Reporter, + ereports: Vec, + ) -> impl RunnableQuery { + /// This is basically just a big pile of ceremony for combining two + /// little Diesel queries into a CTE... + struct EreportInsertQuery { + insert_ereports: IE, + insert_reporter: IR, + slot: Option, + } + + impl QueryId for EreportInsertQuery { + type QueryId = (); + const HAS_STATIC_QUERY_ID: bool = false; + } + + impl Query for EreportInsertQuery { + type SqlType = sql_types::BigInt; + } + + impl diesel::RunQueryDsl for EreportInsertQuery {} + + impl QueryFragment for EreportInsertQuery + where + IE: QueryFragment, + IR: QueryFragment, + { + fn walk_ast<'b>( + &'b self, + mut out: AstPass<'_, 'b, Pg>, + ) -> QueryResult<()> { + out.push_sql("WITH inserted_ereports AS ( "); + self.insert_ereports.walk_ast(out.reborrow())?; + + out.push_sql("), inserted_reporter AS ("); + self.insert_reporter.walk_ast(out.reborrow())?; + out.push_sql(" ON CONFLICT (id) DO "); + // If we have a slot number, update it so that a previously-null + // slot number is filled in; if we do not, do nothing on + // conflict so a previously non-NULL slot is not clobbered. + if let Some(ref slot) = self.slot { + out.push_sql("UPDATE SET \"slot\" = "); + out.push_bind_param::(slot)?; + } else { + out.push_sql("NOTHING"); + } + // We don't actually need this, but `WITH` clauses have to + // return something, sooo.... + out.push_sql(" RETURNING id) "); + out.push_sql("SELECT count(*) FROM inserted_ereports"); + Ok(()) + } + } + + let (reporter, slot_type, slot) = match reporter { + fm::Reporter::HostOs { slot, .. } => { + (EreporterType::Host, SpType::Sled, slot.map(SqlU16::from)) + } + fm::Reporter::Sp { sp_type, slot } => { + let sp_type = sp_type.into(); + let slot = SqlU16::from(slot); + (EreporterType::Sp, sp_type, Some(slot)) + } + }; + + // The query fragment to insert ereports into the `ereport` table. + let insert_ereports = diesel::insert_into(dsl::ereport) + .values(ereports) + // Some or all of the ereports collected in this batch may already + // exist in the database because they were ingested by another + // Nexus. If the same ENAs exist for this restart ID, that's fine; + // don't overwrite them. + .on_conflict((dsl::restart_id, dsl::ena)) + .do_nothing() + .returning(dsl::ena); + + // Query fragment to insert the reporter restart entry into the + // ereporter restart table, or update the existing entry's slot column + // if one exists and the slot column is null. The null behavior will be + // added by the `walk_ast()` method on `EreporterInsertQuery`, because + // it depends on whether or not there is a slot number to insert, and I + // couldn't figure out how to get diesel to let me type erase an INSERT + // statement that may have one of multiple ON CONFLICT clauses... + let insert_reporter = diesel::insert_into( + restart_dsl::ereporter_restart, + ) + .values(crate::db::model::EreporterRestart { + id: restart_id.into(), + time_first_seen: time_collected, + reporter, + slot_type, + slot: slot.map(SpMgsSlot::from), + }); + EreportInsertQuery { insert_ereports, insert_reporter, slot } + } + /// Lists ereports which have not been marked as **definitely seen** /// (included in a committed sitrep) in the database, restricted to /// ereports whose `class` is one of `classes`, paginated by the reporter @@ -489,12 +588,15 @@ impl DataStore { mod tests { use super::*; use crate::db::explain::ExplainableAsync; + use crate::db::pagination::Paginator; use crate::db::pub_test_utils::TestDatabase; use crate::db::pub_test_utils::explain::assert_uses_partial_index_only; use crate::db::raw_query_builder::expectorate_query_contents; use dropshot::PaginationOrder; + use ereport_types::Ena; use omicron_test_utils::dev; use omicron_uuid_kinds::OmicronZoneUuid; + use std::collections::BTreeMap; use std::num::NonZeroU32; use std::time::Duration; @@ -551,31 +653,6 @@ mod tests { logctx.cleanup_successful(); } - #[tokio::test] - async fn explain_restart_list_by_serial() { - let logctx = dev::test_setup_log("explain_restart_list_by_serial"); - let db = TestDatabase::new_with_pool(&logctx.log).await; - let pool = db.pool(); - let conn = pool.claim().await.unwrap(); - - let query = DataStore::restart_list_by_serial_query(String::new()); - let explanation = query - .explain_async(&conn) - .await - .expect("Failed to explain query - is it valid SQL?"); - - eprintln!("{explanation}"); - - assert!( - !explanation.contains("FULL SCAN"), - "Found an unexpected FULL SCAN: {}", - explanation - ); - - db.terminate().await; - logctx.cleanup_successful(); - } - #[tokio::test] async fn explain_ereport_fetch_matching_default() { explain_fetch_matching_query( @@ -835,6 +912,98 @@ mod tests { logctx.cleanup_successful(); } + #[tokio::test] + async fn expectorate_ereports_insert_sp() { + let restart_id = EreporterRestartUuid::nil(); + let collector_id = OmicronZoneUuid::nil(); + let reporter = + fm::Reporter::Sp { sp_type: SpType::Sled.into(), slot: 16 }; + let query = DataStore::ereports_insert_query( + restart_id, + DateTime::::MIN_UTC, + reporter, + vec![ + Ereport { + restart_id: restart_id.into(), + ena: Ena(2).into(), + time_collected: DateTime::::MIN_UTC, + collector_id: collector_id.into(), + part_number: Some("my cool CPN".to_string()), + serial_number: Some("my cool serial".to_string()), + class: Some("my cool ereport".to_string()), + report: serde_json::json!({}), + marked_seen_in: None, + time_deleted: None, + reporter: reporter.into(), + }, + Ereport { + restart_id: restart_id.into(), + ena: Ena(3).into(), + time_collected: DateTime::::MIN_UTC, + collector_id: collector_id.into(), + part_number: Some("my cool CPN".to_string()), + serial_number: Some("my cool serial".to_string()), + class: Some("my other ereport".to_string()), + report: serde_json::json!({}), + marked_seen_in: None, + time_deleted: None, + reporter: reporter.into(), + }, + ], + ); + expectorate_query_contents( + &query, + "tests/output/ereports_insert_sp.sql", + ) + .await; + } + + #[tokio::test] + async fn expectorate_ereports_insert_host() { + let restart_id = EreporterRestartUuid::nil(); + let collector_id = OmicronZoneUuid::nil(); + let reporter = + fm::Reporter::HostOs { slot: Some(16), sled: SledUuid::nil() }; + let query = DataStore::ereports_insert_query( + restart_id, + DateTime::::MIN_UTC, + reporter, + vec![ + Ereport { + restart_id: restart_id.into(), + ena: Ena(2).into(), + time_collected: DateTime::::MIN_UTC, + collector_id: collector_id.into(), + part_number: Some("my cool CPN".to_string()), + serial_number: Some("my cool serial".to_string()), + class: Some("my cool ereport".to_string()), + report: serde_json::json!({}), + marked_seen_in: None, + time_deleted: None, + reporter: reporter.into(), + }, + Ereport { + restart_id: restart_id.into(), + ena: Ena(3).into(), + time_collected: DateTime::::MIN_UTC, + collector_id: collector_id.into(), + part_number: Some("my cool CPN".to_string()), + serial_number: Some("my cool serial".to_string()), + class: Some("my other ereport".to_string()), + report: serde_json::json!({}), + marked_seen_in: None, + time_deleted: None, + reporter: reporter.into(), + }, + ], + ); + expectorate_query_contents( + &query, + "tests/output/ereports_insert_host.sql", + ) + .await; + } + // This test tests that the `ereport_fetch_matching` queries succeed with // filters that only select ereports over a time range, and the default (no // filters). @@ -852,13 +1021,12 @@ mod tests { let db = TestDatabase::new_with_datastore(&logctx.log).await; let (opctx, datastore) = (db.opctx(), db.datastore()); - let id = fm::EreportId { - restart_id: EreporterRestartUuid::new_v4(), - ena: ereport_types::Ena(2), - }; + let restart_id = EreporterRestartUuid::new_v4(); + let id = fm::EreportId { restart_id, ena: ereport_types::Ena(2) }; + let time_collected = Utc::now(); let ereport = fm::EreportData { id, - time_collected: Utc::now(), + time_collected, collector_id: OmicronZoneUuid::new_v4(), part_number: Some("my cool CPN".to_string()), serial_number: Some("my cool serial".to_string()), @@ -868,6 +1036,8 @@ mod tests { datastore .ereports_insert( &opctx, + restart_id, + time_collected, fm::Reporter::Sp { sp_type: nexus_types::inventory::SpType::Sled, slot: 19, @@ -976,6 +1146,8 @@ mod tests { datastore .ereports_insert( &opctx, + restart_id, + Utc::now(), fm::Reporter::Sp { sp_type: nexus_types::inventory::SpType::Sled, slot: 0, @@ -1038,4 +1210,265 @@ mod tests { db.terminate().await; logctx.cleanup_successful(); } + + async fn fetch_restarts( + datastore: &DataStore, + opctx: &OpContext, + ) -> BTreeMap { + // use a very small batch size in tests so that we are also exercising + // pagination. + const BATCH_SIZE: NonZeroU32 = match NonZeroU32::new(2) { + Some(b) => b, + None => panic!("2 is nonzero"), + }; + + let mut restarts = BTreeMap::new(); + let mut paginator = + Paginator::new(BATCH_SIZE, dropshot::PaginationOrder::Ascending); + while let Some(p) = paginator.next() { + let batch = datastore + .ereporter_restart_list(opctx, &p.current_pagparams()) + .await + .expect("listing ereporter restarts should succeed"); + paginator = + p.found_batch(&batch[..], &|r| r.id.into_untyped_uuid()); + for r in batch { + let id = r.id.into(); + if let Some(_) = restarts.insert(id, r) { + unreachable!( + "duplicate ereporter restart ID {id}; this should be \ + impossible as it is the primary key" + ) + } + } + } + + restarts + } + + #[tokio::test] + async fn test_ereporter_restarts() { + fn ereport_data( + restart_id: EreporterRestartUuid, + ena: u64, + time_collected: DateTime, + ) -> fm::EreportData { + fm::EreportData { + id: fm::EreportId { restart_id, ena: Ena(ena) }, + time_collected, + collector_id: OmicronZoneUuid::new_v4(), + part_number: None, + serial_number: None, + class: None, + report: serde_json::json!({}), + } + } + + let logctx = dev::test_setup_log("test_ereporter_restarts"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + let t0 = Utc::now(); + + let timestamp = move |secs: usize| -> DateTime { + let t1 = omicron_common::timestamp_db_precision(t0); + let t2 = t1 + Duration::from_secs(secs as u64); + let t2 = omicron_common::timestamp_db_precision(t2); + assert_ne!( + t1, t2, + "two timestamps intended to be distinct would be equal after \ + truncating to database precision (advanced by {secs} seconds)" + ); + t2 + }; + // For host OS reporters, the sled's physical location may not be known + // immediately, so we may insert ereports with a NULL slot number. We + // wish to ensure that, if subsequent ereports are inserted for the same + // restart ID with a non-NULL slot number, the previous slot number is + // updated to the new value, but the first seen timestamp is not + // changed. + let host0_restart_id = EreporterRestartUuid::new_v4(); + let host0_first_seen = timestamp(100); + let host0_slot = 7; + let sled = SledUuid::new_v4(); + datastore + .ereports_insert( + opctx, + host0_restart_id, + host0_first_seen, + fm::Reporter::HostOs { sled, slot: None }, + vec![ereport_data(host0_restart_id, 1, host0_first_seen)], + ) + .await + .unwrap(); + { + let time_collected = timestamp(200); + datastore + .ereports_insert( + opctx, + host0_restart_id, + time_collected, + fm::Reporter::HostOs { sled, slot: Some(host0_slot) }, + vec![ + ereport_data(host0_restart_id, 2, time_collected), + ereport_data(host0_restart_id, 3, time_collected), + ], + ) + .await + .unwrap(); + } + + // Let's also test that if we insert ereports from a reporter with a + // non-NULL slot number, and then insert ereports from the same reporter + // with a NULL slot number, the original slot number is preserved. This + // shouldn't happen in practice, but we should make sure the behavior + // here is correct anyway. + let host1_restart_id = EreporterRestartUuid::new_v4(); + let host1_first_seen = timestamp(150); + let host1_slot = 3; + let sled = SledUuid::new_v4(); + datastore + .ereports_insert( + opctx, + host1_restart_id, + host1_first_seen, + fm::Reporter::HostOs { sled, slot: Some(host1_slot) }, + vec![ + ereport_data(host1_restart_id, 1, host1_first_seen), + ereport_data(host1_restart_id, 2, host1_first_seen), + ], + ) + .await + .unwrap(); + datastore + .ereports_insert( + opctx, + host1_restart_id, + timestamp(250), + fm::Reporter::HostOs { sled, slot: None }, + vec![ereport_data(host1_restart_id, 3, timestamp(250))], + ) + .await + .unwrap(); + + // Finally, let's do some SP ereports. These are mostly uncomplicated, + // since the slot number will always be present. + let sp0_restart_id0 = EreporterRestartUuid::new_v4(); + let sp0_first_seen = timestamp(300); + let sp0_slot = 1; + datastore + .ereports_insert( + opctx, + sp0_restart_id0, + sp0_first_seen, + fm::Reporter::Sp { + sp_type: SpType::Switch.into(), + slot: sp0_slot, + }, + vec![ + ereport_data(sp0_restart_id0, 1, sp0_first_seen), + ereport_data(sp0_restart_id0, 2, sp0_first_seen), + ], + ) + .await + .unwrap(); + { + let time_collected = timestamp(350); + datastore + .ereports_insert( + opctx, + sp0_restart_id0, + time_collected, + fm::Reporter::Sp { + sp_type: SpType::Switch.into(), + slot: sp0_slot, + }, + vec![ereport_data(sp0_restart_id0, 3, time_collected)], + ) + .await + .unwrap(); + } + // And a subsequent restart of the same SP slot. + let sp0_restart_id1 = EreporterRestartUuid::new_v4(); + let sp0_restart1_first_seen = timestamp(360); + datastore + .ereports_insert( + opctx, + sp0_restart_id1, + sp0_restart1_first_seen, + fm::Reporter::Sp { + sp_type: SpType::Switch.into(), + slot: sp0_slot, + }, + vec![ + ereport_data(sp0_restart_id1, 1, sp0_restart1_first_seen), + ereport_data(sp0_restart_id1, 2, sp0_restart1_first_seen), + ], + ) + .await + .unwrap(); + + let mut restarts = fetch_restarts(datastore, opctx).await; + + // host 0 restart 0 + let Some(host0) = restarts.remove(&host0_restart_id) else { + panic!( + "expected host 0 restart 0 ({host0_restart_id}) to be present" + ); + }; + assert_eq!(host0.reporter, EreporterType::Host); + assert_eq!(host0.slot_type, SpType::Sled); + assert_eq!( + host0.slot_number(), + Some(host0_slot), + "a NULL slot should be filled in once it is known", + ); + assert_eq!( + host0.time_first_seen, host0_first_seen, + "time_first_seen comes from the first insert and is not updated", + ); + + // host 1 restart 0 + let Some(host1) = restarts.remove(&host1_restart_id) else { + panic!( + "expected host 1 restart 0 ({host1_restart_id}) to be present" + ); + }; + assert_eq!(host1.reporter, EreporterType::Host); + assert_eq!(host1.slot_type, SpType::Sled); + assert_eq!( + host1.slot_number(), + Some(host1_slot), + "a NULL slot should be filled in once it is known", + ); + assert_eq!( + host1.time_first_seen, host1_first_seen, + "time_first_seen comes from the first insert and is not updated", + ); + + // SP 0 restart 0 + let Some(sp0_restart_0) = restarts.remove(&sp0_restart_id0) else { + panic!("expected sp 0 restart 0 ({sp0_restart_id0}) to be present"); + }; + assert_eq!(sp0_restart_0.reporter, EreporterType::Sp); + assert_eq!(sp0_restart_0.slot_type, SpType::Switch); + assert_eq!(sp0_restart_0.slot_number(), Some(sp0_slot)); + assert_eq!(sp0_restart_0.time_first_seen, sp0_first_seen); + + // SP 0 restart 1 + let Some(sp0_restart_1) = restarts.remove(&sp0_restart_id1) else { + panic!("expected sp 0 restart 1 ({sp0_restart_id1}) to be present"); + }; + assert_eq!(sp0_restart_1.reporter, EreporterType::Sp); + assert_eq!(sp0_restart_1.slot_type, SpType::Switch); + assert_eq!(sp0_restart_1.slot_number(), Some(1)); + assert_eq!(sp0_restart_1.time_first_seen, sp0_restart1_first_seen); + + assert!( + restarts.is_empty(), + "unexpected restart entries remaining: {restarts:?}" + ); + + db.terminate().await; + logctx.cleanup_successful(); + } } diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index 905b359ce6a..8aac68d8d44 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -2227,10 +2227,11 @@ mod tests { // we'll make those here first. let restart_id = omicron_uuid_kinds::EreporterRestartUuid::new_v4(); let collector_id = OmicronZoneUuid::new_v4(); + let time_collected = Utc::now(); let ereport1 = EreportData { id: fm::EreportId { restart_id, ena: ereport_types::Ena(2) }, - time_collected: Utc::now(), + time_collected, collector_id, part_number: Some("930-55555".to_string()), serial_number: Some("BRM6900420".to_string()), @@ -2240,7 +2241,7 @@ mod tests { let ereport2 = EreportData { id: fm::EreportId { restart_id, ena: ereport_types::Ena(3) }, - time_collected: Utc::now(), + time_collected, collector_id, part_number: Some("930-55555".to_string()), serial_number: Some("BRM6900420".to_string()), @@ -2257,6 +2258,8 @@ mod tests { datastore .ereports_insert( &opctx, + restart_id, + time_collected, reporter, vec![ereport1.clone(), ereport2.clone()], ) @@ -2459,6 +2462,7 @@ mod tests { let input_report = InputReport { parent_sitrep_id: sitrep.parent_id(), parent_inv_id: None, + num_ereporter_restarts: 0, inv_id: sitrep.inv_id(), new_ereport_ids: Default::default(), open_cases: Default::default(), @@ -3080,6 +3084,7 @@ mod tests { new_ereport_ids: Default::default(), open_cases: Default::default(), closed_cases_copied_forward: Default::default(), + num_ereporter_restarts: 0, in_service_disks: Default::default(), }; let analysis_report = AnalysisReport { diff --git a/nexus/db-queries/tests/output/ereports_insert_host.sql b/nexus/db-queries/tests/output/ereports_insert_host.sql new file mode 100644 index 00000000000..545818ab538 --- /dev/null +++ b/nexus/db-queries/tests/output/ereports_insert_host.sql @@ -0,0 +1,50 @@ +WITH + inserted_ereports + AS ( + INSERT + INTO + ereport + ( + restart_id, + ena, + time_deleted, + time_collected, + collector_id, + part_number, + serial_number, + class, + report, + reporter, + sled_id, + slot_type, + slot, + marked_seen_in + ) + VALUES + ($1, $2, DEFAULT, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, DEFAULT), + ($13, $14, DEFAULT, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, DEFAULT) + ON CONFLICT + (restart_id, ena) + DO + NOTHING + RETURNING + ereport.ena + ), + inserted_reporter + AS ( + INSERT + INTO + ereporter_restart (id, time_first_seen, reporter, slot_type, slot) + VALUES + ($25, $26, $27, $28, $29) + ON CONFLICT + (id) + DO + UPDATE SET slot = $30 + RETURNING + id + ) +SELECT + count(*) +FROM + inserted_ereports diff --git a/nexus/db-queries/tests/output/ereports_insert_sp.sql b/nexus/db-queries/tests/output/ereports_insert_sp.sql new file mode 100644 index 00000000000..5c7e4a64d98 --- /dev/null +++ b/nexus/db-queries/tests/output/ereports_insert_sp.sql @@ -0,0 +1,50 @@ +WITH + inserted_ereports + AS ( + INSERT + INTO + ereport + ( + restart_id, + ena, + time_deleted, + time_collected, + collector_id, + part_number, + serial_number, + class, + report, + reporter, + sled_id, + slot_type, + slot, + marked_seen_in + ) + VALUES + ($1, $2, DEFAULT, $3, $4, $5, $6, $7, $8, $9, DEFAULT, $10, $11, DEFAULT), + ($12, $13, DEFAULT, $14, $15, $16, $17, $18, $19, $20, DEFAULT, $21, $22, DEFAULT) + ON CONFLICT + (restart_id, ena) + DO + NOTHING + RETURNING + ereport.ena + ), + inserted_reporter + AS ( + INSERT + INTO + ereporter_restart (id, time_first_seen, reporter, slot_type, slot) + VALUES + ($23, $24, $25, $26, $27) + ON CONFLICT + (id) + DO + UPDATE SET slot = $28 + RETURNING + id + ) +SELECT + count(*) +FROM + inserted_ereports diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 34069389338..0396bd83c82 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -3005,6 +3005,16 @@ table! { } } +table! { + ereporter_restart (id) { + id -> Uuid, + time_first_seen -> Timestamptz, + reporter -> crate::enums::EreporterTypeEnum, + slot_type -> crate::enums::SpTypeEnum, + slot -> Nullable, + } +} + table! { user_data_export (id) { id -> Uuid, diff --git a/nexus/fm/Cargo.toml b/nexus/fm/Cargo.toml index 68d50bc77d0..0defe566368 100644 --- a/nexus/fm/Cargo.toml +++ b/nexus/fm/Cargo.toml @@ -13,13 +13,19 @@ testing = [ "ereport-types", ] +[build-dependencies] +omicron-rpaths.workspace = true + [dependencies] anyhow.workspace = true chrono.workspace = true iddqd.workspace = true nexus-types.workspace = true +nexus-db-model.workspace = true omicron-common.workspace = true omicron-uuid-kinds.workspace = true +# See omicron-rpaths for more about the "pq-sys" dependency. +pq-sys = "*" rand.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/nexus/fm/build.rs b/nexus/fm/build.rs new file mode 100644 index 00000000000..dbdb6a3c8b0 --- /dev/null +++ b/nexus/fm/build.rs @@ -0,0 +1,9 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +fn main() { + // See omicron-rpaths for documentation. NOTE: This file MUST be kept in + // sync with the other build.rs files in this repository. + omicron_rpaths::configure_default_omicron_rpaths(); +} diff --git a/nexus/fm/src/analysis_input.rs b/nexus/fm/src/analysis_input.rs index 008c868d792..8db94e48030 100644 --- a/nexus/fm/src/analysis_input.rs +++ b/nexus/fm/src/analysis_input.rs @@ -6,6 +6,7 @@ use chrono::{DateTime, Utc}; use iddqd::IdOrdMap; +use nexus_db_model::EreporterRestart; use nexus_types::fm::analysis_reports::ClosedCaseReport; use nexus_types::fm::{self, Sitrep, SitrepVersion}; use nexus_types::in_service_disk::InServiceDisk; @@ -43,6 +44,7 @@ pub struct Input { new_ereports: IdOrdMap, open_cases: IdOrdMap, closed_cases_copied_forward: IdOrdMap, + ereporter_restarts: IdOrdMap, /// Indicates whether `Builder::build` dropped any closed case /// from the carry-forward list whose alert request set was non-empty. /// ORed with [`crate::builder::AllCases::alert_set_changed`] in @@ -72,6 +74,10 @@ impl Input { &self.open_cases } + pub fn ereporter_restarts(&self) -> &IdOrdMap { + &self.ereporter_restarts + } + pub(crate) fn closed_cases_copied_forward(&self) -> &IdOrdMap { &self.closed_cases_copied_forward } @@ -118,6 +124,7 @@ impl Input { inv, in_service_disks, new_ereports: IdOrdMap::default(), + ereporter_restarts: IdOrdMap::default(), unmarked_seen_ereports: BTreeSet::default(), marked_alert_requests: HashSet::new(), }) @@ -153,6 +160,7 @@ pub struct Builder { /// copied forwards due to containing unmarked ereports. unmarked_seen_ereports: BTreeSet, + ereporter_restarts: IdOrdMap, /// The IDs of alert requests on the parent sitrep's closed cases that /// already have a marker row in `rendezvous_alert_created`. A closed-case /// request absent from this set is outstanding work, and (like an unmarked @@ -207,6 +215,19 @@ impl Builder { self.new_ereports.len() } + /// Adds a set of ereport restart IDs to the input. + pub fn add_ereporter_restarts( + &mut self, + restarts: impl IntoIterator, + ) { + self.ereporter_restarts.extend(restarts) + } + + /// Borrows the map of known ereport reporter restart IDs. + pub fn ereporter_restarts(&self) -> &IdOrdMap { + &self.ereporter_restarts + } + /// Finish constructing the [`Input`] and return it, along with a [`Report`] /// that provides a human-readable summary of how the inputs were /// constructed. @@ -230,6 +251,7 @@ impl Builder { .iter() .map(|e| *e.id()) .collect(), + num_ereporter_restarts: self.ereporter_restarts.len(), open_cases: BTreeMap::new(), closed_cases_copied_forward: BTreeMap::new(), in_service_disks: self @@ -308,6 +330,7 @@ impl Builder { new_ereports: self.new_ereports, open_cases, closed_cases_copied_forward, + ereporter_restarts: self.ereporter_restarts, alerts_changed, in_service_disks: self.in_service_disks, }; diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index a853d58d9b1..afd800ca9cf 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -291,7 +291,13 @@ impl Ingester { }); let created = match self .datastore - .ereports_insert(&opctx, reporter, db_ereports) + .ereports_insert( + &opctx, + restart_id, + time_collected, + reporter, + db_ereports, + ) .await { Ok((created, latest)) => { diff --git a/nexus/src/app/background/tasks/fm_analysis.rs b/nexus/src/app/background/tasks/fm_analysis.rs index c86a485eb29..0f4ad5b88bb 100644 --- a/nexus/src/app/background/tasks/fm_analysis.rs +++ b/nexus/src/app/background/tasks/fm_analysis.rs @@ -245,6 +245,9 @@ impl FmAnalysis { inv, in_service_disks, )?; + self.load_ereporter_restarts(opctx, &mut builder) + .await + .context("failed to load ereporter restarts")?; self.load_new_ereports(opctx, &mut builder, &mut warnings) .await .context("failed to load new ereports")?; @@ -321,6 +324,35 @@ impl FmAnalysis { Ok(in_service_disks) } + async fn load_ereporter_restarts( + &mut self, + opctx: &OpContext, + builder: &mut fm::analysis_input::Builder, + ) -> anyhow::Result<()> { + let mut nbatches = 0; + let mut paginator = Paginator::new( + nexus_db_queries::db::datastore::SQL_BATCH_SIZE, + dropshot::PaginationOrder::Ascending, + ); + while let Some(p) = paginator.next() { + nbatches += 1; + let batch = self + .datastore + .ereporter_restart_list(opctx, &p.current_pagparams()) + .await?; + paginator = p.found_batch(&batch, &|e| e.id().into_untyped_uuid()); + builder.add_ereporter_restarts(batch); + } + + slog::debug!( + opctx.log, + "loaded {} ereporter restarts (in {nbatches} batches)", + builder.ereporter_restarts().len(), + ); + + Ok(()) + } + async fn load_new_ereports( &mut self, opctx: &OpContext, @@ -343,21 +375,31 @@ impl FmAnalysis { (e.restart_id.into_untyped_uuid(), e.ena) }); let loaded = batch.len(); + let mut invalid = 0; - builder.add_unmarked_ereports(batch.into_iter().filter_map( - |ereport| { - let ereport = match fm::Ereport::try_from(ereport) { - Ok(ereport) => ereport, - Err(e) => { - invalid += 1; - warnings.push(e.to_string()); - return None; - } - }; - - Some(ereport) - }, - )); + for ereport in batch { + let ereport = match fm::Ereport::try_from(ereport) { + Ok(ereport) => ereport, + Err(e) => { + invalid += 1; + warnings.push(e.to_string()); + continue; + } + }; + + // Check if this is a reporter we know about, and issue a + // warning if it is not. + let id = ereport.id(); + if !builder.ereporter_restarts().contains_key(&id.restart_id) { + let msg = format!( + "ereport {id} has a restart ID not contained in the \ + `ereporter_restart` table" + ); + slog::warn!(&opctx.log, "{msg}"); + warnings.push(msg); + } + builder.add_unmarked_ereports(std::iter::once(ereport)); + } let total = builder.num_ereports(); let new = total - prev_total; diff --git a/nexus/src/app/background/tasks/fm_rendezvous.rs b/nexus/src/app/background/tasks/fm_rendezvous.rs index 856f796ae9e..21cfbc56384 100644 --- a/nexus/src/app/background/tasks/fm_rendezvous.rs +++ b/nexus/src/app/background/tasks/fm_rendezvous.rs @@ -1046,6 +1046,7 @@ mod tests { let restart_id = EreporterRestartUuid::new_v4(); let collector_id = OmicronZoneUuid::new_v4(); + let time_collected = Utc::now(); let reporter = Reporter::Sp { sp_type: nexus_types::inventory::SpType::Sled, slot: 0, @@ -1056,7 +1057,7 @@ mod tests { restart_id, ena: ereport_types::Ena(2), }, - time_collected: Utc::now(), + time_collected, collector_id, part_number: Some("9130000019".to_string()), serial_number: Some("BRM420069".to_string()), @@ -1068,7 +1069,7 @@ mod tests { restart_id, ena: ereport_types::Ena(3), }, - time_collected: Utc::now(), + time_collected, collector_id, part_number: Some("9130000019".to_string()), serial_number: Some("BRM420069".to_string()), @@ -1080,7 +1081,7 @@ mod tests { restart_id, ena: ereport_types::Ena(4), }, - time_collected: Utc::now(), + time_collected, collector_id, part_number: Some("9130000019".to_string()), serial_number: Some("BRM420069".to_string()), @@ -1089,7 +1090,9 @@ mod tests { }; datastore .ereports_insert( - opctx, + &opctx, + restart_id, + time_collected, reporter, vec![ ereport1_data.clone(), @@ -1285,13 +1288,13 @@ mod tests { sp_type: nexus_types::inventory::SpType::Sled, slot: 1, }; - + let time_collected = Utc::now(); let ereport1_data = EreportData { id: ereport_types::EreportId { restart_id, ena: ereport_types::Ena(2), }, - time_collected: Utc::now(), + time_collected, collector_id, part_number: Some("9130000019".to_string()), serial_number: Some("BRM420069".to_string()), @@ -1303,7 +1306,7 @@ mod tests { restart_id, ena: ereport_types::Ena(3), }, - time_collected: Utc::now(), + time_collected, collector_id, part_number: Some("9130000019".to_string()), serial_number: Some("BRM420069".to_string()), @@ -1315,7 +1318,7 @@ mod tests { restart_id, ena: ereport_types::Ena(4), }, - time_collected: Utc::now(), + time_collected, collector_id, part_number: Some("9130000019".to_string()), serial_number: Some("BRM420069".to_string()), @@ -1325,6 +1328,8 @@ mod tests { datastore .ereports_insert( opctx, + restart_id, + time_collected, reporter, vec![ ereport1_data.clone(), diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 467b7a6255d..823ae4f0b04 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -861,10 +861,11 @@ mod test { const GIMLET_PN: &str = "9130000019"; // Make some SP ereports... let sp_restart_id = EreporterRestartUuid::new_v4(); - datastore.ereports_insert(&opctx, Reporter::Sp { sp_type: SpType::Sled, slot: SLED_SLOT}, vec![ + let time_collected = chrono::Utc::now(); + datastore.ereports_insert(&opctx, sp_restart_id, time_collected, Reporter::Sp { sp_type: SpType::Sled, slot: SLED_SLOT}, vec![ EreportData { id: EreportId { restart_id: sp_restart_id, ena: ereport_types::Ena(1) }, - time_collected: chrono::Utc::now(), + time_collected, collector_id: OmicronZoneUuid::new_v4(), part_number: Some(GIMLET_PN.to_string()), serial_number: Some(SP_SERIAL.to_string()), @@ -873,7 +874,7 @@ mod test { }, EreportData { id: EreportId { restart_id: sp_restart_id, ena: ereport_types::Ena(2) }, - time_collected: chrono::Utc::now(), + time_collected, collector_id: OmicronZoneUuid::new_v4(), part_number: Some(GIMLET_PN.to_string()), serial_number: Some(SP_SERIAL.to_string()), @@ -882,7 +883,7 @@ mod test { }, EreportData { id: EreportId { restart_id: EreporterRestartUuid::new_v4(), ena: ereport_types::Ena(1) }, - time_collected: chrono::Utc::now(), + time_collected, collector_id: OmicronZoneUuid::new_v4(), // Let's do a silly one! No VPD, to make sure that's also // handled correctly. @@ -895,16 +896,20 @@ mod test { // And one from a different serial. N.B. that I made sure the number of // host-OS and SP ereports are different for when we make assertions // about the bundle report. + let sp2_restart_id = EreporterRestartUuid::new_v4(); + let time_collected = chrono::Utc::now(); datastore .ereports_insert( &opctx, + sp2_restart_id, + time_collected, Reporter::Sp { sp_type: SpType::Switch, slot: 1 }, vec![EreportData { id: EreportId { - restart_id: EreporterRestartUuid::new_v4(), + restart_id: sp2_restart_id, ena: ereport_types::Ena(1), }, - time_collected: chrono::Utc::now(), + time_collected, collector_id: OmicronZoneUuid::new_v4(), part_number: Some("9130000006".to_string()), serial_number: Some("BRM41000555".to_string()), @@ -915,10 +920,13 @@ mod test { .await .expect("failed to insert another fake SP ereport"); // And some host OS ones... - let restart_id = EreporterRestartUuid::new_v4(); + let sled1_restart_id = EreporterRestartUuid::new_v4(); + let time_collected = chrono::Utc::now(); datastore .ereports_insert( &opctx, + sled1_restart_id, + time_collected, Reporter::HostOs { sled: SledUuid::new_v4(), slot: Some(SLED_SLOT), @@ -926,10 +934,10 @@ mod test { vec![ EreportData { id: EreportId { - restart_id, + restart_id: sled1_restart_id, ena: ereport_types::Ena(1), }, - time_collected: chrono::Utc::now(), + time_collected, collector_id: OmicronZoneUuid::new_v4(), serial_number: Some(HOST_SERIAL.to_string()), part_number: Some(GIMLET_PN.to_string()), @@ -938,10 +946,10 @@ mod test { }, EreportData { id: EreportId { - restart_id, + restart_id: sled1_restart_id, ena: ereport_types::Ena(2), }, - time_collected: chrono::Utc::now(), + time_collected, collector_id: OmicronZoneUuid::new_v4(), serial_number: Some(HOST_SERIAL.to_string()), part_number: Some(GIMLET_PN.to_string()), @@ -952,14 +960,18 @@ mod test { ) .await .expect("failed to insert fake host OS ereports"); + let sled2_restart_id = EreporterRestartUuid::new_v4(); + let time_collected = chrono::Utc::now(); datastore .ereports_insert( &opctx, + sled2_restart_id, + time_collected, Reporter::HostOs { sled: SledUuid::new_v4(), slot: Some(SLED_SLOT) }, vec![ EreportData { - id: EreportId { restart_id: EreporterRestartUuid::new_v4(), ena: ereport_types::Ena(1) }, - time_collected: chrono::Utc::now(), + id: EreportId { restart_id: sled2_restart_id, ena: ereport_types::Ena(1) }, + time_collected, collector_id: OmicronZoneUuid::new_v4(), serial_number: Some(HOST_SERIAL.to_string()), part_number: Some(GIMLET_PN.to_string()), diff --git a/nexus/tests/integration_tests/data_migrations/ereporter_restart_order_v2.rs b/nexus/tests/integration_tests/data_migrations/ereporter_restart_order_v2.rs new file mode 100644 index 00000000000..b3a731a8f54 --- /dev/null +++ b/nexus/tests/integration_tests/data_migrations/ereporter_restart_order_v2.rs @@ -0,0 +1,275 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::super::schema::{DataMigrationFns, MigrationContext}; +use chrono::{DateTime, Utc}; +use futures::future::BoxFuture; +use pretty_assertions::assert_eq; +use std::collections::HashMap; +use uuid::Uuid; + +const COLLECTOR: Uuid = Uuid::from_u128(0xeb020000_0000_0000_0000_000000000001); + +// A sled SP restart ID with a single ereport. +const SP_SLED: Uuid = Uuid::from_u128(0xeb020001_0000_0000_0000_000000000001); +// A switch SP reporter with multiple ereports at different times. +const SP_SWITCH: Uuid = Uuid::from_u128(0xeb020001_0000_0000_0000_000000000002); +// A host OS reporter whose slot is NULL in its earliest ereports and then +// resolved to a non-NULL value. The non-NULL slot should be recovered, and +// `time_first_seen` should still come from the earliest (NULL-slot) ereport. +const HOST_NULL_THEN_SLOT: Uuid = + Uuid::from_u128(0xeb020001_0000_0000_0000_000000000003); +// A host OS reporter whose slot number is NULL in *every* ereport. A row for +// this restart ID should still created, with a NULL slot. +const HOST_SLOT_ALL_NULL: Uuid = + Uuid::from_u128(0xeb020001_0000_0000_0000_000000000004); +// A host OS reporter whose slot is known in its earliest ereport and then NULL +// in a later one. The known slot must not be clobbered back to NULL. +const HOST_SLOT_THEN_NULL: Uuid = + Uuid::from_u128(0xeb020001_0000_0000_0000_000000000005); +// A host OS reporter whose only ereport is soft-deleted. +const HOST_ONLY_DELETED: Uuid = + Uuid::from_u128(0xeb020001_0000_0000_0000_000000000006); +// A host OS reporter with a mix of soft-deleted and live ereports, where the +// *earliest* ereport is soft-deleted. The restart ID is included, and +// `time_first_seen` must come from that earliest (soft-deleted) ereport. +const HOST_SOME_DELETED: Uuid = + Uuid::from_u128(0xeb020001_0000_0000_0000_000000000007); + +// Sled IDs for the host OS reporters (host ereports require a non-NULL +// `sled_id`, which is otherwise irrelevant to the test). +const SLED_A: Uuid = Uuid::from_u128(0xeb020002_0000_0000_0000_000000000001); +const SLED_B: Uuid = Uuid::from_u128(0xeb020002_0000_0000_0000_000000000002); +const SLED_C: Uuid = Uuid::from_u128(0xeb020002_0000_0000_0000_000000000003); +const SLED_D: Uuid = Uuid::from_u128(0xeb020002_0000_0000_0000_000000000004); +const SLED_E: Uuid = Uuid::from_u128(0xeb020002_0000_0000_0000_000000000005); + +// Some hard-coded timestamps that will parse as both `TIMESTAMPTZ` literals and +// via `str::parse::>`. +const T_05: &str = "2024-01-01T00:00:05Z"; +const T_10: &str = "2024-01-01T00:00:10Z"; +const T_15: &str = "2024-01-01T00:00:15Z"; +const T_20: &str = "2024-01-01T00:00:20Z"; +const T_25: &str = "2024-01-01T00:00:25Z"; +const T_30: &str = "2024-01-01T00:00:30Z"; +const T_40: &str = "2024-01-01T00:00:40Z"; +const T_50: &str = "2024-01-01T00:00:50Z"; +const T_60: &str = "2024-01-01T00:01:00Z"; +const T_70: &str = "2024-01-01T00:01:10Z"; +const T_120: &str = "2024-01-01T00:02:00Z"; +const T_180: &str = "2024-01-01T00:03:00Z"; +const T_240: &str = "2024-01-01T00:04:00Z"; +const T_270: &str = "2024-01-01T00:04:30Z"; +const T_300: &str = "2024-01-01T00:05:00Z"; + +fn ts(s: &str) -> DateTime { + s.parse().expect("test timestamp should be valid RFC 3339") +} + +pub(crate) fn checks() -> DataMigrationFns { + DataMigrationFns::new().before(before).after(after) +} + +fn before<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { + Box::pin(async move { + ctx.client + .batch_execute(&format!( + " + INSERT INTO omicron.public.ereport ( + restart_id, ena, time_collected, collector_id, report, + reporter, slot_type, slot, sled_id, time_deleted + ) VALUES + ('{SP_SLED}', 1, '{T_10}', '{COLLECTOR}', '{{}}', + 'sp', 'sled', 5, NULL, NULL), + + ('{SP_SWITCH}', 1, '{T_30}', '{COLLECTOR}', '{{}}', + 'sp', 'switch', 1, NULL, NULL), + ('{SP_SWITCH}', 2, '{T_10}', '{COLLECTOR}', '{{}}', + 'sp', 'switch', 1, NULL, NULL), + ('{SP_SWITCH}', 3, '{T_20}', '{COLLECTOR}', '{{}}', + 'sp', 'switch', 1, NULL, NULL), + + ('{HOST_NULL_THEN_SLOT}', 1, '{T_05}', '{COLLECTOR}', '{{}}', + 'host', 'sled', NULL, '{SLED_A}', NULL), + ('{HOST_NULL_THEN_SLOT}', 2, '{T_15}', '{COLLECTOR}', '{{}}', + 'host', 'sled', NULL, '{SLED_A}', NULL), + ('{HOST_NULL_THEN_SLOT}', 3, '{T_25}', '{COLLECTOR}', '{{}}', + 'host', 'sled', 7, '{SLED_A}', NULL), + + ('{HOST_SLOT_ALL_NULL}', 1, '{T_40}', '{COLLECTOR}', '{{}}', + 'host', 'sled', NULL, '{SLED_B}', NULL), + ('{HOST_SLOT_ALL_NULL}', 2, '{T_50}', '{COLLECTOR}', '{{}}', + 'host', 'sled', NULL, '{SLED_B}', NULL), + + ('{HOST_SLOT_THEN_NULL}', 1, '{T_60}', '{COLLECTOR}', '{{}}', + 'host', 'sled', 3, '{SLED_C}', NULL), + ('{HOST_SLOT_THEN_NULL}', 2, '{T_70}', '{COLLECTOR}', '{{}}', + 'host', 'sled', NULL, '{SLED_C}', NULL), + + ('{HOST_ONLY_DELETED}', 1, '{T_120}', '{COLLECTOR}', '{{}}', + 'host', 'sled', 4, '{SLED_D}', '{T_180}'), + + ('{HOST_SOME_DELETED}', 1, '{T_240}', '{COLLECTOR}', '{{}}', + 'host', 'sled', NULL, '{SLED_E}', '{T_300}'), + ('{HOST_SOME_DELETED}', 2, '{T_270}', '{COLLECTOR}', '{{}}', + 'host', 'sled', 8, '{SLED_E}', NULL); + " + )) + .await + .expect("failed to insert test ereports"); + }) +} + +fn after<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { + Box::pin(async move { + let rows = ctx + .client + .query( + &format!( + "SELECT id, reporter::text, slot_type::text, slot, + time_first_seen + FROM omicron.public.ereporter_restart + WHERE id IN ( + '{SP_SLED}', '{SP_SWITCH}', '{HOST_NULL_THEN_SLOT}', + '{HOST_SLOT_ALL_NULL}', '{HOST_SLOT_THEN_NULL}', + '{HOST_ONLY_DELETED}', '{HOST_SOME_DELETED}' + )" + ), + &[], + ) + .await + .expect("failed to query backfilled ereporter_restart rows"); + + assert_eq!(rows.len(), 7, "expected one row per restart ID"); + + let results: HashMap< + Uuid, + (String, String, Option, DateTime), + > = rows + .iter() + .map(|row| { + let id: Uuid = row.get("id"); + let reporter: String = row.get("reporter"); + let slot_type: String = row.get("slot_type"); + let slot: Option = row.get("slot"); + let time_first_seen: DateTime = row.get("time_first_seen"); + (id, (reporter, slot_type, slot, time_first_seen)) + }) + .collect(); + + // SP sled: reports its slot and (only) collection time. + let (reporter, slot_type, slot, first_seen) = &results[&SP_SLED]; + assert_eq!(reporter, "sp"); + assert_eq!(slot_type, "sled"); + assert_eq!(*slot, Some(5)); + assert_eq!(*first_seen, ts(T_10)); + + // SP switch: time_first_seen is the earliest of the three ereports. + let (reporter, slot_type, slot, first_seen) = &results[&SP_SWITCH]; + assert_eq!(reporter, "sp"); + assert_eq!(slot_type, "switch",); + assert_eq!(*slot, Some(1)); + assert_eq!( + *first_seen, + ts(T_10), + "time_first_seen should be the earliest time_collected of \ + multiple ereports" + ); + + // Host, slot number NULL then resolved. The slot should be set to the + // value from the non-NULL value, while time_first_seen comes from the + // earliest (NULL-slot) ereport. + let (reporter, slot_type, slot, first_seen) = + &results[&HOST_NULL_THEN_SLOT]; + assert_eq!(reporter, "host"); + assert_eq!(slot_type, "sled"); + assert_eq!( + *slot, + Some(7), + "a host's NULL slot should be backfilled from a later non-NULL one" + ); + assert_eq!( + *first_seen, + ts(T_05), + "time_first_seen should span the NULL-slot ereports" + ); + + // Host, all NULL: a row is created with a NULL slot. + let (reporter, slot_type, slot, first_seen) = + &results[&HOST_SLOT_ALL_NULL]; + assert_eq!(reporter, "host"); + assert_eq!(slot_type, "sled"); + assert_eq!( + *slot, None, + "a host reporter with no known slot should have a NULL slot" + ); + assert_eq!(*first_seen, ts(T_40), "all-NULL host time_first_seen"); + + // Host, slot known then NULL: the known slot is not clobbered. This is + // an edge case that shouldn't occur in production but which is + // nonetheless worth checking. + let (reporter, slot_type, slot, first_seen) = + &results[&HOST_SLOT_THEN_NULL]; + assert_eq!(reporter, "host"); + assert_eq!(slot_type, "sled"); + assert_eq!( + *slot, + Some(3), + "a known slot must not be clobbered by a later NULL one" + ); + assert_eq!( + *first_seen, + ts(T_60), + "known-then-NULL host time_first_seen" + ); + + // All ereports soft-deleted: there should still be an entry for the + // reporter. + let (reporter, slot_type, slot, first_seen) = + &results[&HOST_ONLY_DELETED]; + assert_eq!(reporter, "host",); + assert_eq!(slot_type, "sled"); + assert_eq!(*slot, Some(4)); + assert_eq!( + *first_seen, + ts(T_120), + "time_first_seen should be from the soft-deleted ereport" + ); + + // Mixed deleted/live: included, with the slot recovered from the live + // ereport (8) and time_first_seen from the earliest ereport, even + // though that ereport is soft-deleted. + let (reporter, slot_type, slot, first_seen) = + &results[&HOST_SOME_DELETED]; + assert_eq!(reporter, "host",); + assert_eq!(slot_type, "sled"); + assert_eq!(*slot, Some(8)); + assert_eq!( + *first_seen, + ts(T_240), + "time_first_seen should span the soft-deleted earliest ereport" + ); + + // Clean up test data. + ctx.client + .batch_execute(&format!( + " + DELETE FROM omicron.public.ereporter_restart + WHERE id IN ( + '{SP_SLED}', '{SP_SWITCH}', '{HOST_NULL_THEN_SLOT}', + '{HOST_SLOT_ALL_NULL}', '{HOST_SLOT_THEN_NULL}', + '{HOST_ONLY_DELETED}', '{HOST_SOME_DELETED}' + ); + DELETE FROM omicron.public.ereport + WHERE restart_id IN ( + '{SP_SLED}', '{SP_SWITCH}', '{HOST_NULL_THEN_SLOT}', + '{HOST_SLOT_ALL_NULL}', '{HOST_SLOT_THEN_NULL}', + '{HOST_ONLY_DELETED}', '{HOST_SOME_DELETED}' + ); + " + )) + .await + .expect("failed to clean up ereporter-restart-order-v2 test data"); + }) +} diff --git a/nexus/tests/integration_tests/data_migrations/mod.rs b/nexus/tests/integration_tests/data_migrations/mod.rs index 3f8e9ae56f8..6f4f663e2fc 100644 --- a/nexus/tests/integration_tests/data_migrations/mod.rs +++ b/nexus/tests/integration_tests/data_migrations/mod.rs @@ -36,6 +36,7 @@ mod disk_types; mod drop_uninitialized_svc_enabled_not_online_state; mod ereport_everyone_gets_a_slot; mod ereport_trim_serial_trailing_nulls; +mod ereporter_restart_order_v2; mod fix_leaked_bp_oximeter_read_policy_rows; mod fix_session_token_column_order; mod inv_clear_mupdate_override; @@ -94,6 +95,7 @@ pub(crate) fn get_migration_checks() -> BTreeMap { register!(bgp_unnumbered_peer_cleanup); register!(ereport_trim_serial_trailing_nulls); register!(sled_resource_vmm_state); + register!(ereporter_restart_order_v2); map } diff --git a/nexus/types/output/analysis_input_report_empty.out b/nexus/types/output/analysis_input_report_empty.out index 491434e16f6..4dbc6273abf 100644 --- a/nexus/types/output/analysis_input_report_empty.out +++ b/nexus/types/output/analysis_input_report_empty.out @@ -1,5 +1,6 @@ parent sitrep: inventory collection: bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb +total known ereport restart IDs: 0 no new ereports since the parent sitrep no cases copied forward diff --git a/nexus/types/output/analysis_input_report_same_inv.out b/nexus/types/output/analysis_input_report_same_inv.out index 9cd8ff958d6..a99f292c1e7 100644 --- a/nexus/types/output/analysis_input_report_same_inv.out +++ b/nexus/types/output/analysis_input_report_same_inv.out @@ -1,6 +1,7 @@ parent sitrep: aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa inventory collection: bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb --> same collection as parent sitrep +total known ereport restart IDs: 420 no new ereports since the parent sitrep no cases copied forward diff --git a/nexus/types/output/analysis_input_report_with_cases.out b/nexus/types/output/analysis_input_report_with_cases.out index e989fcf06db..811c3b271ed 100644 --- a/nexus/types/output/analysis_input_report_with_cases.out +++ b/nexus/types/output/analysis_input_report_with_cases.out @@ -1,6 +1,7 @@ parent sitrep: aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa inventory collection: bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb --> different from parent sitrep (collection cccccccc-cccc-cccc-cccc-cccccccccccc) +total known ereport restart IDs: 420 new ereports (2 total): * ereport dddddddd-dddd-dddd-dddd-dddddddddddd:3 diff --git a/nexus/types/src/fm/analysis_reports.rs b/nexus/types/src/fm/analysis_reports.rs index 8793b613bd5..4ff44798149 100644 --- a/nexus/types/src/fm/analysis_reports.rs +++ b/nexus/types/src/fm/analysis_reports.rs @@ -228,6 +228,8 @@ pub struct InputReport { /// Cases which have closed, but which have been copied forwards as they /// contain ereports which have not yet been marked seen. pub closed_cases_copied_forward: BTreeMap, + /// Number of entries in the ereporter restart table. + pub num_ereporter_restarts: usize, /// All control-plane-managed physical disks visible to the diagnosis /// engines for this analysis pass. pub in_service_disks: BTreeSet, @@ -262,6 +264,7 @@ impl fmt::Display for InputReportMultilineDisplay<'_> { new_ereport_ids, open_cases, closed_cases_copied_forward, + num_ereporter_restarts, in_service_disks, }, indent, @@ -285,6 +288,13 @@ impl fmt::Display for InputReportMultilineDisplay<'_> { )?; } + writeln!( + f, + "{:indent$}total known ereport restart IDs: \ + {num_ereporter_restarts}", + "", + )?; + if !new_ereport_ids.is_empty() { writeln!( f, @@ -531,6 +541,7 @@ mod tests { parent_sitrep_id: Some(parent_sitrep_id), parent_inv_id: Some(parent_inv_id), inv_id, + num_ereporter_restarts: 420, new_ereport_ids, open_cases, closed_cases_copied_forward, @@ -547,6 +558,7 @@ mod tests { parent_sitrep_id: None, parent_inv_id: None, inv_id, + num_ereporter_restarts: 0, new_ereport_ids: BTreeSet::new(), open_cases: BTreeMap::new(), closed_cases_copied_forward: BTreeMap::new(), @@ -566,6 +578,7 @@ mod tests { parent_sitrep_id: Some(parent_sitrep_id), parent_inv_id: Some(inv_id), inv_id, + num_ereporter_restarts: 420, new_ereport_ids: BTreeSet::new(), open_cases: BTreeMap::new(), closed_cases_copied_forward: BTreeMap::new(), diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index e01583800a8..44380352dc1 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -7696,6 +7696,42 @@ WHERE marked_seen_in IS NULL AND time_deleted IS NULL; +-- Table tracking the timestamp of the first ereport received from each restart +-- ID. +CREATE TABLE IF NOT EXISTS omicron.public.ereporter_restart ( + -- The reporter restart ID. + -- + -- This corresponds to the `restart_id` column in `omicron.public.ereport`. + id UUID PRIMARY KEY, + -- The timestamp at which the first ereport received from this restart ID + -- was received. + time_first_seen TIMESTAMPTZ NOT NULL, + -- Whether this ereport was generated by SP firmware or the host OS. + reporter omicron.public.ereporter_type NOT NULL, + -- The type of the physical slot occupied by the reporter. + slot_type omicron.public.sp_type NOT NULL, + -- The number of the physical slot occupied by the reporter. + -- + -- For sled host OS reporters, this may be NULL if the sled's location is + -- not known to the system when the ereport was received. If the physical + -- location of the sled is determined later, subsequent attempts to insert + -- ereports will update this field. + slot INT4, + + CONSTRAINT reporter_validity CHECK ( + ( + -- ereports from SPs must always have a SP type and slot, so we need + -- not worry about temporarily accepting NULL slots so that they can + -- be back-populated later as we do for host OS reporters. + reporter = 'sp' AND slot IS NOT NULL + ) OR ( + -- ereports from the sled host OS must have the 'sled' slot type (as + -- switches and PSCs do not have a host OS) + reporter = 'host' AND slot_type = 'sled' + ) + ) +); + /* * Fault management situation reports (and accessories) * @@ -8893,7 +8929,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '271.0.0', NULL) + (TRUE, NOW(), NOW(), '272.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/ereporter-restart-order-v2/up01.sql b/schema/crdb/ereporter-restart-order-v2/up01.sql new file mode 100644 index 00000000000..b2cc84125d0 --- /dev/null +++ b/schema/crdb/ereporter-restart-order-v2/up01.sql @@ -0,0 +1,35 @@ +-- Table tracking the timestamp of the first ereport received from each restart +-- ID. +CREATE TABLE IF NOT EXISTS omicron.public.ereporter_restart ( + -- The reporter restart ID. + -- + -- This corresponds to the `restart_id` column in `omicron.public.ereport`. + id UUID PRIMARY KEY, + -- The timestamp at which the first ereport received from this restart ID + -- was received. + time_first_seen TIMESTAMPTZ NOT NULL, + -- Whether this ereport was generated by SP firmware or the host OS. + reporter omicron.public.ereporter_type NOT NULL, + -- The type of the physical slot occupied by the reporter. + slot_type omicron.public.sp_type NOT NULL, + -- The number of the physical slot occupied by the reporter. + -- + -- For sled host OS reporters, this may be NULL if the sled's location is + -- not known to the system when the ereport was received. If the physical + -- location of the sled is determined later, subsequent attempts to insert + -- ereports will update this field. + slot INT4, + + CONSTRAINT reporter_validity CHECK ( + ( + -- ereports from SPs must always have a SP type and slot, so we need + -- not worry about temporarily accepting NULL slots so that they can + -- be back-populated later as we do for host OS reporters. + reporter = 'sp' AND slot IS NOT NULL + ) OR ( + -- ereports from the sled host OS must have the 'sled' slot type (as + -- switches and PSCs do not have a host OS) + reporter = 'host' AND slot_type = 'sled' + ) + ) +); diff --git a/schema/crdb/ereporter-restart-order-v2/up02.sql b/schema/crdb/ereporter-restart-order-v2/up02.sql new file mode 100644 index 00000000000..9055d0e1ee7 --- /dev/null +++ b/schema/crdb/ereporter-restart-order-v2/up02.sql @@ -0,0 +1,41 @@ +SET LOCAL disallow_full_table_scans = off; + +-- Backfill the ereporter_restart table from existing ereport data. +-- +-- Insert one row for each unique restart ID present in the ereport table: +-- +-- * `time_first_seen` is the earliest `time_collected` of any ereport with +-- that restart ID. +-- * `reporter` and `slot_type` are constant for a given restart ID (a restart +-- is a single reporter incarnation occupying a single physical slot), so we +-- read them directly from the group. +-- * The slot number may be NULL in some of a restart ID's ereports and +-- non-NULL in others: a sled host OS can emit ereports before its physical +-- location is known, and then emit more once it has been resolved. SQL's +-- `MAX` ignores NULLs, so `MAX(slot)` recovers the non-NULL slot if *any* +-- ereport for the restart ID had one. If every ereport's slot was NULL, the +-- restart's slot is left NULL, which the column permits. +-- +-- `ON CONFLICT (id) DO NOTHING` makes this insert idempotent, and also tolerates +-- the (schema-prohibited, domain-impossible) case of a single restart ID having +-- inconsistent reporter/slot_type values across its ereports. +INSERT INTO omicron.public.ereporter_restart ( + id, + time_first_seen, + reporter, + slot_type, + slot +) +SELECT + restart_id, + MIN(time_collected), + reporter, + slot_type, + MAX(slot) +FROM + omicron.public.ereport +GROUP BY + restart_id, + reporter, + slot_type +ON CONFLICT (id) DO NOTHING;