diff --git a/Cargo.lock b/Cargo.lock index 9637a9a9263..ea1c2ee705f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8335,6 +8335,7 @@ dependencies = [ "expectorate", "iddqd", "indent_write", + "itertools 0.14.0", "newtype_derive", "omicron-test-utils", "omicron-workspace-hack", diff --git a/dev-tools/ls-apis/Cargo.toml b/dev-tools/ls-apis/Cargo.toml index 4b24b7d1b7d..b1942608144 100644 --- a/dev-tools/ls-apis/Cargo.toml +++ b/dev-tools/ls-apis/Cargo.toml @@ -14,6 +14,7 @@ cargo_metadata.workspace = true clap.workspace = true iddqd.workspace = true indent_write.workspace = true +itertools.workspace = true newtype_derive.workspace = true parse-display.workspace = true petgraph.workspace = true diff --git a/dev-tools/ls-apis/api-manifest.toml b/dev-tools/ls-apis/api-manifest.toml index 5a9c64560d5..926217f3063 100644 --- a/dev-tools/ls-apis/api-manifest.toml +++ b/dev-tools/ls-apis/api-manifest.toml @@ -593,3 +593,201 @@ note = """ Sled Agent uses the Crucible Agent client types only, and only in the simulated sled agent. """ + +################################################################################ +# Intra-deployment-unit-only edges +# +# These edges are excluded from the deployment unit dependency graph because +# they represent communication that only happens locally within a *single +# instance* of a single deployment unit, not across deployment unit boundaries. +# A single deployment unit is updated atomically, so there's no risk of version +# skew with intra_deployment_unit-only edges. +# +# Note that things do not belong here when they cross different instances of the +# same deployment unit. For example, Sled Agent and MGS are in the same +# deployment unit (the host OS), but Sled Agent's use of MGS crosses different +# instances of the deployment unit (i.e., different sleds). Those don't belong +# in this category. +################################################################################ + +[[intra_deployment_unit_only_edges]] +server = "ddmd" +client = "dpd-client" +note = """ +ddmd runs in both the global zone and the switch zone. The switch zone is +configured with dpd_host=[::1] (services.rs:3127). The global zone configures +"dendrite" to false, meaning it does not expose a dendrite server +(ddm/manifest.xml:25). +""" +permalinks = [ + "https://github.com/oxidecomputer/omicron/blob/6cef874/sled-agent/src/services.rs#L3127", + "https://github.com/oxidecomputer/maghemite/blob/396bb3c/smf/ddm/manifest.xml#L25", +] + +[[intra_deployment_unit_only_edges]] +server = "dpd" +client = "gateway-client" +note = """ +dpd defaults to [::1]:MGS_PORT (switch_identifiers.rs:60), and the SMF start +script doesn't pass in --mgs-address. +""" +permalinks = [ + "https://github.com/oxidecomputer/dendrite/blob/606c0be/dpd/src/switch_identifiers.rs#L60", + "https://github.com/oxidecomputer/dendrite/blob/606c0be/dpd/misc/dpd.xml", + "https://github.com/oxidecomputer/dendrite/blob/606c0be/tools/svc-dpd", +] + +[[intra_deployment_unit_only_edges]] +server = "lldpd" +client = "dpd-client" +note = """ +lldpd defaults to localhost for dpd (dendrite.rs:194), and the SMF start +script doesn't pass in --host. +""" +permalinks = [ + "https://github.com/oxidecomputer/lldp/blob/61479b6/lldpd/src/dendrite.rs#L194", + "https://github.com/oxidecomputer/lldp/blob/61479b6/lldpd/misc/lldpd.xml", + "https://github.com/oxidecomputer/lldp/blob/61479b6/lldpd/misc/svc-lldpd", +] + +[[intra_deployment_unit_only_edges]] +server = "mgd" +client = "ddm-admin-client" +note = "mg-lower hardcodes localhost:8000." +permalinks = [ + "https://github.com/oxidecomputer/maghemite/blob/396bb3c/mg-lower/src/ddm.rs#L110", +] + +[[intra_deployment_unit_only_edges]] +server = "mgd" +client = "dpd-client" +note = "mg-lower hardcodes localhost." +permalinks = [ + "https://github.com/oxidecomputer/maghemite/blob/396bb3c/mg-lower/src/dendrite.rs#L491", +] + +[[intra_deployment_unit_only_edges]] +server = "mgd" +client = "gateway-client" +note = """ +mgd defaults to [::1]:12225 (mgd main.rs:93), and the SMF start script +doesn't set --mgs-addr. +""" +permalinks = [ + "https://github.com/oxidecomputer/maghemite/blob/396bb3c/mgd/src/main.rs#L93", + "https://github.com/oxidecomputer/maghemite/blob/396bb3c/smf/mgd/manifest.xml", + "https://github.com/oxidecomputer/maghemite/blob/396bb3c/smf/mgd_method_script.sh", +] + +[[intra_deployment_unit_only_edges]] +server = "omicron-sled-agent" +client = "ddm-admin-client" +note = "Sled Agent uses Client::localhost() to connect to ddm." +permalinks = [ + "https://github.com/oxidecomputer/omicron/blob/6cef874/sled-agent/src/ddm_reconciler.rs#L56", +] + +[[intra_deployment_unit_only_edges]] +server = "omicron-sled-agent" +client = "mg-admin-client" +note = """ +Sled Agent only queries the switch zone on the same sled, +which is within the same deployment unit as the global +zone. +""" +permalinks = [ + "https://github.com/oxidecomputer/omicron/blob/6cef874/sled-agent/src/bootstrap/early_networking.rs#L483", +] + +[[intra_deployment_unit_only_edges]] +server = "omicron-sled-agent" +client = "propolis-client" +note = "Unsure: it looks like this only queries the Propolis server out of the same deployment unit?" +permalinks = [ + "https://github.com/oxidecomputer/omicron/blob/6cef874/sled-agent/src/instance.rs#L2283", +] + +# NOTE: The following sled-agent edges are bugs: they cross deployment units. +# They are kept here to avoid breaking the build, but need to be fixed, either +# through client-side versioning or by some other means. + +[[intra_deployment_unit_only_edges]] +server = "omicron-sled-agent" +client = "gateway-client" +note = """ +BUG: Sled Agent creates two MGS clients. One of them +(early_networking.rs:376) queries the switch zone on +the same sled, which is within the same deployment unit +as the global zone, so this is okay. + +The other one (early_networking.rs:277) queries both +switch zones, so it crosses deployment units. This needs +to be fixed. +""" +permalinks = [ + "https://github.com/oxidecomputer/omicron/blob/6cef874/sled-agent/src/bootstrap/early_networking.rs#L376", + "https://github.com/oxidecomputer/omicron/blob/6cef874/sled-agent/src/bootstrap/early_networking.rs#L277-L280", +] + +[[intra_deployment_unit_only_edges]] +server = "omicron-sled-agent" +client = "dpd-client" +note = """ +BUG: Sled Agent reaches out to both switch zones, which crosses deployment units. +""" +permalinks = [ + "https://github.com/oxidecomputer/omicron/blob/6cef874/sled-agent/src/services.rs#L1090", +] + +[[intra_deployment_unit_only_edges]] +server = "tfportd" +client = "dpd-client" +note = """ +The tfportd service has the dpd_host SMF property (tfport.xml:52) set to [::1] +(services.rs:2852). tfportd has a --dpd-host option (main.rs:81) but the SMF +start script (svc-tfportd:12) does not use it. +""" +permalinks = [ + "https://github.com/oxidecomputer/omicron/blob/6cef874/sled-agent/src/services.rs#L2852", + "https://github.com/oxidecomputer/dendrite/blob/606c0be/tfportd/misc/tfport.xml#L52-L53", + "https://github.com/oxidecomputer/dendrite/blob/cf31be9/tfportd/src/main.rs#L81-L83", + "https://github.com/oxidecomputer/dendrite/blob/606c0be/tools/svc-tfportd#L12", +] + +[[intra_deployment_unit_only_edges]] +server = "tfportd" +client = "lldpd-client" +note = "tfportd hardcodes localhost for its lldpd-client (tfport.rs:88)." +permalinks = [ + "https://github.com/oxidecomputer/dendrite/blob/606c0be/tfportd/src/tfport.rs#L88", +] + +[[intra_deployment_unit_only_edges]] +server = "wicketd" +client = "ddm-admin-client" +note = "wicketd uses DdmAdminClient::localhost() (bootstrap_addrs.rs:162)." +permalinks = [ + "https://github.com/oxidecomputer/omicron/blob/6cef874/wicketd/src/bootstrap_addrs.rs#L162", +] + +[[intra_deployment_unit_only_edges]] +server = "wicketd" +client = "dpd-client" +note = "wicketd hardcodes [::1] (uplink.rs:83)." +permalinks = [ + "https://github.com/oxidecomputer/omicron/blob/6cef874/wicketd/src/preflight_check/uplink.rs#L83", +] + +[[intra_deployment_unit_only_edges]] +server = "wicketd" +client = "gateway-client" +note = """ +wicketd's mgs-address config (wicketd.rs:35) is always set to [::1] +(services.rs:2586). This config is passed into wicketd at startup +(manifest.xml:20). +""" +permalinks = [ + "https://github.com/oxidecomputer/omicron/blob/6cef874/sled-agent/src/services.rs#L2685-L2689", + "https://github.com/oxidecomputer/omicron/blob/6cef874/wicketd/src/bin/wicketd.rs#L35", + "https://github.com/oxidecomputer/omicron/blob/6cef874/smf/wicketd/manifest.xml#L20", +] diff --git a/dev-tools/ls-apis/src/api_metadata.rs b/dev-tools/ls-apis/src/api_metadata.rs index b0e3dfa4247..0b26f19e750 100644 --- a/dev-tools/ls-apis/src/api_metadata.rs +++ b/dev-tools/ls-apis/src/api_metadata.rs @@ -30,6 +30,7 @@ pub struct AllApiMetadata { deployment_units: BTreeMap, dependency_rules: BTreeMap>, ignored_non_clients: BTreeSet, + intra_deployment_unit_only_edges: Vec, } impl AllApiMetadata { @@ -73,6 +74,13 @@ impl AllApiMetadata { &self.ignored_non_clients } + /// Returns the list of intra-deployment-unit-only edges + pub fn intra_deployment_unit_only_edges( + &self, + ) -> &[IntraDeploymentUnitOnlyEdge] { + &self.intra_deployment_unit_only_edges + } + /// Returns how we should filter the given dependency pub(crate) fn evaluate_dependency( &self, @@ -138,6 +146,7 @@ struct RawApiMetadata { deployment_units: Vec, dependency_filter_rules: Vec, ignored_non_clients: Vec, + intra_deployment_unit_only_edges: Vec, } impl TryFrom for AllApiMetadata { @@ -188,17 +197,41 @@ impl TryFrom for AllApiMetadata { for client_pkg in raw.ignored_non_clients { if !ignored_non_clients.insert(client_pkg.clone()) { bail!( - "entry in ignored_non_clients appearead twice: {:?}", + "entry in ignored_non_clients appeared twice: {:?}", &client_pkg ); } } + // Validate that IDU-only edges reference only known server components + // and APIs. + let known_components: BTreeSet<_> = + deployment_units.values().flat_map(|u| u.packages.iter()).collect(); + for edge in &raw.intra_deployment_unit_only_edges { + if !known_components.contains(&edge.server) { + bail!( + "intra_deployment_unit_only_edges: \ + unknown server component {:?}", + edge.server + ); + } + + if !apis.contains_key(&edge.client) { + bail!( + "intra_deployment_unit_only_edges: \ + unknown client {:?}", + edge.client, + ); + } + } + Ok(AllApiMetadata { apis, deployment_units, dependency_rules, ignored_non_clients, + intra_deployment_unit_only_edges: raw + .intra_deployment_unit_only_edges, }) } } @@ -416,3 +449,30 @@ pub enum Evaluation { /// This dependency should be part of the update DAG Dag, } + +/// An edge that should be excluded from the deployment unit dependency graph +/// because it represents communication that only happens locally within a +/// single instance of a single deployment unit. +#[derive(Deserialize)] +#[serde(deny_unknown_fields)] +pub struct IntraDeploymentUnitOnlyEdge { + /// The server component that consumes the API. + pub server: ServerComponentName, + /// The client package consumed. + pub client: ClientPackageName, + /// Explanation of why this edge is intra-deployment-unit-only. + pub note: String, + /// Permalinks to source code referenced by `note` + pub permalinks: Vec, +} + +impl IntraDeploymentUnitOnlyEdge { + /// Returns true if this rule matches the given (server, client) pair. + pub fn matches( + &self, + server: &ServerComponentName, + client: &ClientPackageName, + ) -> bool { + self.server == *server && self.client == *client + } +} diff --git a/dev-tools/ls-apis/src/bin/ls-apis.rs b/dev-tools/ls-apis/src/bin/ls-apis.rs index df7fd9617a6..7768c988e16 100644 --- a/dev-tools/ls-apis/src/bin/ls-apis.rs +++ b/dev-tools/ls-apis/src/bin/ls-apis.rs @@ -252,7 +252,14 @@ fn print_server_components<'a>( ); } for (c, path) in apis.component_apis_consumed(s, filter)? { - println!("{} consumes: {}", prefix, c); + if apis.idu_only_edge_note(s, c).is_some() { + println!( + "{} consumes: {} (intra-deployment-unit-only)", + prefix, c + ); + } else { + println!("{} consumes: {}", prefix, c); + } if show_deps { for p in path.nodes() { println!("{} via: {}", prefix, p); diff --git a/dev-tools/ls-apis/src/system_apis.rs b/dev-tools/ls-apis/src/system_apis.rs index e61af0b7c50..6cb8e51e007 100644 --- a/dev-tools/ls-apis/src/system_apis.rs +++ b/dev-tools/ls-apis/src/system_apis.rs @@ -27,8 +27,10 @@ use cargo_metadata::Package; use iddqd::IdOrdItem; use iddqd::IdOrdMap; use iddqd::id_upcast; +use itertools::Itertools; use parse_display::{Display, FromStr}; use petgraph::dot::Dot; +use petgraph::graph::Graph; use petgraph::graph::NodeIndex; use std::collections::BTreeMap; use std::collections::BTreeSet; @@ -286,6 +288,51 @@ impl SystemApis { } } + // Validate that the IDU-only edges' components belong to the same + // deployment unit. + for edge in api_metadata.intra_deployment_unit_only_edges() { + let server = &edge.server; + let Some(server_unit) = server_component_units.get(server) else { + // This was validated earlier, but there's not an easy way to + // express this in the type system, so we just handle it + // gracefully. + bail!( + "internal error: intra_deployment_unit_only specifies \ + server {:?} that does not exist in server components", + server, + ); + }; + + let client = &edge.client; + let Some(producers) = api_producers.get(client) else { + // This was validated earlier, but there's not an easy way to + // express this in the type system, so we just handle it + // gracefully. + bail!( + "internal error: intra_deployment_unit_only specifies \ + client {:?} that does not correspond to a known API", + client, + ); + }; + + if !producers.iter().any(|(p, _)| { + server_component_units + .get(p) + .map(|producer_unit| producer_unit == server_unit) + .unwrap_or(false) + }) { + bail!( + "error: intra_deployment_unit_only specifies server \ + {:?} in deployment unit {:?}, but none of the producers \ + of client {:?} are in that deployment_unit: {}", + server, + server_unit, + client, + producers.iter().map(|(p, _)| p.as_str()).join(", "), + ); + } + } + Ok(SystemApis { server_component_units, unit_server_components, @@ -330,6 +377,19 @@ impl SystemApis { &self.api_metadata } + /// Returns the note for a intra-deployment-unit-only edge, if one matches. + pub fn idu_only_edge_note( + &self, + server: &ServerComponentName, + client: &ClientPackageName, + ) -> Option<&str> { + self.api_metadata + .intra_deployment_unit_only_edges() + .iter() + .find(|edge| edge.matches(server, client)) + .map(|edge| edge.note.as_str()) + } + /// Given a server component, return the APIs consumed by this component pub fn component_apis_consumed( &self, @@ -474,7 +534,23 @@ impl SystemApis { /// Returns a string that can be passed to `dot(1)` to render a graph of /// API dependencies among deployment units pub fn dot_by_unit(&self, filter: ApiDependencyFilter) -> Result { - let mut graph = petgraph::graph::Graph::new(); + let (graph, _) = + self.make_deployment_unit_graph(filter, EdgeFilter::All)?; + Ok(Dot::new(&graph).to_string()) + } + + // The complex type below is only used in this one place: the return value + // of this internal helper function. A type alias doesn't seem better. + #[allow(clippy::type_complexity)] + fn make_deployment_unit_graph( + &self, + dependency_filter: ApiDependencyFilter, + edge_filter: EdgeFilter<'_>, + ) -> Result<( + Graph<&DeploymentUnitName, &ClientPackageName>, + BTreeMap<&DeploymentUnitName, NodeIndex>, + )> { + let mut graph = Graph::new(); let nodes: BTreeMap<_, _> = self .deployment_units() .map(|name| (name, graph.add_node(name))) @@ -489,8 +565,34 @@ impl SystemApis { let my_node = nodes.get(deployment_unit).unwrap(); for server_pkg in server_components { for (client_pkg, _) in - self.component_apis_consumed(server_pkg, filter)? + self.component_apis_consumed(server_pkg, dependency_filter)? { + if let EdgeFilter::DagOnly(idu_edges) = edge_filter { + let api = self + .api_metadata + .client_pkgname_lookup(client_pkg) + .unwrap(); + // Filtering DAG-only edges means ignoring everything + // that's not server-side-versioned. + if api.versioned_how != VersionedHow::Server { + continue; + } + + // When filtering DAG-only edges, also skip edges that + // represent intra-deployment-unit-only communication + // (communication within one instance of one deployment + // unit). That's because intra-deployment-unit edges + // would look like a cycle in the dependency graph, but + // aren't one that we care about since they're always + // referring to the same *instance* of the same + // deployment unit. + if idu_edges + .contains(&(server_pkg.clone(), client_pkg.clone())) + { + continue; + } + } + // Multiple server components may produce an API. However, // if an API is produced by multiple server components // within the same deployment unit, we would like to only @@ -506,17 +608,13 @@ impl SystemApis { .collect(); for other_unit in other_units { let other_node = nodes.get(other_unit).unwrap(); - graph.update_edge( - *my_node, - *other_node, - client_pkg.clone(), - ); + graph.update_edge(*my_node, *other_node, client_pkg); } } } } - Ok(Dot::new(&graph).to_string()) + Ok((graph, nodes)) } /// Returns a string that can be passed to `dot(1)` to render a graph of @@ -530,17 +628,17 @@ impl SystemApis { } // The complex type below is only used in this one place: the return value - // of an internal helper function. A type alias doesn't seem better. + // of this internal helper function. A type alias doesn't seem better. #[allow(clippy::type_complexity)] fn make_component_graph( &self, dependency_filter: ApiDependencyFilter, versioned_on_server_only: bool, ) -> Result<( - petgraph::graph::Graph<&ServerComponentName, &ClientPackageName>, + Graph<&ServerComponentName, &ClientPackageName>, BTreeMap<&ServerComponentName, NodeIndex>, )> { - let mut graph = petgraph::graph::Graph::new(); + let mut graph = Graph::new(); let nodes: BTreeMap<_, _> = self .server_component_units .keys() @@ -577,6 +675,123 @@ impl SystemApis { Ok((graph, nodes)) } + /// Computes the set of (server, client) edges for server-side-only- + /// versioned APIs where the server and client are in the same deployment + /// unit. + /// + /// See the caller for more on this. + fn compute_required_idu_edges( + &self, + ) -> Result> { + let filter = ApiDependencyFilter::Default; + let mut required = BTreeSet::new(); + + for (server, server_unit) in &self.server_component_units { + for (client, _) in self.component_apis_consumed(server, filter)? { + // Only consider server-side-versioned APIs. + let api = self + .api_metadata + .client_pkgname_lookup(client) + .expect("consumed API must have metadata"); + if api.versioned_how != VersionedHow::Server { + continue; + } + + // Check if any producer is in the same deployment unit. + for producer in self.api_producers(client) { + let producer_unit = self + .server_component_units + .get(producer) + .expect("API producer must be in some deployment unit"); + + if server_unit == producer_unit { + // This edge would create an intra-unit dependency for + // a server-versioned API, so it must be in the + // intra-deployment-unit-only list. + required.insert((server.clone(), client.clone())); + break; + } + } + } + } + + Ok(required) + } + + /// Validates that these two sets of (server, client) edges match: + /// + /// - API dependencies found by this tool *within* a deployment unit for a + /// server-side-versioned API + /// - API dependencies annotated in the metadata as being + /// intra-deployment-unit + /// + /// Why? Recall that a server-side-only-versioned API means that the server + /// is always updated before its clients. Further, the update system does + /// not (and cannot) guarantee anything about the ordering of updates for a + /// particular kind of deployment unit. (Example: for host OS, the update + /// system does not say that any particular sleds are updated before any + /// others.) Thus, if you have a server-side-only-versioned API with a + /// client in the same deployment unit, that's only allowable if the client + /// is always talking to an instance of the server in the same *instance* of + /// the same deployment unit. A dependency from Sled Agent to Propolis in + /// the *same* host OS is okay. A dependency from Sled Agent to a Propolis + /// on a different sled is not. + /// + /// If we found a same-deployment-unit edge that's not labeled in the + /// manifest as intra-deployment-unit, that means we've identified either a + /// manifest bug or an update bug waiting to happen. + /// + /// Returns the validated set of edges for use by + /// make_deployment_unit_graph. + fn validate_idu_only_edges( + &self, + ) -> Result> { + let required = self.compute_required_idu_edges()?; + + // Build the configured set from the manifest. + let mut configured = BTreeSet::new(); + for edge in self.api_metadata.intra_deployment_unit_only_edges() { + configured.insert((edge.server.clone(), edge.client.clone())); + } + + // Compare the two sets. + let missing: BTreeSet<_> = required.difference(&configured).collect(); + let extra: BTreeSet<_> = configured.difference(&required).collect(); + + if !missing.is_empty() || !extra.is_empty() { + let mut msg = String::new(); + for (server, client) in missing { + msg.push_str(&format!( + "The following API dependendency exists between two \ + components in the same deployment unit, but is not \ + present in `intra_deployment_unit_only_edges`: \ + server {:?} client {:?}\n\ + If this client only ever uses this API with a server \ + in the same *instance* of the same deployment unit, \ + then add it to `intra_deployment_unit_only_edges`. \ + Otherwise, this relationship is incompatible with \ + automated upgrade.\n", + server, client, + )); + } + + for (server, client) in extra { + msg.push_str(&format!( + "`intra_deployment_unit_only_edges` contains an edge \ + between server {:?} and client {:?}, but either \ + this API dependency was not found, or the API is not \ + server-side-only-versioned, or this client and server \ + are not in the same deployment unit.\n", + server, client, + )); + } + + bail!("{}", msg); + } + + Ok(required) + } + /// Verifies various important properties about the assignment of which APIs /// are server-managed vs. client-managed. /// @@ -631,6 +846,10 @@ impl SystemApis { // can't be part of a cycle. let filter = ApiDependencyFilter::Default; + // Validate that all configured intra_deployment_unit_only_edges are + // correct and match the required set exactly. + let idu_only_edges = self.validate_idu_only_edges()?; + // Construct a graph where: // // - nodes are all the API producer and consumer components @@ -643,8 +862,23 @@ impl SystemApis { nodes.iter().map(|(s_c, node)| (node, s_c)).collect(); if let Err(error) = petgraph::algo::toposort(&graph, None) { bail!( - "graph of server-managed components has a cycle (includes \ - node: {:?})", + "graph of server-managed API dependencies between components \ + has a cycle (includes node: {:?})", + reverse_nodes.get(&error.node_id()).unwrap() + ); + } + + // Do the same with a graph of deployment units. + let (graph, nodes) = self.make_deployment_unit_graph( + filter, + EdgeFilter::DagOnly(&idu_only_edges), + )?; + let reverse_nodes: BTreeMap<_, _> = + nodes.iter().map(|(d_u, node)| (node, d_u)).collect(); + if let Err(error) = petgraph::algo::toposort(&graph, None) { + bail!( + "graph of server-managed API dependencies between deployment \ + units has a cycle (includes node: {:?})", reverse_nodes.get(&error.node_id()).unwrap() ); } @@ -814,6 +1048,11 @@ impl SystemApis { } } +enum EdgeFilter<'a> { + All, + DagOnly(&'a BTreeSet<(ServerComponentName, ClientPackageName)>), +} + /// Describes proposals for assigning how APIs should be versioned, based on /// heuristics applied while checking the DAG pub struct DagCheck<'a> {