diff --git a/Cargo.lock b/Cargo.lock index 6ed7bfd0b60..1de0e44cc2f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5479,6 +5479,7 @@ dependencies = [ "logging", "malloc_utils", "metrics", + "mockall", "network_utils", "opentelemetry", "opentelemetry-otlp", diff --git a/beacon_node/client/src/config.rs b/beacon_node/client/src/config.rs index aeaa196df86..53f91d2842b 100644 --- a/beacon_node/client/src/config.rs +++ b/beacon_node/client/src/config.rs @@ -79,6 +79,7 @@ pub struct Config { pub genesis_state_url: Option, pub genesis_state_url_timeout: Duration, pub allow_insecure_genesis_sync: bool, + pub telemetry_sample_ratio: Option // for testing only } impl Default for Config { @@ -107,6 +108,7 @@ impl Default for Config { // This default value should always be overwritten by the CLI default value. genesis_state_url_timeout: Duration::from_secs(60), allow_insecure_genesis_sync: false, + telemetry_sample_ratio: None } } } diff --git a/book/src/help_bn.md b/book/src/help_bn.md index 5f3c43a7e42..4d13936e2aa 100644 --- a/book/src/help_bn.md +++ b/book/src/help_bn.md @@ -399,6 +399,12 @@ Options: Override the OpenTelemetry service name. Defaults to 'lighthouse-bn' for beacon node, 'lighthouse-vc' for validator client, or 'lighthouse' for other subcommands. + --telemetry-trace-sample-rate + OpenTelemetry trace sampling rate as a percentage (0-100). A value of + 1 means 1% of traces are sampled. Lower values reduce resource + consumption. For more info see + https://opentelemetry.io/docs/concepts/sampling/#why-sampling + [default: 1] --trusted-peers One or more comma-delimited trusted peer ids which always have the highest score according to the peer scoring system. diff --git a/book/src/help_general.md b/book/src/help_general.md index 56e4aebdb52..34c48f060ab 100644 --- a/book/src/help_general.md +++ b/book/src/help_general.md @@ -83,6 +83,12 @@ Options: Override the OpenTelemetry service name. Defaults to 'lighthouse-bn' for beacon node, 'lighthouse-vc' for validator client, or 'lighthouse' for other subcommands. + --telemetry-trace-sample-rate + OpenTelemetry trace sampling rate as a percentage (0-100). A value of + 1 means 1% of traces are sampled. Lower values reduce resource + consumption. For more info see + https://opentelemetry.io/docs/concepts/sampling/#why-sampling + [default: 1] -V, --version Print version diff --git a/book/src/help_vc.md b/book/src/help_vc.md index 2a9936d1d2f..83cd2785ea7 100644 --- a/book/src/help_vc.md +++ b/book/src/help_vc.md @@ -141,6 +141,12 @@ Options: Override the OpenTelemetry service name. Defaults to 'lighthouse-bn' for beacon node, 'lighthouse-vc' for validator client, or 'lighthouse' for other subcommands. + --telemetry-trace-sample-rate + OpenTelemetry trace sampling rate as a percentage (0-100). A value of + 1 means 1% of traces are sampled. Lower values reduce resource + consumption. For more info see + https://opentelemetry.io/docs/concepts/sampling/#why-sampling + [default: 1] --validator-registration-batch-size Defines the number of validators per validator/register_validator request sent to the BN. This value can be reduced to avoid timeouts diff --git a/book/src/help_vm.md b/book/src/help_vm.md index 409c56a74d8..2ee1c1fe2b5 100644 --- a/book/src/help_vm.md +++ b/book/src/help_vm.md @@ -84,6 +84,12 @@ Options: Override the OpenTelemetry service name. Defaults to 'lighthouse-bn' for beacon node, 'lighthouse-vc' for validator client, or 'lighthouse' for other subcommands. + --telemetry-trace-sample-rate + OpenTelemetry trace sampling rate as a percentage (0-100). A value of + 1 means 1% of traces are sampled. Lower values reduce resource + consumption. For more info see + https://opentelemetry.io/docs/concepts/sampling/#why-sampling + [default: 1] Flags: --disable-log-timestamp diff --git a/book/src/help_vm_create.md b/book/src/help_vm_create.md index a438c075dcc..ae7eed3b5b7 100644 --- a/book/src/help_vm_create.md +++ b/book/src/help_vm_create.md @@ -100,6 +100,12 @@ Options: Override the OpenTelemetry service name. Defaults to 'lighthouse-bn' for beacon node, 'lighthouse-vc' for validator client, or 'lighthouse' for other subcommands. + --telemetry-trace-sample-rate + OpenTelemetry trace sampling rate as a percentage (0-100). A value of + 1 means 1% of traces are sampled. Lower values reduce resource + consumption. For more info see + https://opentelemetry.io/docs/concepts/sampling/#why-sampling + [default: 1] Flags: --disable-deposits diff --git a/book/src/help_vm_import.md b/book/src/help_vm_import.md index 3c768f67052..3cfc40972d7 100644 --- a/book/src/help_vm_import.md +++ b/book/src/help_vm_import.md @@ -80,6 +80,12 @@ Options: Override the OpenTelemetry service name. Defaults to 'lighthouse-bn' for beacon node, 'lighthouse-vc' for validator client, or 'lighthouse' for other subcommands. + --telemetry-trace-sample-rate + OpenTelemetry trace sampling rate as a percentage (0-100). A value of + 1 means 1% of traces are sampled. Lower values reduce resource + consumption. For more info see + https://opentelemetry.io/docs/concepts/sampling/#why-sampling + [default: 1] --validators-file The path to a JSON file containing a list of validators to be imported to the validator client. This file is usually named "validators.json". diff --git a/book/src/help_vm_move.md b/book/src/help_vm_move.md index cd139449b39..a17c2ebc4e2 100644 --- a/book/src/help_vm_move.md +++ b/book/src/help_vm_move.md @@ -89,6 +89,12 @@ Options: Override the OpenTelemetry service name. Defaults to 'lighthouse-bn' for beacon node, 'lighthouse-vc' for validator client, or 'lighthouse' for other subcommands. + --telemetry-trace-sample-rate + OpenTelemetry trace sampling rate as a percentage (0-100). A value of + 1 means 1% of traces are sampled. Lower values reduce resource + consumption. For more info see + https://opentelemetry.io/docs/concepts/sampling/#why-sampling + [default: 1] --validators The validators to be moved. Either a list of 0x-prefixed validator pubkeys or the keyword "all". diff --git a/lighthouse/Cargo.toml b/lighthouse/Cargo.toml index ebe00c9be59..141ff159b18 100644 --- a/lighthouse/Cargo.toml +++ b/lighthouse/Cargo.toml @@ -61,6 +61,7 @@ network_utils = { workspace = true } opentelemetry = { workspace = true } opentelemetry-otlp = { workspace = true } opentelemetry_sdk = { workspace = true } +mockall = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } serde_yaml = { workspace = true } diff --git a/lighthouse/src/main.rs b/lighthouse/src/main.rs index c93016a0f54..e0bb90b16be 100644 --- a/lighthouse/src/main.rs +++ b/lighthouse/src/main.rs @@ -294,6 +294,23 @@ fn main() { .global(true) .display_order(0) ) + .arg( + Arg::new("telemetry-trace-sample-rate") + .long("telemetry-trace-sample-rate") + .value_name("PERCENT") + .help( + "OpenTelemetry trace sampling rate as a percentage (0-100). \ + A value of 1 means 1% of traces are sampled. \ + Lower values reduce resource consumption. \ + For more info see https://opentelemetry.io/docs/concepts/sampling/#why-sampling" + ) + .requires("telemetry-collector-url") + .value_parser(clap::value_parser!(u8).range(0..=100)) + .default_value("1") + .action(ArgAction::Set) + .global(true) + .display_order(0) + ) .arg( Arg::new("datadir") .long("datadir") @@ -657,6 +674,7 @@ fn run( .eth2_network_config(eth2_network_config)? .build()?; + let mut telemetry_sample_ratio = None; if let Some(telemetry_collector_url) = matches.get_one::("telemetry-collector-url") { let telemetry_layer = environment.runtime().block_on(async { let exporter = opentelemetry_otlp::SpanExporter::builder() @@ -675,8 +693,18 @@ fn run( _ => "lighthouse".to_string(), }); + // Calculate sample percent as a ratio (percentage / 100) + telemetry_sample_ratio = Some(matches + .get_one::("telemetry-trace-sample-rate") + .copied() + .unwrap_or(1) as f64 / 100.0); + let sampler = opentelemetry_sdk::trace::Sampler::ParentBased(Box::new( + opentelemetry_sdk::trace::Sampler::TraceIdRatioBased(telemetry_sample_ratio.unwrap()), + )); + let provider = opentelemetry_sdk::trace::SdkTracerProvider::builder() .with_batch_exporter(exporter) + .with_sampler(sampler) .with_resource( opentelemetry_sdk::Resource::builder() .with_service_name(service_name) @@ -823,6 +851,7 @@ fn run( let executor = context.executor.clone(); let mut config = beacon_node::get_config::(matches, &context)?; config.logger_config = logger_config; + config.telemetry_sample_ratio = telemetry_sample_ratio; // Dump configs if `dump-config` or `dump-chain-config` flags are set clap_utils::check_dump_configs::<_, E>(matches, &config, &context.eth2_config.spec)?; @@ -868,4 +897,4 @@ fn run( ShutdownReason::Success(_) => Ok(()), ShutdownReason::Failure(msg) => Err(msg.to_string()), } -} +} \ No newline at end of file diff --git a/lighthouse/tests/beacon_node.rs b/lighthouse/tests/beacon_node.rs index 207324ea33f..04814c15a5c 100644 --- a/lighthouse/tests/beacon_node.rs +++ b/lighthouse/tests/beacon_node.rs @@ -2839,3 +2839,42 @@ fn invalid_block_roots_default_mainnet() { assert!(config.chain.invalid_block_roots.is_empty()); }) } + +#[test] +fn telemetry_sample_rate_default() { + let float_tolerance: f64 = 1e-3; + let expected_ratio = 0.01; + + CommandLineTest::new() + .flag("telemetry-collector-url", Some("http://localhost:4317")) + .run_with_zero_port() + .with_config(|config| { + let actual_ratio = config.telemetry_sample_ratio.expect("telemetry_sample_ratio should be set"); + assert!( + (actual_ratio - expected_ratio).abs() < float_tolerance, + "Expected {}, got {}", + expected_ratio, + actual_ratio + ); + }); +} + +#[test] +fn telemetry_sample_rate_custom() { + let float_tolerance: f64 = 1e-3; + let expected_ratio = 0.05; + + CommandLineTest::new() + .flag("telemetry-trace-sample-rate", Some("5")) + .flag("telemetry-collector-url", Some("http://localhost:4317")) + .run_with_zero_port() + .with_config(|config| { + let actual_ratio = config.telemetry_sample_ratio.expect("telemetry_sample_ratio should be set"); + assert!( + (actual_ratio - expected_ratio).abs() < float_tolerance, + "Expected {}, got {}", + expected_ratio, + actual_ratio + ); + }); +} \ No newline at end of file