diff --git a/changelog/fragments/1765377327-hide-healthcheckv2-from-status-output.yaml b/changelog/fragments/1765377327-hide-healthcheckv2-from-status-output.yaml new file mode 100644 index 00000000000..dd7fe9f01cb --- /dev/null +++ b/changelog/fragments/1765377327-hide-healthcheckv2-from-status-output.yaml @@ -0,0 +1,45 @@ +# REQUIRED +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: bug-fix + +# REQUIRED for all kinds +# Change summary; an 80ish-character description of the change. +summary: hide healthcheckv2 from status output + +# REQUIRED for breaking-change, deprecation, known-issue +# Long description; in case the summary is not enough to describe the change +# this field accommodates a description without length limits. +# description: + +# REQUIRED for breaking-change, deprecation, known-issue +# impact: + +# REQUIRED for breaking-change, deprecation, known-issue +# action: + +# REQUIRED for all kinds +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent + +# AUTOMATED +# OPTIONAL to manually add other PR URLs +# PR URL: A link to the PR that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. 
+# Please provide it if you are adding a fragment for a different PR. +# pr: https://github.com/owner/repo/1234 + +# AUTOMATED +# OPTIONAL to manually add other issue URLs +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +# issue: https://github.com/owner/repo/1234 diff --git a/internal/pkg/otel/manager/execution_subprocess.go b/internal/pkg/otel/manager/execution_subprocess.go index 702738e05e2..56eb08c204f 100644 --- a/internal/pkg/otel/manager/execution_subprocess.go +++ b/internal/pkg/otel/manager/execution_subprocess.go @@ -14,7 +14,6 @@ import ( "sync" "time" - "github.com/gofrs/uuid/v5" "go.opentelemetry.io/collector/component" "gopkg.in/yaml.v3" @@ -41,16 +40,12 @@ const ( // newSubprocessExecution creates a new execution which runs the otel collector in a subprocess. A metricsPort or // healthCheckPort of 0 will result in a random port being used. 
-func newSubprocessExecution(logLevel logp.Level, collectorPath string, metricsPort int, healthCheckPort int) (*subprocessExecution, error) { - nsUUID, err := uuid.NewV4() - if err != nil { - return nil, fmt.Errorf("cannot generate UUID: %w", err) - } +func newSubprocessExecution(logLevel logp.Level, collectorPath string, uuid string, metricsPort int, healthCheckPort int) (*subprocessExecution, error) { componentType, err := component.NewType(healthCheckExtensionName) if err != nil { return nil, fmt.Errorf("cannot create component type: %w", err) } - healthCheckExtensionID := component.NewIDWithName(componentType, nsUUID.String()).String() + healthCheckExtensionID := component.NewIDWithName(componentType, uuid).String() return &subprocessExecution{ collectorPath: collectorPath, diff --git a/internal/pkg/otel/manager/manager.go b/internal/pkg/otel/manager/manager.go index 75748be597c..fcf5b43ef72 100644 --- a/internal/pkg/otel/manager/manager.go +++ b/internal/pkg/otel/manager/manager.go @@ -18,24 +18,21 @@ import ( "sync/atomic" "time" + "github.com/gofrs/uuid/v5" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/status" + "go.opentelemetry.io/collector/confmap" "go.uber.org/zap" - "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" - - componentmonitoring "github.com/elastic/elastic-agent/internal/pkg/agent/application/monitoring/component" - "github.com/elastic/elastic-agent-client/v7/pkg/client" + "github.com/elastic/elastic-agent-libs/logp" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" + componentmonitoring "github.com/elastic/elastic-agent/internal/pkg/agent/application/monitoring/component" "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" + "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" "github.com/elastic/elastic-agent/internal/pkg/otel/translate" "github.com/elastic/elastic-agent/pkg/component" "github.com/elastic/elastic-agent/pkg/component/runtime" - - 
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/status" - "go.opentelemetry.io/collector/confmap" - - "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent/pkg/core/logger" ) @@ -82,8 +79,9 @@ type OTelManager struct { agentInfo info.Agent beatMonitoringConfigGetter translate.BeatMonitoringConfigGetter - collectorCfg *confmap.Conf - components []component.Component + healthCheckExtID string + collectorCfg *confmap.Conf + components []component.Component // The current configuration that the OTel collector is using. In the case that // the mergedCollectorCfg is nil then the collector is not running. @@ -142,6 +140,12 @@ func NewOTelManager( var recoveryTimer collectorRecoveryTimer var err error + hcUUID, err := uuid.NewV4() + if err != nil { + return nil, fmt.Errorf("cannot generate UUID: %w", err) + } + hcUUIDStr := hcUUID.String() + // determine the otel collector ports collectorMetricsPort, collectorHealthCheckPort := 0, 0 if agentCollectorConfig != nil { @@ -168,7 +172,7 @@ func NewOTelManager( return nil, fmt.Errorf("failed to get the path to the collector executable: %w", err) } recoveryTimer = newRecoveryBackoff(100*time.Nanosecond, 10*time.Second, time.Minute) - exec, err = newSubprocessExecution(logLevel, executable, collectorMetricsPort, collectorHealthCheckPort) + exec, err = newSubprocessExecution(logLevel, executable, hcUUIDStr, collectorMetricsPort, collectorHealthCheckPort) if err != nil { return nil, fmt.Errorf("failed to create subprocess execution: %w", err) } @@ -183,6 +187,7 @@ func NewOTelManager( baseLogger: baseLogger, agentInfo: agentInfo, beatMonitoringConfigGetter: beatMonitoringConfigGetter, + healthCheckExtID: fmt.Sprintf("extension:healthcheckv2/%s", hcUUIDStr), errCh: make(chan error, 1), // holds at most one error collectorStatusCh: make(chan *status.AggregateStatus, 1), // componentStateCh uses a buffer channel to ensure that no state transitions are missed and to prevent @@ -534,6 +539,8 @@ 
func (m *OTelManager) handleOtelStatusUpdate(otelStatus *status.AggregateStatus) delete(extensionsMap.ComponentStatusMap, extensionKey) case strings.HasPrefix(extensionKey, "extension:elastic_diagnostics"): delete(extensionsMap.ComponentStatusMap, extensionKey) + case extensionKey == m.healthCheckExtID: + delete(extensionsMap.ComponentStatusMap, extensionKey) } } diff --git a/internal/pkg/otel/manager/manager_test.go b/internal/pkg/otel/manager/manager_test.go index 2431c960163..0eaab62ff72 100644 --- a/internal/pkg/otel/manager/manager_test.go +++ b/internal/pkg/otel/manager/manager_test.go @@ -348,7 +348,11 @@ func TestOTelManager_Run(t *testing.T) { { name: "subprocess collector config updates", execModeFn: func(collectorRunErr chan error) (collectorExecution, error) { - return newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0) + hcUUID, err := uuid.NewV4() + if err != nil { + return nil, fmt.Errorf("cannot generate UUID: %w", err) + } + return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0) }, restarter: newRecoveryBackoff(100*time.Nanosecond, 10*time.Second, time.Minute), testFn: func(t *testing.T, m *OTelManager, e *EventListener, exec *testExecution, managerCtxCancel context.CancelFunc, collectorRunErr chan error) { @@ -376,7 +380,11 @@ func TestOTelManager_Run(t *testing.T) { { name: "subprocess collector stopped gracefully outside manager", execModeFn: func(collectorRunErr chan error) (collectorExecution, error) { - return newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0) + hcUUID, err := uuid.NewV4() + if err != nil { + return nil, fmt.Errorf("cannot generate UUID: %w", err) + } + return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0) }, restarter: newRecoveryBackoff(100*time.Nanosecond, 10*time.Second, time.Minute), testFn: func(t *testing.T, m *OTelManager, e *EventListener, exec *testExecution, managerCtxCancel context.CancelFunc, collectorRunErr chan error) { @@ -405,7 +413,11 @@ 
func TestOTelManager_Run(t *testing.T) { { name: "subprocess collector killed outside manager", execModeFn: func(collectorRunErr chan error) (collectorExecution, error) { - return newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0) + hcUUID, err := uuid.NewV4() + if err != nil { + return nil, fmt.Errorf("cannot generate UUID: %w", err) + } + return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0) }, restarter: newRecoveryBackoff(100*time.Nanosecond, 10*time.Second, time.Minute), testFn: func(t *testing.T, m *OTelManager, e *EventListener, exec *testExecution, managerCtxCancel context.CancelFunc, collectorRunErr chan error) { @@ -448,7 +460,11 @@ func TestOTelManager_Run(t *testing.T) { { name: "subprocess collector panics restarts", execModeFn: func(collectorRunErr chan error) (collectorExecution, error) { - return newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0) + hcUUID, err := uuid.NewV4() + if err != nil { + return nil, fmt.Errorf("cannot generate UUID: %w", err) + } + return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0) }, restarter: newRecoveryBackoff(100*time.Nanosecond, 10*time.Second, time.Minute), testFn: func(t *testing.T, m *OTelManager, e *EventListener, exec *testExecution, managerCtxCancel context.CancelFunc, collectorRunErr chan error) { @@ -483,7 +499,11 @@ func TestOTelManager_Run(t *testing.T) { { name: "subprocess collector panics reports fatal", execModeFn: func(collectorRunErr chan error) (collectorExecution, error) { - return newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0) + hcUUID, err := uuid.NewV4() + if err != nil { + return nil, fmt.Errorf("cannot generate UUID: %w", err) + } + return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0) }, restarter: newRecoveryBackoff(100*time.Nanosecond, 10*time.Second, time.Minute), testFn: func(t *testing.T, m *OTelManager, e *EventListener, exec *testExecution, managerCtxCancel context.CancelFunc, 
collectorRunErr chan error) { @@ -524,7 +544,11 @@ func TestOTelManager_Run(t *testing.T) { { name: "subprocess collector killed if delayed and manager is stopped", execModeFn: func(collectorRunErr chan error) (collectorExecution, error) { - subprocessExec, err := newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0) + hcUUID, err := uuid.NewV4() + if err != nil { + return nil, fmt.Errorf("cannot generate UUID: %w", err) + } + subprocessExec, err := newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0) if err != nil { return nil, err } @@ -577,7 +601,11 @@ func TestOTelManager_Run(t *testing.T) { { name: "subprocess collector gracefully exited if delayed a bit and manager is stopped", execModeFn: func(collectorRunErr chan error) (collectorExecution, error) { - subprocessExec, err := newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0) + hcUUID, err := uuid.NewV4() + if err != nil { + return nil, fmt.Errorf("cannot generate UUID: %w", err) + } + subprocessExec, err := newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0) if err != nil { return nil, err } @@ -630,7 +658,11 @@ func TestOTelManager_Run(t *testing.T) { { name: "subprocess user has healthcheck extension", execModeFn: func(collectorRunErr chan error) (collectorExecution, error) { - return newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0) + hcUUID, err := uuid.NewV4() + if err != nil { + return nil, fmt.Errorf("cannot generate UUID: %w", err) + } + return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0) }, restarter: newRecoveryBackoff(100*time.Nanosecond, 10*time.Second, time.Minute), testFn: func(t *testing.T, m *OTelManager, e *EventListener, exec *testExecution, managerCtxCancel context.CancelFunc, collectorRunErr chan error) { @@ -665,7 +697,11 @@ func TestOTelManager_Run(t *testing.T) { { name: "subprocess collector empty config", execModeFn: func(collectorRunErr chan error) (collectorExecution, error) { - return 
newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0) + hcUUID, err := uuid.NewV4() + if err != nil { + return nil, fmt.Errorf("cannot generate UUID: %w", err) + } + return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0) }, restarter: newRecoveryBackoff(100*time.Nanosecond, 10*time.Second, time.Minute), skipListeningErrors: true, @@ -712,7 +748,11 @@ func TestOTelManager_Run(t *testing.T) { { name: "subprocess collector failed to start", execModeFn: func(collectorRunErr chan error) (collectorExecution, error) { - return newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0) + hcUUID, err := uuid.NewV4() + if err != nil { + return nil, fmt.Errorf("cannot generate UUID: %w", err) + } + return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0) }, restarter: newRecoveryBackoff(100*time.Nanosecond, 10*time.Second, time.Minute), testFn: func(t *testing.T, m *OTelManager, e *EventListener, exec *testExecution, managerCtxCancel context.CancelFunc, collectorRunErr chan error) { @@ -846,7 +886,11 @@ func TestOTelManager_Logging(t *testing.T) { { name: "subprocess execution", execModeFn: func(collectorRunErr chan error) (collectorExecution, error) { - return newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0) + hcUUID, err := uuid.NewV4() + if err != nil { + return nil, fmt.Errorf("cannot generate UUID: %w", err) + } + return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0) }, }, } { @@ -919,7 +963,11 @@ func TestOTelManager_Ports(t *testing.T) { { name: "subprocess execution", execModeFn: func(collectorRunErr chan error) (collectorExecution, error) { - return newSubprocessExecution(logp.DebugLevel, testBinary, metricsPort, healthCheckPort) + hcUUID, err := uuid.NewV4() + if err != nil { + return nil, fmt.Errorf("cannot generate UUID: %w", err) + } + return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), metricsPort, healthCheckPort) }, healthCheckEnabled: true, }, 
@@ -1071,7 +1119,7 @@ func TestOTelManager_PortConflict(t *testing.T) { waitTimeForStop, ) require.NoError(t, err, "could not create otel manager") - executionMode, err := newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0) + executionMode, err := newSubprocessExecution(logp.DebugLevel, testBinary, strings.TrimPrefix(m.healthCheckExtID, "extension:healthcheckv2/"), 0, 0) require.NoError(t, err, "could not create subprocess execution mode") m.execution = executionMode @@ -1288,6 +1336,9 @@ func TestOTelManager_handleOtelStatusUpdate(t *testing.T) { "extension:elastic_diagnostics/test": { Event: componentstatus.NewEvent(componentstatus.StatusOK), }, + "extension:healthcheckv2/uuid": { + Event: componentstatus.NewEvent(componentstatus.StatusOK), + }, }, }, }, @@ -1373,6 +1424,7 @@ func TestOTelManager_handleOtelStatusUpdate(t *testing.T) { mgr := &OTelManager{ logger: newTestLogger(), components: tt.components, + healthCheckExtID: "extension:healthcheckv2/uuid", currentComponentStates: make(map[string]runtime.ComponentComponentState), }