Skip to content

Commit dba935f

Browse files
[8.19] (backport #11718) [OTEL] Hide healthcheckv2 from agent status. (#11775)
* [OTEL] Hide healthcheckv2 from agent status. (#11718) * Hide healthcheckv2 from agent status. * Add changelog entry. * Update entry. * Filter out by specific UUID. (cherry picked from commit 81d77f5) # Conflicts: # internal/pkg/otel/manager/manager.go * Fix conflict. --------- Co-authored-by: Blake Rouse <[email protected]>
1 parent e012ad8 commit dba935f

File tree

4 files changed

+132
-33
lines changed

4 files changed

+132
-33
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# REQUIRED
2+
# Kind can be one of:
3+
# - breaking-change: a change to previously-documented behavior
4+
# - deprecation: functionality that is being removed in a later release
5+
# - bug-fix: fixes a problem in a previous version
6+
# - enhancement: extends functionality but does not break or fix existing behavior
7+
# - feature: new functionality
8+
# - known-issue: problems that we are aware of in a given version
9+
# - security: impacts on the security of a product or a user's deployment.
10+
# - upgrade: important information for someone upgrading from a prior version
11+
# - other: does not fit into any of the other categories
12+
kind: bug-fix
13+
14+
# REQUIRED for all kinds
15+
# Change summary; a description of the change, roughly 80 characters long.
16+
summary: hide healthcheckv2 from status output
17+
18+
# REQUIRED for breaking-change, deprecation, known-issue
19+
# Long description; in case the summary is not enough to describe the change
20+
# this field accommodates a description without length limits.
21+
# description:
22+
23+
# REQUIRED for breaking-change, deprecation, known-issue
24+
# impact:
25+
26+
# REQUIRED for breaking-change, deprecation, known-issue
27+
# action:
28+
29+
# REQUIRED for all kinds
30+
# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
31+
component: elastic-agent
32+
33+
# AUTOMATED
34+
# OPTIONAL to manually add other PR URLs
35+
# PR URL: A link to the PR that added the changeset.
36+
# If not present, it is automatically filled in by the tooling, which finds the PR where this changelog fragment was added.
37+
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
38+
# Please provide it if you are adding a fragment for a different PR.
39+
# pr: https://github.com/owner/repo/1234
40+
41+
# AUTOMATED
42+
# OPTIONAL to manually add other issue URLs
43+
# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
44+
# If not present, it is automatically filled in by the tooling with the issue linked to the PR number.
45+
# issue: https://github.com/owner/repo/1234

internal/pkg/otel/manager/execution_subprocess.go

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ import (
1414
"sync"
1515
"time"
1616

17-
"github.com/gofrs/uuid/v5"
1817
"go.opentelemetry.io/collector/component"
1918
"gopkg.in/yaml.v3"
2019

@@ -41,16 +40,12 @@ const (
4140

4241
// newSubprocessExecution creates a new execution which runs the otel collector in a subprocess. A metricsPort or
4342
// healthCheckPort of 0 will result in a random port being used.
44-
func newSubprocessExecution(logLevel logp.Level, collectorPath string, metricsPort int, healthCheckPort int) (*subprocessExecution, error) {
45-
nsUUID, err := uuid.NewV4()
46-
if err != nil {
47-
return nil, fmt.Errorf("cannot generate UUID: %w", err)
48-
}
43+
func newSubprocessExecution(logLevel logp.Level, collectorPath string, uuid string, metricsPort int, healthCheckPort int) (*subprocessExecution, error) {
4944
componentType, err := component.NewType(healthCheckExtensionName)
5045
if err != nil {
5146
return nil, fmt.Errorf("cannot create component type: %w", err)
5247
}
53-
healthCheckExtensionID := component.NewIDWithName(componentType, nsUUID.String()).String()
48+
healthCheckExtensionID := component.NewIDWithName(componentType, uuid).String()
5449

5550
return &subprocessExecution{
5651
collectorPath: collectorPath,

internal/pkg/otel/manager/manager.go

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,24 +18,21 @@ import (
1818
"sync/atomic"
1919
"time"
2020

21+
"github.com/gofrs/uuid/v5"
22+
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/status"
23+
"go.opentelemetry.io/collector/confmap"
2124
"go.uber.org/zap"
2225

23-
"github.com/elastic/elastic-agent/internal/pkg/agent/configuration"
24-
25-
componentmonitoring "github.com/elastic/elastic-agent/internal/pkg/agent/application/monitoring/component"
26-
2726
"github.com/elastic/elastic-agent-client/v7/pkg/client"
27+
"github.com/elastic/elastic-agent-libs/logp"
28+
2829
"github.com/elastic/elastic-agent/internal/pkg/agent/application/info"
30+
componentmonitoring "github.com/elastic/elastic-agent/internal/pkg/agent/application/monitoring/component"
2931
"github.com/elastic/elastic-agent/internal/pkg/agent/application/paths"
32+
"github.com/elastic/elastic-agent/internal/pkg/agent/configuration"
3033
"github.com/elastic/elastic-agent/internal/pkg/otel/translate"
3134
"github.com/elastic/elastic-agent/pkg/component"
3235
"github.com/elastic/elastic-agent/pkg/component/runtime"
33-
34-
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/status"
35-
"go.opentelemetry.io/collector/confmap"
36-
37-
"github.com/elastic/elastic-agent-libs/logp"
38-
3936
"github.com/elastic/elastic-agent/pkg/core/logger"
4037
)
4138

@@ -82,8 +79,9 @@ type OTelManager struct {
8279
agentInfo info.Agent
8380
beatMonitoringConfigGetter translate.BeatMonitoringConfigGetter
8481

85-
collectorCfg *confmap.Conf
86-
components []component.Component
82+
healthCheckExtID string
83+
collectorCfg *confmap.Conf
84+
components []component.Component
8785

8886
// The current configuration that the OTel collector is using. In the case that
8987
// the mergedCollectorCfg is nil then the collector is not running.
@@ -142,6 +140,12 @@ func NewOTelManager(
142140
var recoveryTimer collectorRecoveryTimer
143141
var err error
144142

143+
hcUUID, err := uuid.NewV4()
144+
if err != nil {
145+
return nil, fmt.Errorf("cannot generate UUID: %w", err)
146+
}
147+
hcUUIDStr := hcUUID.String()
148+
145149
// determine the otel collector ports
146150
collectorMetricsPort, collectorHealthCheckPort := 0, 0
147151
if agentCollectorConfig != nil {
@@ -168,7 +172,7 @@ func NewOTelManager(
168172
return nil, fmt.Errorf("failed to get the path to the collector executable: %w", err)
169173
}
170174
recoveryTimer = newRecoveryBackoff(100*time.Nanosecond, 10*time.Second, time.Minute)
171-
exec, err = newSubprocessExecution(logLevel, executable, collectorMetricsPort, collectorHealthCheckPort)
175+
exec, err = newSubprocessExecution(logLevel, executable, hcUUIDStr, collectorMetricsPort, collectorHealthCheckPort)
172176
if err != nil {
173177
return nil, fmt.Errorf("failed to create subprocess execution: %w", err)
174178
}
@@ -183,6 +187,7 @@ func NewOTelManager(
183187
baseLogger: baseLogger,
184188
agentInfo: agentInfo,
185189
beatMonitoringConfigGetter: beatMonitoringConfigGetter,
190+
healthCheckExtID: fmt.Sprintf("extension:healthcheckv2/%s", hcUUIDStr),
186191
errCh: make(chan error, 1), // holds at most one error
187192
collectorStatusCh: make(chan *status.AggregateStatus, 1),
188193
// componentStateCh uses a buffer channel to ensure that no state transitions are missed and to prevent
@@ -534,6 +539,8 @@ func (m *OTelManager) handleOtelStatusUpdate(otelStatus *status.AggregateStatus)
534539
delete(extensionsMap.ComponentStatusMap, extensionKey)
535540
case strings.HasPrefix(extensionKey, "extension:elastic_diagnostics"):
536541
delete(extensionsMap.ComponentStatusMap, extensionKey)
542+
case extensionKey == m.healthCheckExtID:
543+
delete(extensionsMap.ComponentStatusMap, extensionKey)
537544
}
538545
}
539546

internal/pkg/otel/manager/manager_test.go

Lines changed: 65 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,11 @@ func TestOTelManager_Run(t *testing.T) {
348348
{
349349
name: "subprocess collector config updates",
350350
execModeFn: func(collectorRunErr chan error) (collectorExecution, error) {
351-
return newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0)
351+
hcUUID, err := uuid.NewV4()
352+
if err != nil {
353+
return nil, fmt.Errorf("cannot generate UUID: %w", err)
354+
}
355+
return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0)
352356
},
353357
restarter: newRecoveryBackoff(100*time.Nanosecond, 10*time.Second, time.Minute),
354358
testFn: func(t *testing.T, m *OTelManager, e *EventListener, exec *testExecution, managerCtxCancel context.CancelFunc, collectorRunErr chan error) {
@@ -376,7 +380,11 @@ func TestOTelManager_Run(t *testing.T) {
376380
{
377381
name: "subprocess collector stopped gracefully outside manager",
378382
execModeFn: func(collectorRunErr chan error) (collectorExecution, error) {
379-
return newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0)
383+
hcUUID, err := uuid.NewV4()
384+
if err != nil {
385+
return nil, fmt.Errorf("cannot generate UUID: %w", err)
386+
}
387+
return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0)
380388
},
381389
restarter: newRecoveryBackoff(100*time.Nanosecond, 10*time.Second, time.Minute),
382390
testFn: func(t *testing.T, m *OTelManager, e *EventListener, exec *testExecution, managerCtxCancel context.CancelFunc, collectorRunErr chan error) {
@@ -405,7 +413,11 @@ func TestOTelManager_Run(t *testing.T) {
405413
{
406414
name: "subprocess collector killed outside manager",
407415
execModeFn: func(collectorRunErr chan error) (collectorExecution, error) {
408-
return newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0)
416+
hcUUID, err := uuid.NewV4()
417+
if err != nil {
418+
return nil, fmt.Errorf("cannot generate UUID: %w", err)
419+
}
420+
return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0)
409421
},
410422
restarter: newRecoveryBackoff(100*time.Nanosecond, 10*time.Second, time.Minute),
411423
testFn: func(t *testing.T, m *OTelManager, e *EventListener, exec *testExecution, managerCtxCancel context.CancelFunc, collectorRunErr chan error) {
@@ -448,7 +460,11 @@ func TestOTelManager_Run(t *testing.T) {
448460
{
449461
name: "subprocess collector panics restarts",
450462
execModeFn: func(collectorRunErr chan error) (collectorExecution, error) {
451-
return newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0)
463+
hcUUID, err := uuid.NewV4()
464+
if err != nil {
465+
return nil, fmt.Errorf("cannot generate UUID: %w", err)
466+
}
467+
return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0)
452468
},
453469
restarter: newRecoveryBackoff(100*time.Nanosecond, 10*time.Second, time.Minute),
454470
testFn: func(t *testing.T, m *OTelManager, e *EventListener, exec *testExecution, managerCtxCancel context.CancelFunc, collectorRunErr chan error) {
@@ -483,7 +499,11 @@ func TestOTelManager_Run(t *testing.T) {
483499
{
484500
name: "subprocess collector panics reports fatal",
485501
execModeFn: func(collectorRunErr chan error) (collectorExecution, error) {
486-
return newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0)
502+
hcUUID, err := uuid.NewV4()
503+
if err != nil {
504+
return nil, fmt.Errorf("cannot generate UUID: %w", err)
505+
}
506+
return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0)
487507
},
488508
restarter: newRecoveryBackoff(100*time.Nanosecond, 10*time.Second, time.Minute),
489509
testFn: func(t *testing.T, m *OTelManager, e *EventListener, exec *testExecution, managerCtxCancel context.CancelFunc, collectorRunErr chan error) {
@@ -524,7 +544,11 @@ func TestOTelManager_Run(t *testing.T) {
524544
{
525545
name: "subprocess collector killed if delayed and manager is stopped",
526546
execModeFn: func(collectorRunErr chan error) (collectorExecution, error) {
527-
subprocessExec, err := newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0)
547+
hcUUID, err := uuid.NewV4()
548+
if err != nil {
549+
return nil, fmt.Errorf("cannot generate UUID: %w", err)
550+
}
551+
subprocessExec, err := newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0)
528552
if err != nil {
529553
return nil, err
530554
}
@@ -577,7 +601,11 @@ func TestOTelManager_Run(t *testing.T) {
577601
{
578602
name: "subprocess collector gracefully exited if delayed a bit and manager is stopped",
579603
execModeFn: func(collectorRunErr chan error) (collectorExecution, error) {
580-
subprocessExec, err := newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0)
604+
hcUUID, err := uuid.NewV4()
605+
if err != nil {
606+
return nil, fmt.Errorf("cannot generate UUID: %w", err)
607+
}
608+
subprocessExec, err := newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0)
581609
if err != nil {
582610
return nil, err
583611
}
@@ -630,7 +658,11 @@ func TestOTelManager_Run(t *testing.T) {
630658
{
631659
name: "subprocess user has healthcheck extension",
632660
execModeFn: func(collectorRunErr chan error) (collectorExecution, error) {
633-
return newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0)
661+
hcUUID, err := uuid.NewV4()
662+
if err != nil {
663+
return nil, fmt.Errorf("cannot generate UUID: %w", err)
664+
}
665+
return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0)
634666
},
635667
restarter: newRecoveryBackoff(100*time.Nanosecond, 10*time.Second, time.Minute),
636668
testFn: func(t *testing.T, m *OTelManager, e *EventListener, exec *testExecution, managerCtxCancel context.CancelFunc, collectorRunErr chan error) {
@@ -665,7 +697,11 @@ func TestOTelManager_Run(t *testing.T) {
665697
{
666698
name: "subprocess collector empty config",
667699
execModeFn: func(collectorRunErr chan error) (collectorExecution, error) {
668-
return newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0)
700+
hcUUID, err := uuid.NewV4()
701+
if err != nil {
702+
return nil, fmt.Errorf("cannot generate UUID: %w", err)
703+
}
704+
return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0)
669705
},
670706
restarter: newRecoveryBackoff(100*time.Nanosecond, 10*time.Second, time.Minute),
671707
skipListeningErrors: true,
@@ -712,7 +748,11 @@ func TestOTelManager_Run(t *testing.T) {
712748
{
713749
name: "subprocess collector failed to start",
714750
execModeFn: func(collectorRunErr chan error) (collectorExecution, error) {
715-
return newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0)
751+
hcUUID, err := uuid.NewV4()
752+
if err != nil {
753+
return nil, fmt.Errorf("cannot generate UUID: %w", err)
754+
}
755+
return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0)
716756
},
717757
restarter: newRecoveryBackoff(100*time.Nanosecond, 10*time.Second, time.Minute),
718758
testFn: func(t *testing.T, m *OTelManager, e *EventListener, exec *testExecution, managerCtxCancel context.CancelFunc, collectorRunErr chan error) {
@@ -846,7 +886,11 @@ func TestOTelManager_Logging(t *testing.T) {
846886
{
847887
name: "subprocess execution",
848888
execModeFn: func(collectorRunErr chan error) (collectorExecution, error) {
849-
return newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0)
889+
hcUUID, err := uuid.NewV4()
890+
if err != nil {
891+
return nil, fmt.Errorf("cannot generate UUID: %w", err)
892+
}
893+
return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), 0, 0)
850894
},
851895
},
852896
} {
@@ -919,7 +963,11 @@ func TestOTelManager_Ports(t *testing.T) {
919963
{
920964
name: "subprocess execution",
921965
execModeFn: func(collectorRunErr chan error) (collectorExecution, error) {
922-
return newSubprocessExecution(logp.DebugLevel, testBinary, metricsPort, healthCheckPort)
966+
hcUUID, err := uuid.NewV4()
967+
if err != nil {
968+
return nil, fmt.Errorf("cannot generate UUID: %w", err)
969+
}
970+
return newSubprocessExecution(logp.DebugLevel, testBinary, hcUUID.String(), metricsPort, healthCheckPort)
923971
},
924972
healthCheckEnabled: true,
925973
},
@@ -1071,7 +1119,7 @@ func TestOTelManager_PortConflict(t *testing.T) {
10711119
waitTimeForStop,
10721120
)
10731121
require.NoError(t, err, "could not create otel manager")
1074-
executionMode, err := newSubprocessExecution(logp.DebugLevel, testBinary, 0, 0)
1122+
executionMode, err := newSubprocessExecution(logp.DebugLevel, testBinary, strings.TrimPrefix(m.healthCheckExtID, "extension:healthcheckv2/"), 0, 0)
10751123
require.NoError(t, err, "could not create subprocess execution mode")
10761124
m.execution = executionMode
10771125

@@ -1288,6 +1336,9 @@ func TestOTelManager_handleOtelStatusUpdate(t *testing.T) {
12881336
"extension:elastic_diagnostics/test": {
12891337
Event: componentstatus.NewEvent(componentstatus.StatusOK),
12901338
},
1339+
"extension:healthcheckv2/uuid": {
1340+
Event: componentstatus.NewEvent(componentstatus.StatusOK),
1341+
},
12911342
},
12921343
},
12931344
},
@@ -1373,6 +1424,7 @@ func TestOTelManager_handleOtelStatusUpdate(t *testing.T) {
13731424
mgr := &OTelManager{
13741425
logger: newTestLogger(),
13751426
components: tt.components,
1427+
healthCheckExtID: "extension:healthcheckv2/uuid",
13761428
currentComponentStates: make(map[string]runtime.ComponentComponentState),
13771429
}
13781430

0 commit comments

Comments
 (0)