From d365d19dc526c1bf126c199dcb6a580679af903f Mon Sep 17 00:00:00 2001 From: Aayush Maini Date: Fri, 24 Apr 2026 10:12:36 -0700 Subject: [PATCH] fix: harden Docker scan timeout and cancellation in LinuxContainerDetector - Fix OperationCanceledException swallowing in 5 DockerService methods (CanPingDockerAsync, CanRunLinuxContainersAsync, ImageExistsLocallyAsync, TryPullImageAsync, InspectImageAsync) by adding cancellationToken.ThrowIfCancellationRequested() in catch blocks - Move CanRunLinuxContainersAsync call inside try/catch in LinuxContainerDetector to prevent unhandled OCE crash - Add belt-and-suspenders Task.WhenAny with 30s race timer for RemoveContainerAsync in finally block - Make stream.Dispose() fire-and-forget via Task.Run to prevent blocking if underlying socket close() hangs - Add TelemetryRelay.FlushCurrentTelemetry() before ReadContainerOutputAsync to ensure telemetry is durable before risky long-running operation - Add 60s heartbeat timer in LinuxContainerDetector for process liveness monitoring - Add ILogger breadcrumbs for container start, removal, and cleanup Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../DockerService.cs | 60 +++++++++++++++++-- .../Telemetry/TelemetryRelay.cs | 19 ++++++ .../linux/LinuxContainerDetector.cs | 38 ++++++++---- 3 files changed, 101 insertions(+), 16 deletions(-) diff --git a/src/Microsoft.ComponentDetection.Common/DockerService.cs b/src/Microsoft.ComponentDetection.Common/DockerService.cs index 538a56013..faf3148b9 100644 --- a/src/Microsoft.ComponentDetection.Common/DockerService.cs +++ b/src/Microsoft.ComponentDetection.Common/DockerService.cs @@ -10,6 +10,7 @@ namespace Microsoft.ComponentDetection.Common; using System.Threading.Tasks; using Docker.DotNet; using Docker.DotNet.Models; +using Microsoft.ComponentDetection.Common.Telemetry; using Microsoft.ComponentDetection.Common.Telemetry.Records; using Microsoft.ComponentDetection.Contracts; using Microsoft.ComponentDetection.Contracts.BcdeModels; @@ -38,6 +39,7 @@ public async Task CanPingDockerAsync(CancellationToken cancellationToken = catch (Exception e) { this.logger.LogError(e, "Failed to ping docker"); + cancellationToken.ThrowIfCancellationRequested(); return false; } } @@ -59,6 +61,7 @@ public async Task CanRunLinuxContainersAsync(CancellationToken cancellatio catch (Exception e) { record.ExceptionMessage = e.Message; + cancellationToken.ThrowIfCancellationRequested(); } return false; @@ -79,6 +82,7 @@ public async Task ImageExistsLocallyAsync(string image, CancellationToken catch (Exception e) { record.ExceptionMessage = e.Message; + cancellationToken.ThrowIfCancellationRequested(); return false; } } @@ -114,6 +118,7 @@ public async Task TryPullImageAsync(string image, CancellationToken cancel catch (Exception e) { record.ExceptionMessage = e.Message; + cancellationToken.ThrowIfCancellationRequested(); return false; } } @@ -178,6 +183,7 @@ public async Task InspectImageAsync(string image, Cancellation catch (Exception e) { record.ExceptionMessage = e.Message; + cancellationToken.ThrowIfCancellationRequested(); return null; } } @@ -207,6 +213,12 @@ public async Task InspectImageAsync(string image, Cancellation var stream = await AttachContainerAsync(container.ID, cancellationToken); await StartContainerAsync(container.ID, cancellationToken); + this.logger.LogInformation("Container {ContainerId} started for image {Image}, reading output...", container.ID, image); + + // Flush telemetry before the long-running ReadOutput so we get mid-scan + // data in App Insights even if the process hangs during the read. + TelemetryRelay.Instance.FlushCurrentTelemetry(); + var (stdout, stderr) = await ReadContainerOutputAsync(stream, container.ID, image, cancellationToken); record.Stdout = stdout; @@ -217,13 +229,33 @@ public async Task InspectImageAsync(string image, Cancellation finally { // Best-effort container cleanup with a bounded timeout. - // RemoveContainerAsync already handles not-found, but we must guard against - // the Docker daemon hanging on container removal (e.g. when the container - // process is stuck), which would block the detector indefinitely. + // Use Task.WhenAny as belt-and-suspenders: even if Docker.DotNet's HTTP + // pipeline doesn't honor the CTS (e.g. kernel-level socket blocking), + // we abandon the removal rather than hanging indefinitely. + this.logger.LogInformation("Removing container {ContainerId}...", container.ID); using var removeCts = new CancellationTokenSource(TimeSpan.FromSeconds(30)); try { - await RemoveContainerAsync(container.ID, removeCts.Token); + var removeTask = RemoveContainerAsync(container.ID, removeCts.Token); + var removeTimeout = Task.Delay(TimeSpan.FromSeconds(30), CancellationToken.None); + + if (await Task.WhenAny(removeTask, removeTimeout) == removeTimeout) + { + this.logger.LogWarning( + "RemoveContainerAsync timed out for container {ContainerId}; abandoning cleanup", + container.ID); + + // Observe the abandoned task to prevent unobserved task exceptions + _ = removeTask.ContinueWith( + static _ => { }, + CancellationToken.None, + TaskContinuationOptions.OnlyOnFaulted, + TaskScheduler.Default); + } + else + { + await removeTask; // Observe any exception from completed task + } } catch (Exception ex) { @@ -232,6 +264,8 @@ public async Task InspectImageAsync(string image, Cancellation "Failed to remove container {ContainerId}; abandoning cleanup", container.ID); } + + this.logger.LogInformation("Container {ContainerId} cleanup complete", container.ID); } } @@ -264,8 +298,22 @@ public async Task InspectImageAsync(string image, Cancellation { record.WasCancelled = true; - // Dispose the stream to unblock any pending read operation - stream.Dispose(); + // Dispose the stream to unblock any pending read operation. + // Run in fire-and-forget: if the underlying socket close() blocks + // (e.g. Docker daemon in kernel D-state), we don't want to hang here. + _ = Task.Run( + () => + { + try + { + stream.Dispose(); + } + catch + { + // best effort + } + }, + CancellationToken.None); // Observe the readTask to prevent unobserved task exceptions. // Running any continuation automatically marks the exception as observed. diff --git a/src/Microsoft.ComponentDetection.Common/Telemetry/TelemetryRelay.cs b/src/Microsoft.ComponentDetection.Common/Telemetry/TelemetryRelay.cs index f5b3326d8..d70b95ae5 100644 --- a/src/Microsoft.ComponentDetection.Common/Telemetry/TelemetryRelay.cs +++ b/src/Microsoft.ComponentDetection.Common/Telemetry/TelemetryRelay.cs @@ -48,6 +48,25 @@ public void PostTelemetryRecord(IDetectionTelemetryRecord record) } } + /// + /// Flushes all buffered telemetry records to their services without shutting down. + /// Use this at critical checkpoints to ensure telemetry is delivered even if the process later hangs. + /// + public void FlushCurrentTelemetry() + { + foreach (var service in this.telemetryServices) + { + try + { + service.Flush(); + } + catch + { + // Telemetry should never crash the application + } + } + } + /// /// Disables the sending of telemetry and flushes any messages out of the queue for each service. /// diff --git a/src/Microsoft.ComponentDetection.Detectors/linux/LinuxContainerDetector.cs b/src/Microsoft.ComponentDetection.Detectors/linux/LinuxContainerDetector.cs index e4aa56a74..0a5a4009c 100644 --- a/src/Microsoft.ComponentDetection.Detectors/linux/LinuxContainerDetector.cs +++ b/src/Microsoft.ComponentDetection.Detectors/linux/LinuxContainerDetector.cs @@ -32,6 +32,7 @@ ILogger logger private const LinuxScannerScope DefaultScanScope = LinuxScannerScope.AllLayers; private const string LocalImageMountPoint = "/image"; + private const int HeartbeatIntervalSeconds = 60; // Base image annotations from ADO dockerTask private const string BaseImageRefAnnotation = "image.base.ref.name"; @@ -99,19 +100,32 @@ public async Task ExecuteDetectorAsync( using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); timeoutCts.CancelAfter(GetTimeout(request.DetectorArgs)); - if (!await this.dockerService.CanRunLinuxContainersAsync(timeoutCts.Token)) - { - using var record = new LinuxContainerDetectorUnsupportedOs - { - Os = RuntimeInformation.OSDescription, - }; - this.logger.LogInformation("Linux containers are not available on this host."); - return EmptySuccessfulScan(); - } - var results = Enumerable.Empty(); + + // Heartbeat timer: logs every 60s while the detector is running. + // If the process dies silently (OOM, SIGKILL), the heartbeat stops + // and we know approximately when it happened from the last log entry. + var scanStart = DateTime.UtcNow; + using var heartbeat = new Timer( + _ => this.logger.LogInformation( + "LinuxContainerDetector heartbeat — still scanning ({ElapsedSeconds}s elapsed)", + (DateTime.UtcNow - scanStart).TotalSeconds), + state: null, + dueTime: TimeSpan.FromSeconds(HeartbeatIntervalSeconds), + period: TimeSpan.FromSeconds(HeartbeatIntervalSeconds)); + try { + if (!await this.dockerService.CanRunLinuxContainersAsync(timeoutCts.Token)) + { + using var record = new LinuxContainerDetectorUnsupportedOs + { + Os = RuntimeInformation.OSDescription, + }; + this.logger.LogInformation("Linux containers are not available on this host."); + return EmptySuccessfulScan(); + } + results = await this.ProcessImagesAsync( allImages, request.ComponentRecorder, @@ -132,6 +146,10 @@ public async Task ExecuteDetectorAsync( this.logger.LogError(e, "Unexpected error during Linux container image scanning"); } + this.logger.LogInformation( + "LinuxContainerDetector completed after {ElapsedSeconds}s", + (DateTime.UtcNow - scanStart).TotalSeconds); + return new IndividualDetectorScanResult { ContainerDetails = results