From 35bb08412bc79142eb9b333daccb9a05f8c48bcc Mon Sep 17 00:00:00 2001 From: David Gageot Date: Sat, 18 Apr 2026 09:02:04 +0200 Subject: [PATCH] shell: fix hang when a tool command backgrounds a child process MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a shell command backgrounds a grandchild (e.g. `docker run ... &`), the grandchild inherits the stdout/stderr pipe fds created by exec.Cmd for non-*os.File writers. cmd.Wait() then blocks until EOF on those pipes, which never comes while the grandchild holds them open — making the tool call hang until the configured timeout (observed wedging an eval run for minutes after the agent ran a detached `docker run`). Set cmd.WaitDelay to 500ms in runNativeCommand so that once the direct shell child has exited, Go force-closes the pipe read ends and Wait() returns promptly. The grandchild is left untouched — exactly what the user's `&` intended. Add two regression tests covering the plain backgrounded-child case and the detached (setsid) variant where the process-group kill fallback cannot rescue us. Assisted-By: docker-agent --- pkg/tools/builtin/shell.go | 16 ++++++++ pkg/tools/builtin/shell_test.go | 72 +++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/pkg/tools/builtin/shell.go b/pkg/tools/builtin/shell.go index f0853afaa..ac184ed06 100644 --- a/pkg/tools/builtin/shell.go +++ b/pkg/tools/builtin/shell.go @@ -150,11 +150,27 @@ func (h *shellHandler) RunShell(ctx context.Context, params RunShellArgs) (*tool return h.runNativeCommand(timeoutCtx, ctx, params.Cmd, cwd, timeout), nil } +// waitDelayAfterShellExit caps how long cmd.Wait() blocks on stdout/stderr +// copy goroutines after the direct shell child has exited. +// +// When cmd.Stdout/Stderr are not *os.File, Go's exec package creates OS pipes +// and spawns copy goroutines; cmd.Wait() only returns after *both* the child +// exits and those goroutines see EOF on the pipes. If the command backgrounds +// a grandchild (e.g. `docker run ... &`, `sleep 10 &`) that inherits the pipe +// fds, the pipes stay open and Wait() blocks until the configured timeout. +// +// cmd.WaitDelay tells Go to force-close the pipes and return this long after +// the direct child has exited, letting the grandchild keep running while the +// tool call returns promptly. A short delay is plenty because any output the +// shell itself produced is already flushed by the time it exits. +const waitDelayAfterShellExit = 500 * time.Millisecond + func (h *shellHandler) runNativeCommand(timeoutCtx, ctx context.Context, command, cwd string, timeout time.Duration) *tools.ToolCallResult { cmd := exec.Command(h.shell, append(h.shellArgsPrefix, command)...) cmd.Env = h.env cmd.Dir = cwd cmd.SysProcAttr = platformSpecificSysProcAttr() + cmd.WaitDelay = waitDelayAfterShellExit var outBuf bytes.Buffer cmd.Stdout = &outBuf diff --git a/pkg/tools/builtin/shell_test.go b/pkg/tools/builtin/shell_test.go index 4de61fb82..ef251ac33 100644 --- a/pkg/tools/builtin/shell_test.go +++ b/pkg/tools/builtin/shell_test.go @@ -2,7 +2,10 @@ package builtin import ( "os" + "os/exec" + "runtime" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -177,3 +180,72 @@ func TestShellTool_RelativeCwdResolvesAgainstWorkingDir(t *testing.T) { assert.Contains(t, result.Output, subdir, "relative cwd must resolve against the configured workingDir, not the process cwd") } + +// Regression test for a shell-tool hang caused by backgrounded grandchildren. +// +// A command like `sleep 10 &` makes the shell exit immediately, but the +// backgrounded sleep inherits stdout/stderr. Without cmd.WaitDelay, Go's +// exec.Cmd.Wait() blocks reading the pipe until the configured timeout, +// which makes the tool call hang (observed in eval runs where the agent +// launched a server with `docker run ... &`). +// +// With the WaitDelay safeguard the tool must return within a small fraction +// of the configured timeout. +func TestShellTool_BackgroundedChildDoesNotBlockReturn(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("POSIX shell backgrounding semantics; skipped on Windows") + } + + tool := NewShellTool(nil, &config.RuntimeConfig{Config: config.Config{WorkingDir: t.TempDir()}}) + + start := time.Now() + result, err := tool.handler.RunShell(t.Context(), RunShellArgs{ + // sleep inherits stdout/stderr from the shell and holds the pipe + // open for 30s. The tool must return as soon as the shell exits. + Cmd: "sleep 30 &", + Timeout: 20, + }) + elapsed := time.Since(start) + + require.NoError(t, err) + require.NotNil(t, result) + assert.Less(t, elapsed, 5*time.Second, + "shell tool must return promptly when the command backgrounds a child "+ + "that inherits stdout/stderr; elapsed=%s", elapsed) +} + +// Even when the backgrounded child detaches into its own session (so the +// shell tool's process-group kill cannot reach it on timeout), cmd.WaitDelay +// must still allow the tool call to return. +func TestShellTool_DetachedBackgroundedChildDoesNotBlockReturn(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("POSIX shell backgrounding semantics; skipped on Windows") + } + if _, err := exec.LookPath("setsid"); err != nil { + t.Skip("setsid not available") + } + + tool := NewShellTool(nil, &config.RuntimeConfig{Config: config.Config{WorkingDir: t.TempDir()}}) + + done := make(chan struct{}) + var result *tools.ToolCallResult + var err error + go func() { + defer close(done) + result, err = tool.handler.RunShell(t.Context(), RunShellArgs{ + // setsid places sleep in its own session/process group, so the + // process-group kill fallback in the timeout path cannot reach + // it. Only cmd.WaitDelay can unblock Wait() here. + Cmd: "setsid sleep 30 &", + Timeout: 20, + }) + }() + + select { + case <-done: + require.NoError(t, err) + require.NotNil(t, result) + case <-time.After(10 * time.Second): + t.Fatal("shell tool hung when command backgrounded a detached child") + } +}