
Commit dfc5606

refactor: consolidate frontend vLLM tests around shared dynamic ports
- Use runtime_services_dynamic_ports + shared ServicePorts/dynamo_dynamic_ports for xdist-safe ports
- Avoid terminating pytest by default in DynamoFrontendProcess
- Prevent tokio runtime drop panic during tokenizer/encoding initialization
- Add per-test pytest timeouts and record suite runtime

Signed-off-by: Keiven Chang <[email protected]>
1 parent f10f594 commit dfc5606

File tree

- lib/parsers/src/reasoning/gpt_oss_parser.rs
- tests/conftest.py
- tests/frontend/test_vllm.py
- tests/utils/managed_process.py
- tests/utils/port_utils.py

5 files changed: +140 −65 lines


lib/parsers/src/reasoning/gpt_oss_parser.rs

Lines changed: 16 additions & 2 deletions
@@ -18,8 +18,22 @@ static GLOBAL_HARMONY_GPTOSS_ENCODING: OnceLock<Result<HarmonyEncoding, anyhow::
     OnceLock::new();

 fn get_harmony_encoding() -> &'static Result<HarmonyEncoding, anyhow::Error> {
-    GLOBAL_HARMONY_GPTOSS_ENCODING
-        .get_or_init(|| load_harmony_encoding(HarmonyEncodingName::HarmonyGptOss))
+    GLOBAL_HARMONY_GPTOSS_ENCODING.get_or_init(|| {
+        // `openai-harmony` currently uses `reqwest::blocking`, which spins up and drops a
+        // Tokio runtime internally. If this runs on a Tokio runtime worker thread, Tokio
+        // will panic when dropping that runtime unless we're in a "blocking allowed"
+        // section. This is frequently triggered from async request handlers.
+        //
+        // `block_in_place` is safe here because the work is one-time initialization
+        // guarded by `OnceLock`, and it prevents panics in async contexts.
+        if tokio::runtime::Handle::try_current().is_ok() {
+            tokio::task::block_in_place(|| {
+                load_harmony_encoding(HarmonyEncodingName::HarmonyGptOss)
+            })
+        } else {
+            load_harmony_encoding(HarmonyEncodingName::HarmonyGptOss)
+        }
+    })
 }

 pub struct GptOssReasoningParser {

tests/conftest.py

Lines changed: 34 additions & 2 deletions
@@ -6,14 +6,15 @@
 import shutil
 import tempfile
 from pathlib import Path
-from typing import Optional
+from typing import Generator, Optional

 import pytest
 from filelock import FileLock

-from tests.utils.constants import TEST_MODELS
+from tests.utils.constants import TEST_MODELS, DefaultPort
 from tests.utils.managed_process import ManagedProcess
 from tests.utils.port_utils import (
+    ServicePorts,
     allocate_port,
     allocate_ports,
     deallocate_port,
@@ -634,3 +635,34 @@ def file_storage_backend():
         os.environ["DYN_FILE_KV"] = old_env
     else:
         os.environ.pop("DYN_FILE_KV", None)
+
+
+########################################################
+# Shared Port Allocation (Dynamo deployments)
+########################################################
+
+
+@pytest.fixture(scope="function")
+def num_system_ports(request) -> int:
+    """Number of system ports to allocate for this test.
+
+    Default: 2 ports (sufficient for most aggregated and disaggregated tests).
+    Override with: @pytest.mark.parametrize("num_system_ports", [4], indirect=True)
+    """
+    return getattr(request, "param", 2)
+
+
+@pytest.fixture(scope="function")
+def dynamo_dynamic_ports(num_system_ports) -> Generator[ServicePorts, None, None]:
+    """Allocate per-test ports for Dynamo deployments.
+
+    - frontend_port: OpenAI-compatible HTTP/gRPC ingress (dynamo.frontend)
+    - system_ports: List of worker metrics/system ports (configurable count via num_system_ports)
+    """
+    frontend_port = allocate_port(DefaultPort.FRONTEND.value)
+    system_port_list = allocate_ports(num_system_ports, DefaultPort.SYSTEM1.value)
+    all_ports = [frontend_port, *system_port_list]
+    try:
+        yield ServicePorts(frontend_port=frontend_port, system_ports=system_port_list)
+    finally:
+        deallocate_ports(all_ports)
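
For context, a minimal sketch of how a test can consume these fixtures (the test name and assertion are illustrative, not part of this commit; the parametrize override follows the `num_system_ports` docstring above):

import pytest

from tests.utils.port_utils import ServicePorts


# Request 4 system ports instead of the default 2, per the fixture docstring.
@pytest.mark.parametrize("num_system_ports", [4], indirect=True)
def test_allocated_ports_are_distinct(dynamo_dynamic_ports: ServicePorts) -> None:
    ports = [dynamo_dynamic_ports.frontend_port, *dynamo_dynamic_ports.system_ports]
    # Distinct per-test ports are what makes the suite safe under pytest-xdist.
    assert len(ports) == len(set(ports))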

tests/frontend/test_vllm.py

Lines changed: 74 additions & 60 deletions
@@ -1,22 +1,27 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

-"""End-to-end tests covering reasoning effort behaviour."""
+"""End-to-end tests covering reasoning effort behaviour.
+
+Runtime note:
+- `python -m pytest tests/frontend/test_vllm.py -v` took ~228s (3m48s) wall time.
+- Expect variance depending on model cache state, compilation warmup, and system load.
+"""

 from __future__ import annotations

 import logging
 import os
 import shutil
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Generator, Optional, Tuple

 import pytest
 import requests

-from tests.conftest import EtcdServer, NatsServer
 from tests.utils.constants import GPT_OSS
-from tests.utils.managed_process import ManagedProcess
+from tests.utils.managed_process import DynamoFrontendProcess, ManagedProcess
 from tests.utils.payloads import check_models_api
+from tests.utils.port_utils import ServicePorts

 logger = logging.getLogger(__name__)

@@ -62,40 +67,20 @@
 }


-class DynamoFrontendProcess(ManagedProcess):
-    """Process manager for Dynamo frontend"""
-
-    def __init__(self, request):
-        command = ["python", "-m", "dynamo.frontend", "--router-mode", "round-robin"]
-
-        # Unset DYN_SYSTEM_PORT - frontend doesn't use system metrics server
-        env = os.environ.copy()
-        env.pop("DYN_SYSTEM_PORT", None)
-
-        log_dir = f"{request.node.name}_frontend"
-
-        # Clean up any existing log directory from previous runs
-        try:
-            shutil.rmtree(log_dir)
-            logger.info(f"Cleaned up existing log directory: {log_dir}")
-        except FileNotFoundError:
-            # Directory doesn't exist, which is fine
-            pass
-
-        super().__init__(
-            command=command,
-            env=env,
-            display_output=True,
-            terminate_existing=True,
-            log_dir=log_dir,
-        )
-
-
 class VllmWorkerProcess(ManagedProcess):
     """Vllm Worker process for GPT-OSS model."""

-    def __init__(self, request, worker_id: str = "vllm-worker"):
+    def __init__(
+        self,
+        request,
+        *,
+        frontend_port: int,
+        system_port: int,
+        worker_id: str = "vllm-worker",
+    ):
         self.worker_id = worker_id
+        self.frontend_port = int(frontend_port)
+        self.system_port = int(system_port)

         command = [
             "python3",
@@ -114,7 +99,7 @@ def __init__(self, request, worker_id: str = "vllm-worker"):
         env = os.environ.copy()
         env["DYN_LOG"] = "debug"
         env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]'
-        env["DYN_SYSTEM_PORT"] = "8083"
+        env["DYN_SYSTEM_PORT"] = str(self.system_port)

         log_dir = f"{request.node.name}_{worker_id}"

@@ -127,8 +112,8 @@ def __init__(self, request, worker_id: str = "vllm-worker"):
             command=command,
             env=env,
             health_check_urls=[
-                ("http://localhost:8000/v1/models", check_models_api),
-                ("http://localhost:8083/health", self.is_ready),
+                (f"http://localhost:{self.frontend_port}/v1/models", check_models_api),
+                (f"http://localhost:{self.system_port}/health", self.is_ready),
             ],
             timeout=500,
             display_output=True,
@@ -155,34 +140,49 @@ def is_ready(self, response) -> bool:

 def _send_chat_request(
     payload: Dict[str, Any],
+    *,
+    base_url: str,
     timeout: int = 180,
 ) -> requests.Response:
     """Send a chat completion request with a specific payload."""
     headers = {"Content-Type": "application/json"}

     response = requests.post(
-        "http://localhost:8000/v1/chat/completions",
+        f"{base_url}/v1/chat/completions",
         headers=headers,
         json=payload,
         timeout=timeout,
     )
     return response


-@pytest.fixture(scope="module")
-def runtime_services(request):
-    """Module-scoped runtime services for this test file."""
-    with NatsServer(request) as nats_process:
-        with EtcdServer(request) as etcd_process:
-            yield nats_process, etcd_process
-
-
-@pytest.fixture(scope="module")
-def start_services(request, runtime_services):
-    """Start frontend and worker processes once for this module's tests."""
-    with DynamoFrontendProcess(request):
+@pytest.fixture(scope="function")
+def start_services(
+    request, runtime_services_dynamic_ports, dynamo_dynamic_ports: ServicePorts
+) -> Generator[None, None, None]:
+    """Start frontend and worker processes for this test.
+
+    `runtime_services_dynamic_ports` ensures NATS/etcd run on per-test ports and sets
+    NATS_SERVER/ETCD_ENDPOINTS env vars for Dynamo to discover them.
+    """
+    _ = runtime_services_dynamic_ports
+    frontend_port = dynamo_dynamic_ports.frontend_port
+    system_port = dynamo_dynamic_ports.system_ports[0]
+    with DynamoFrontendProcess(
+        request,
+        frontend_port=frontend_port,
+        # Optional debugging (not enabled on main):
+        # If the frontend hits a Rust panic, enabling backtraces makes failures diagnosable
+        # from CI logs without needing to repro locally.
+        # extra_env={"RUST_BACKTRACE": "1", "TOKIO_BACKTRACE": "1"},
+        terminate_existing=False,
+    ):
         logger.info("Frontend started for tests")
-        with VllmWorkerProcess(request):
+        with VllmWorkerProcess(
+            request,
+            frontend_port=frontend_port,
+            system_port=system_port,
+        ):
             logger.info("Vllm Worker started for tests")
             yield

@@ -218,8 +218,11 @@ def _validate_chat_response(response: requests.Response) -> Dict[str, Any]:


 @pytest.mark.usefixtures("start_services")
+@pytest.mark.timeout(240)  # ~3x measured total (~70s/test), rounded up
 @pytest.mark.post_merge
-def test_reasoning_effort(request, runtime_services, predownload_models) -> None:
+def test_reasoning_effort(
+    request, dynamo_dynamic_ports: ServicePorts, predownload_models
+) -> None:
     """High reasoning effort should yield more detailed reasoning than low effort."""

     prompt = (
@@ -252,12 +255,13 @@ def test_reasoning_effort(request, runtime_services, predownload_models) -> None
         "chat_template_args": {"reasoning_effort": "low"},
     }

-    high_response = _send_chat_request(high_payload)
+    base_url = f"http://localhost:{dynamo_dynamic_ports.frontend_port}"
+    high_response = _send_chat_request(high_payload, base_url=base_url)
     high_reasoning_text, high_reasoning_tokens = _extract_reasoning_metrics(
         _validate_chat_response(high_response)
     )

-    low_response = _send_chat_request(low_payload)
+    low_response = _send_chat_request(low_payload, base_url=base_url)
     low_reasoning_text, low_reasoning_tokens = _extract_reasoning_metrics(
         _validate_chat_response(low_response)
     )
@@ -281,8 +285,11 @@ def test_reasoning_effort(request, runtime_services, predownload_models) -> None


 @pytest.mark.usefixtures("start_services")
+@pytest.mark.timeout(180)  # ~3x measured total (~50s/test), rounded up
 @pytest.mark.post_merge
-def test_tool_calling(request, runtime_services, predownload_models) -> None:
+def test_tool_calling(
+    request, dynamo_dynamic_ports: ServicePorts, predownload_models
+) -> None:
     """Test tool calling functionality with weather and system health tools."""

     payload = {
@@ -302,7 +309,8 @@ def test_tool_calling(request, runtime_services, predownload_models) -> None:
         "response_format": {"type": "text"},
     }

-    response = _send_chat_request(payload)
+    base_url = f"http://localhost:{dynamo_dynamic_ports.frontend_port}"
+    response = _send_chat_request(payload, base_url=base_url)
     response_data = _validate_chat_response(response)

     logger.info("Tool call response: %s", response_data)
@@ -320,9 +328,10 @@ def test_tool_calling(request, runtime_services, predownload_models) -> None:


 @pytest.mark.usefixtures("start_services")
+@pytest.mark.timeout(180)  # ~3x measured total (~50s/test), rounded up
 @pytest.mark.nightly
 def test_tool_calling_second_round(
-    request, runtime_services, predownload_models
+    request, dynamo_dynamic_ports: ServicePorts, predownload_models
 ) -> None:
     """Test tool calling with a follow-up message containing assistant's prior tool calls."""

@@ -364,7 +373,8 @@ def test_tool_calling_second_round(
         "response_format": {"type": "text"},
     }

-    response = _send_chat_request(payload)
+    base_url = f"http://localhost:{dynamo_dynamic_ports.frontend_port}"
+    response = _send_chat_request(payload, base_url=base_url)
     response_data = _validate_chat_response(response)

     logger.info("Tool call second round response: %s", response_data)
@@ -383,8 +393,11 @@


 @pytest.mark.usefixtures("start_services")
+@pytest.mark.timeout(180)  # ~3x measured total (~57s/test), rounded up
 @pytest.mark.nightly
-def test_reasoning(request, runtime_services, predownload_models) -> None:
+def test_reasoning(
+    request, dynamo_dynamic_ports: ServicePorts, predownload_models
+) -> None:
     """Test reasoning functionality with a mathematical problem."""

     payload = {
@@ -402,7 +415,8 @@ def test_reasoning(request, runtime_services, predownload_models) -> None:
         "max_tokens": 2000,
     }

-    response = _send_chat_request(payload)
+    base_url = f"http://localhost:{dynamo_dynamic_ports.frontend_port}"
+    response = _send_chat_request(payload, base_url=base_url)
     response_data = _validate_chat_response(response)

     logger.info("Reasoning response: %s", response_data)
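
For reference, a rough sketch of how a future test in this file would follow the same pattern: reuse `start_services` and derive the base URL from the per-test frontend port. The endpoint check and assertions below are illustrative only, not part of this commit.

@pytest.mark.usefixtures("start_services")
@pytest.mark.timeout(180)
def test_models_endpoint(
    request, dynamo_dynamic_ports: ServicePorts, predownload_models
) -> None:
    """Illustrative sketch: the frontend on the dynamic port should list the served model."""
    base_url = f"http://localhost:{dynamo_dynamic_ports.frontend_port}"
    # Same endpoint the worker health check polls, but hit explicitly from the test.
    response = requests.get(f"{base_url}/v1/models", timeout=30)
    assert response.status_code == 200
    assert response.json().get("data"), "expected at least one registered model"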

tests/utils/managed_process.py

Lines changed: 3 additions & 1 deletion
@@ -595,7 +595,7 @@ def __init__(
         router_mode: str = "round-robin",
         extra_args: Optional[list[str]] = None,
         extra_env: Optional[dict[str, str]] = None,
-        terminate_existing: bool = True,
+        terminate_existing: bool = False,
     ):
         # TODO: Refactor remaining duplicate "DynamoFrontendProcess" helpers in tests to
         # use this shared implementation (and delete the copies):
@@ -643,6 +643,8 @@ def __init__(
             command=command,
             env=env,
             display_output=True,
+            # Default to False because the launcher is typically `python`, and killing
+            # "existing python" processes can terminate the pytest runner itself.
             terminate_existing=terminate_existing,
             log_dir=log_dir,
         )
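
A minimal sketch of opting back into the old behaviour when a caller explicitly wants stale frontends killed. The helper function is hypothetical; `DynamoFrontendProcess` and its keyword arguments are the ones used in tests/frontend/test_vllm.py above.

from tests.utils.managed_process import DynamoFrontendProcess


def start_frontend_killing_stale(request, frontend_port: int) -> DynamoFrontendProcess:
    # Opt in deliberately: because the launcher is `python`, terminate_existing=True
    # can match unrelated Python processes, including the pytest runner itself.
    return DynamoFrontendProcess(
        request,
        frontend_port=frontend_port,
        terminate_existing=True,
    )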

tests/utils/port_utils.py

Lines changed: 13 additions & 0 deletions
@@ -15,6 +15,7 @@
 import socket
 import tempfile
 import time
+from dataclasses import dataclass
 from pathlib import Path

 # Port allocation lock file
@@ -27,6 +28,18 @@
 _PORT_MAX = 32767


+@dataclass(frozen=True)
+class ServicePorts:
+    """Port allocation for Dynamo service deployments.
+
+    Used by tests that need to pass a cohesive set of ports around (frontend + one or
+    more worker/system ports).
+    """
+
+    frontend_port: int
+    system_ports: list[int]
+
+
 def _load_port_registry() -> dict:
     """Load the port registry from disk.
