
Commit dfc5606

refactor: consolidate frontend vLLM tests around shared dynamic ports
- Use runtime_services_dynamic_ports + shared ServicePorts/dynamo_dynamic_ports for xdist-safe ports
- Avoid terminating pytest by default in DynamoFrontendProcess
- Prevent tokio runtime drop panic during tokenizer/encoding initialization
- Add per-test pytest timeouts and record suite runtime

Signed-off-by: Keiven Chang <[email protected]>
1 parent f10f594 commit dfc5606

File tree

- lib/parsers/src/reasoning/gpt_oss_parser.rs
- tests/conftest.py
- tests/frontend/test_vllm.py
- tests/utils/managed_process.py
- tests/utils/port_utils.py

5 files changed: +140 −65 lines


lib/parsers/src/reasoning/gpt_oss_parser.rs

Lines changed: 16 additions & 2 deletions
@@ -18,8 +18,22 @@ static GLOBAL_HARMONY_GPTOSS_ENCODING: OnceLock<Result<HarmonyEncoding, anyhow::
     OnceLock::new();

 fn get_harmony_encoding() -> &'static Result<HarmonyEncoding, anyhow::Error> {
-    GLOBAL_HARMONY_GPTOSS_ENCODING
-        .get_or_init(|| load_harmony_encoding(HarmonyEncodingName::HarmonyGptOss))
+    GLOBAL_HARMONY_GPTOSS_ENCODING.get_or_init(|| {
+        // `openai-harmony` currently uses `reqwest::blocking`, which spins up and drops a
+        // Tokio runtime internally. If this runs on a Tokio runtime worker thread, Tokio
+        // will panic when dropping that runtime unless we're in a "blocking allowed"
+        // section. This is frequently triggered from async request handlers.
+        //
+        // `block_in_place` is safe here because the work is one-time initialization
+        // guarded by `OnceLock`, and it prevents panics in async contexts.
+        if tokio::runtime::Handle::try_current().is_ok() {
+            tokio::task::block_in_place(|| {
+                load_harmony_encoding(HarmonyEncodingName::HarmonyGptOss)
+            })
+        } else {
+            load_harmony_encoding(HarmonyEncodingName::HarmonyGptOss)
+        }
+    })
 }

 pub struct GptOssReasoningParser {

tests/conftest.py

Lines changed: 34 additions & 2 deletions
@@ -6,14 +6,15 @@
 import shutil
 import tempfile
 from pathlib import Path
-from typing import Optional
+from typing import Generator, Optional

 import pytest
 from filelock import FileLock

-from tests.utils.constants import TEST_MODELS
+from tests.utils.constants import TEST_MODELS, DefaultPort
 from tests.utils.managed_process import ManagedProcess
 from tests.utils.port_utils import (
+    ServicePorts,
     allocate_port,
     allocate_ports,
     deallocate_port,
@@ -634,3 +635,34 @@ def file_storage_backend():
         os.environ["DYN_FILE_KV"] = old_env
     else:
         os.environ.pop("DYN_FILE_KV", None)
+
+
+########################################################
+# Shared Port Allocation (Dynamo deployments)
+########################################################
+
+
+@pytest.fixture(scope="function")
+def num_system_ports(request) -> int:
+    """Number of system ports to allocate for this test.
+
+    Default: 2 ports (sufficient for most aggregated and disaggregated tests).
+    Override with: @pytest.mark.parametrize("num_system_ports", [4], indirect=True)
+    """
+    return getattr(request, "param", 2)
+
+
+@pytest.fixture(scope="function")
+def dynamo_dynamic_ports(num_system_ports) -> Generator[ServicePorts, None, None]:
+    """Allocate per-test ports for Dynamo deployments.
+
+    - frontend_port: OpenAI-compatible HTTP/gRPC ingress (dynamo.frontend)
+    - system_ports: List of worker metrics/system ports (configurable count via num_system_ports)
+    """
+    frontend_port = allocate_port(DefaultPort.FRONTEND.value)
+    system_port_list = allocate_ports(num_system_ports, DefaultPort.SYSTEM1.value)
+    all_ports = [frontend_port, *system_port_list]
+    try:
+        yield ServicePorts(frontend_port=frontend_port, system_ports=system_port_list)
+    finally:
+        deallocate_ports(all_ports)
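
For context, a minimal sketch of how a test can consume these fixtures (the test name and assertion are illustrative, not part of this commit; the parametrize override follows the `num_system_ports` docstring above):

import pytest

from tests.utils.port_utils import ServicePorts


# Request 4 system ports instead of the default 2, per the fixture docstring.
@pytest.mark.parametrize("num_system_ports", [4], indirect=True)
def test_allocated_ports_are_distinct(dynamo_dynamic_ports: ServicePorts) -> None:
    ports = [dynamo_dynamic_ports.frontend_port, *dynamo_dynamic_ports.system_ports]
    # Distinct per-test ports are what makes the suite safe under pytest-xdist.
    assert len(ports) == len(set(ports))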

tests/frontend/test_vllm.py

Lines changed: 74 additions & 60 deletions
@@ -1,22 +1,27 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

-"""End-to-end tests covering reasoning effort behaviour."""
+"""End-to-end tests covering reasoning effort behaviour.
+
+Runtime note:
+- `python -m pytest tests/frontend/test_vllm.py -v` took ~228s (3m48s) wall time.
+- Expect variance depending on model cache state, compilation warmup, and system load.
+"""

 from __future__ import annotations

 import logging
 import os
 import shutil
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Generator, Optional, Tuple

 import pytest
 import requests

-from tests.conftest import EtcdServer, NatsServer
 from tests.utils.constants import GPT_OSS
-from tests.utils.managed_process import ManagedProcess
+from tests.utils.managed_process import DynamoFrontendProcess, ManagedProcess
 from tests.utils.payloads import check_models_api
+from tests.utils.port_utils import ServicePorts

 logger = logging.getLogger(__name__)

@@ -62,40 +67,20 @@
 }


-class DynamoFrontendProcess(ManagedProcess):
-    """Process manager for Dynamo frontend"""
-
-    def __init__(self, request):
-        command = ["python", "-m", "dynamo.frontend", "--router-mode", "round-robin"]
-
-        # Unset DYN_SYSTEM_PORT - frontend doesn't use system metrics server
-        env = os.environ.copy()
-        env.pop("DYN_SYSTEM_PORT", None)
-
-        log_dir = f"{request.node.name}_frontend"
-
-        # Clean up any existing log directory from previous runs
-        try:
-            shutil.rmtree(log_dir)
-            logger.info(f"Cleaned up existing log directory: {log_dir}")
-        except FileNotFoundError:
-            # Directory doesn't exist, which is fine
-            pass
-
-        super().__init__(
-            command=command,
-            env=env,
-            display_output=True,
-            terminate_existing=True,
-            log_dir=log_dir,
-        )
-
-
 class VllmWorkerProcess(ManagedProcess):
     """Vllm Worker process for GPT-OSS model."""

-    def __init__(self, request, worker_id: str = "vllm-worker"):
+    def __init__(
+        self,
+        request,
+        *,
+        frontend_port: int,
+        system_port: int,
+        worker_id: str = "vllm-worker",
+    ):
         self.worker_id = worker_id
+        self.frontend_port = int(frontend_port)
+        self.system_port = int(system_port)

         command = [
             "python3",
@@ -114,7 +99,7 @@ def __init__(self, request, worker_id: str = "vllm-worker"):
         env = os.environ.copy()
         env["DYN_LOG"] = "debug"
         env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]'
-        env["DYN_SYSTEM_PORT"] = "8083"
+        env["DYN_SYSTEM_PORT"] = str(self.system_port)

         log_dir = f"{request.node.name}_{worker_id}"

@@ -127,8 +112,8 @@ def __init__(self, request, worker_id: str = "vllm-worker"):
             command=command,
             env=env,
             health_check_urls=[
-                ("http://localhost:8000/v1/models", check_models_api),
-                ("http://localhost:8083/health", self.is_ready),
+                (f"http://localhost:{self.frontend_port}/v1/models", check_models_api),
+                (f"http://localhost:{self.system_port}/health", self.is_ready),
             ],
             timeout=500,
             display_output=True,
@@ -155,34 +140,49 @@ def is_ready(self, response) -> bool:

 def _send_chat_request(
     payload: Dict[str, Any],
+    *,
+    base_url: str,
     timeout: int = 180,
 ) -> requests.Response:
     """Send a chat completion request with a specific payload."""
     headers = {"Content-Type": "application/json"}

     response = requests.post(
-        "http://localhost:8000/v1/chat/completions",
+        f"{base_url}/v1/chat/completions",
         headers=headers,
         json=payload,
         timeout=timeout,
     )
     return response


-@pytest.fixture(scope="module")
-def runtime_services(request):
-    """Module-scoped runtime services for this test file."""
-    with NatsServer(request) as nats_process:
-        with EtcdServer(request) as etcd_process:
-            yield nats_process, etcd_process
-
-
-@pytest.fixture(scope="module")
-def start_services(request, runtime_services):
-    """Start frontend and worker processes once for this module's tests."""
-    with DynamoFrontendProcess(request):
+@pytest.fixture(scope="function")
+def start_services(
+    request, runtime_services_dynamic_ports, dynamo_dynamic_ports: ServicePorts
+) -> Generator[None, None, None]:
+    """Start frontend and worker processes for this test.
+
+    `runtime_services_dynamic_ports` ensures NATS/etcd run on per-test ports and sets
+    NATS_SERVER/ETCD_ENDPOINTS env vars for Dynamo to discover them.
+    """
+    _ = runtime_services_dynamic_ports
+    frontend_port = dynamo_dynamic_ports.frontend_port
+    system_port = dynamo_dynamic_ports.system_ports[0]
+    with DynamoFrontendProcess(
+        request,
+        frontend_port=frontend_port,
+        # Optional debugging (not enabled on main):
+        # If the frontend hits a Rust panic, enabling backtraces makes failures diagnosable
+        # from CI logs without needing to repro locally.
+        # extra_env={"RUST_BACKTRACE": "1", "TOKIO_BACKTRACE": "1"},
+        terminate_existing=False,
+    ):
         logger.info("Frontend started for tests")
-        with VllmWorkerProcess(request):
+        with VllmWorkerProcess(
+            request,
+            frontend_port=frontend_port,
+            system_port=system_port,
+        ):
             logger.info("Vllm Worker started for tests")
             yield

@@ -218,8 +218,11 @@ def _validate_chat_response(response: requests.Response) -> Dict[str, Any]:


 @pytest.mark.usefixtures("start_services")
+@pytest.mark.timeout(240)  # ~3x measured total (~70s/test), rounded up
 @pytest.mark.post_merge
-def test_reasoning_effort(request, runtime_services, predownload_models) -> None:
+def test_reasoning_effort(
+    request, dynamo_dynamic_ports: ServicePorts, predownload_models
+) -> None:
     """High reasoning effort should yield more detailed reasoning than low effort."""

     prompt = (
@@ -252,12 +255,13 @@ def test_reasoning_effort(request, runtime_services, predownload_models) -> None
         "chat_template_args": {"reasoning_effort": "low"},
     }

-    high_response = _send_chat_request(high_payload)
+    base_url = f"http://localhost:{dynamo_dynamic_ports.frontend_port}"
+    high_response = _send_chat_request(high_payload, base_url=base_url)
     high_reasoning_text, high_reasoning_tokens = _extract_reasoning_metrics(
         _validate_chat_response(high_response)
     )

-    low_response = _send_chat_request(low_payload)
+    low_response = _send_chat_request(low_payload, base_url=base_url)
     low_reasoning_text, low_reasoning_tokens = _extract_reasoning_metrics(
         _validate_chat_response(low_response)
     )
@@ -281,8 +285,11 @@ def test_reasoning_effort(request, runtime_services, predownload_models) -> None


 @pytest.mark.usefixtures("start_services")
+@pytest.mark.timeout(180)  # ~3x measured total (~50s/test), rounded up
 @pytest.mark.post_merge
-def test_tool_calling(request, runtime_services, predownload_models) -> None:
+def test_tool_calling(
+    request, dynamo_dynamic_ports: ServicePorts, predownload_models
+) -> None:
     """Test tool calling functionality with weather and system health tools."""

     payload = {
@@ -302,7 +309,8 @@ def test_tool_calling(request, runtime_services, predownload_models) -> None:
         "response_format": {"type": "text"},
     }

-    response = _send_chat_request(payload)
+    base_url = f"http://localhost:{dynamo_dynamic_ports.frontend_port}"
+    response = _send_chat_request(payload, base_url=base_url)
     response_data = _validate_chat_response(response)

     logger.info("Tool call response: %s", response_data)
@@ -320,9 +328,10 @@ def test_tool_calling(request, runtime_services, predownload_models) -> None:


 @pytest.mark.usefixtures("start_services")
+@pytest.mark.timeout(180)  # ~3x measured total (~50s/test), rounded up
 @pytest.mark.nightly
 def test_tool_calling_second_round(
-    request, runtime_services, predownload_models
+    request, dynamo_dynamic_ports: ServicePorts, predownload_models
 ) -> None:
     """Test tool calling with a follow-up message containing assistant's prior tool calls."""

@@ -364,7 +373,8 @@ def test_tool_calling_second_round(
         "response_format": {"type": "text"},
     }

-    response = _send_chat_request(payload)
+    base_url = f"http://localhost:{dynamo_dynamic_ports.frontend_port}"
+    response = _send_chat_request(payload, base_url=base_url)
     response_data = _validate_chat_response(response)

     logger.info("Tool call second round response: %s", response_data)
@@ -383,8 +393,11 @@


 @pytest.mark.usefixtures("start_services")
+@pytest.mark.timeout(180)  # ~3x measured total (~57s/test), rounded up
 @pytest.mark.nightly
-def test_reasoning(request, runtime_services, predownload_models) -> None:
+def test_reasoning(
+    request, dynamo_dynamic_ports: ServicePorts, predownload_models
+) -> None:
     """Test reasoning functionality with a mathematical problem."""

     payload = {
@@ -402,7 +415,8 @@ def test_reasoning(request, runtime_services, predownload_models) -> None:
         "max_tokens": 2000,
     }

-    response = _send_chat_request(payload)
+    base_url = f"http://localhost:{dynamo_dynamic_ports.frontend_port}"
+    response = _send_chat_request(payload, base_url=base_url)
     response_data = _validate_chat_response(response)

     logger.info("Reasoning response: %s", response_data)
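
For reference, a rough sketch of how a future test in this file would follow the same pattern: reuse `start_services` and derive the base URL from the per-test frontend port. The endpoint check and assertions below are illustrative only, not part of this commit.

@pytest.mark.usefixtures("start_services")
@pytest.mark.timeout(180)
def test_models_endpoint(
    request, dynamo_dynamic_ports: ServicePorts, predownload_models
) -> None:
    """Illustrative sketch: the frontend on the dynamic port should list the served model."""
    base_url = f"http://localhost:{dynamo_dynamic_ports.frontend_port}"
    # Same endpoint the worker health check polls, but hit explicitly from the test.
    response = requests.get(f"{base_url}/v1/models", timeout=30)
    assert response.status_code == 200
    assert response.json().get("data"), "expected at least one registered model"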

tests/utils/managed_process.py

Lines changed: 3 additions & 1 deletion
@@ -595,7 +595,7 @@ def __init__(
         router_mode: str = "round-robin",
         extra_args: Optional[list[str]] = None,
         extra_env: Optional[dict[str, str]] = None,
-        terminate_existing: bool = True,
+        terminate_existing: bool = False,
     ):
         # TODO: Refactor remaining duplicate "DynamoFrontendProcess" helpers in tests to
         # use this shared implementation (and delete the copies):
@@ -643,6 +643,8 @@ def __init__(
             command=command,
             env=env,
             display_output=True,
+            # Default to False because the launcher is typically `python`, and killing
+            # "existing python" processes can terminate the pytest runner itself.
             terminate_existing=terminate_existing,
             log_dir=log_dir,
         )
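
A minimal sketch of opting back into the old behaviour when a caller explicitly wants stale frontends killed. The helper function is hypothetical; `DynamoFrontendProcess` and its keyword arguments are the ones used in tests/frontend/test_vllm.py above.

from tests.utils.managed_process import DynamoFrontendProcess


def start_frontend_killing_stale(request, frontend_port: int) -> DynamoFrontendProcess:
    # Opt in deliberately: because the launcher is `python`, terminate_existing=True
    # can match unrelated Python processes, including the pytest runner itself.
    return DynamoFrontendProcess(
        request,
        frontend_port=frontend_port,
        terminate_existing=True,
    )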

tests/utils/port_utils.py

Lines changed: 13 additions & 0 deletions
@@ -15,6 +15,7 @@
 import socket
 import tempfile
 import time
+from dataclasses import dataclass
 from pathlib import Path

 # Port allocation lock file
@@ -27,6 +28,18 @@
 _PORT_MAX = 32767


+@dataclass(frozen=True)
+class ServicePorts:
+    """Port allocation for Dynamo service deployments.
+
+    Used by tests that need to pass a cohesive set of ports around (frontend + one or
+    more worker/system ports).
+    """
+
+    frontend_port: int
+    system_ports: list[int]
+
+
 def _load_port_registry() -> dict:
     """Load the port registry from disk.
