11# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22# SPDX-License-Identifier: Apache-2.0
33
4- """End-to-end tests covering reasoning effort behaviour."""
4+ """End-to-end tests covering reasoning effort behaviour.
5+
6+ Runtime note:
7+ - `python -m pytest tests/frontend/test_vllm.py -v` took ~228s (3m48s) wall time.
8+ - Expect variance depending on model cache state, compilation warmup, and system load.
9+ """
510
611from __future__ import annotations
712
813import logging
914import os
1015import shutil
11- from typing import Any , Dict , Optional , Tuple
16+ from typing import Any , Dict , Generator , Optional , Tuple
1217
1318import pytest
1419import requests
1520
16- from tests .conftest import EtcdServer , NatsServer
1721from tests .utils .constants import GPT_OSS
18- from tests .utils .managed_process import ManagedProcess
22+ from tests .utils .managed_process import DynamoFrontendProcess , ManagedProcess
1923from tests .utils .payloads import check_models_api
24+ from tests .utils .port_utils import ServicePorts
2025
2126logger = logging .getLogger (__name__ )
2227
6267}
6368
6469
65- class DynamoFrontendProcess (ManagedProcess ):
66- """Process manager for Dynamo frontend"""
67-
68- def __init__ (self , request ):
69- command = ["python" , "-m" , "dynamo.frontend" , "--router-mode" , "round-robin" ]
70-
71- # Unset DYN_SYSTEM_PORT - frontend doesn't use system metrics server
72- env = os .environ .copy ()
73- env .pop ("DYN_SYSTEM_PORT" , None )
74-
75- log_dir = f"{ request .node .name } _frontend"
76-
77- # Clean up any existing log directory from previous runs
78- try :
79- shutil .rmtree (log_dir )
80- logger .info (f"Cleaned up existing log directory: { log_dir } " )
81- except FileNotFoundError :
82- # Directory doesn't exist, which is fine
83- pass
84-
85- super ().__init__ (
86- command = command ,
87- env = env ,
88- display_output = True ,
89- terminate_existing = True ,
90- log_dir = log_dir ,
91- )
92-
93-
9470class VllmWorkerProcess (ManagedProcess ):
9571 """Vllm Worker process for GPT-OSS model."""
9672
97- def __init__ (self , request , worker_id : str = "vllm-worker" ):
73+ def __init__ (
74+ self ,
75+ request ,
76+ * ,
77+ frontend_port : int ,
78+ system_port : int ,
79+ worker_id : str = "vllm-worker" ,
80+ ):
9881 self .worker_id = worker_id
82+ self .frontend_port = int (frontend_port )
83+ self .system_port = int (system_port )
9984
10085 command = [
10186 "python3" ,
@@ -114,7 +99,7 @@ def __init__(self, request, worker_id: str = "vllm-worker"):
11499 env = os .environ .copy ()
115100 env ["DYN_LOG" ] = "debug"
116101 env ["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS" ] = '["generate"]'
117- env ["DYN_SYSTEM_PORT" ] = "8083"
102+ env ["DYN_SYSTEM_PORT" ] = str ( self . system_port )
118103
119104 log_dir = f"{ request .node .name } _{ worker_id } "
120105
@@ -127,8 +112,8 @@ def __init__(self, request, worker_id: str = "vllm-worker"):
127112 command = command ,
128113 env = env ,
129114 health_check_urls = [
130- ("http://localhost:8000 /v1/models" , check_models_api ),
131- ("http://localhost:8083 /health" , self .is_ready ),
115+ (f "http://localhost:{ self . frontend_port } /v1/models" , check_models_api ),
116+ (f "http://localhost:{ self . system_port } /health" , self .is_ready ),
132117 ],
133118 timeout = 500 ,
134119 display_output = True ,
@@ -155,34 +140,49 @@ def is_ready(self, response) -> bool:
155140
156141def _send_chat_request (
157142 payload : Dict [str , Any ],
143+ * ,
144+ base_url : str ,
158145 timeout : int = 180 ,
159146) -> requests .Response :
160147 """Send a chat completion request with a specific payload."""
161148 headers = {"Content-Type" : "application/json" }
162149
163150 response = requests .post (
164- "http://localhost:8000 /v1/chat/completions" ,
151+ f" { base_url } /v1/chat/completions" ,
165152 headers = headers ,
166153 json = payload ,
167154 timeout = timeout ,
168155 )
169156 return response
170157
171158
172- @pytest .fixture (scope = "module" )
173- def runtime_services (request ):
174- """Module-scoped runtime services for this test file."""
175- with NatsServer (request ) as nats_process :
176- with EtcdServer (request ) as etcd_process :
177- yield nats_process , etcd_process
178-
179-
180- @pytest .fixture (scope = "module" )
181- def start_services (request , runtime_services ):
182- """Start frontend and worker processes once for this module's tests."""
183- with DynamoFrontendProcess (request ):
159+ @pytest .fixture (scope = "function" )
160+ def start_services (
161+ request , runtime_services_dynamic_ports , dynamo_dynamic_ports : ServicePorts
162+ ) -> Generator [None , None , None ]:
163+ """Start frontend and worker processes for this test.
164+
165+ `runtime_services_dynamic_ports` ensures NATS/etcd run on per-test ports and sets
166+ NATS_SERVER/ETCD_ENDPOINTS env vars for Dynamo to discover them.
167+ """
168+ _ = runtime_services_dynamic_ports
169+ frontend_port = dynamo_dynamic_ports .frontend_port
170+ system_port = dynamo_dynamic_ports .system_ports [0 ]
171+ with DynamoFrontendProcess (
172+ request ,
173+ frontend_port = frontend_port ,
174+ # Optional debugging (not enabled on main):
175+ # If the frontend hits a Rust panic, enabling backtraces makes failures diagnosable
176+ # from CI logs without needing to repro locally.
177+ # extra_env={"RUST_BACKTRACE": "1", "TOKIO_BACKTRACE": "1"},
178+ terminate_existing = False ,
179+ ):
184180 logger .info ("Frontend started for tests" )
185- with VllmWorkerProcess (request ):
181+ with VllmWorkerProcess (
182+ request ,
183+ frontend_port = frontend_port ,
184+ system_port = system_port ,
185+ ):
186186 logger .info ("Vllm Worker started for tests" )
187187 yield
188188
@@ -218,8 +218,11 @@ def _validate_chat_response(response: requests.Response) -> Dict[str, Any]:
218218
219219
220220@pytest .mark .usefixtures ("start_services" )
221+ @pytest .mark .timeout (240 ) # ~3x measured total (~70s/test), rounded up
221222@pytest .mark .post_merge
222- def test_reasoning_effort (request , runtime_services , predownload_models ) -> None :
223+ def test_reasoning_effort (
224+ request , dynamo_dynamic_ports : ServicePorts , predownload_models
225+ ) -> None :
223226 """High reasoning effort should yield more detailed reasoning than low effort."""
224227
225228 prompt = (
@@ -252,12 +255,13 @@ def test_reasoning_effort(request, runtime_services, predownload_models) -> None
252255 "chat_template_args" : {"reasoning_effort" : "low" },
253256 }
254257
255- high_response = _send_chat_request (high_payload )
258+ base_url = f"http://localhost:{ dynamo_dynamic_ports .frontend_port } "
259+ high_response = _send_chat_request (high_payload , base_url = base_url )
256260 high_reasoning_text , high_reasoning_tokens = _extract_reasoning_metrics (
257261 _validate_chat_response (high_response )
258262 )
259263
260- low_response = _send_chat_request (low_payload )
264+ low_response = _send_chat_request (low_payload , base_url = base_url )
261265 low_reasoning_text , low_reasoning_tokens = _extract_reasoning_metrics (
262266 _validate_chat_response (low_response )
263267 )
@@ -281,8 +285,11 @@ def test_reasoning_effort(request, runtime_services, predownload_models) -> None
281285
282286
283287@pytest .mark .usefixtures ("start_services" )
288+ @pytest .mark .timeout (180 ) # ~3x measured total (~50s/test), rounded up
284289@pytest .mark .post_merge
285- def test_tool_calling (request , runtime_services , predownload_models ) -> None :
290+ def test_tool_calling (
291+ request , dynamo_dynamic_ports : ServicePorts , predownload_models
292+ ) -> None :
286293 """Test tool calling functionality with weather and system health tools."""
287294
288295 payload = {
@@ -302,7 +309,8 @@ def test_tool_calling(request, runtime_services, predownload_models) -> None:
302309 "response_format" : {"type" : "text" },
303310 }
304311
305- response = _send_chat_request (payload )
312+ base_url = f"http://localhost:{ dynamo_dynamic_ports .frontend_port } "
313+ response = _send_chat_request (payload , base_url = base_url )
306314 response_data = _validate_chat_response (response )
307315
308316 logger .info ("Tool call response: %s" , response_data )
@@ -320,9 +328,10 @@ def test_tool_calling(request, runtime_services, predownload_models) -> None:
320328
321329
322330@pytest .mark .usefixtures ("start_services" )
331+ @pytest .mark .timeout (180 ) # ~3x measured total (~50s/test), rounded up
323332@pytest .mark .nightly
324333def test_tool_calling_second_round (
325- request , runtime_services , predownload_models
334+ request , dynamo_dynamic_ports : ServicePorts , predownload_models
326335) -> None :
327336 """Test tool calling with a follow-up message containing assistant's prior tool calls."""
328337
@@ -364,7 +373,8 @@ def test_tool_calling_second_round(
364373 "response_format" : {"type" : "text" },
365374 }
366375
367- response = _send_chat_request (payload )
376+ base_url = f"http://localhost:{ dynamo_dynamic_ports .frontend_port } "
377+ response = _send_chat_request (payload , base_url = base_url )
368378 response_data = _validate_chat_response (response )
369379
370380 logger .info ("Tool call second round response: %s" , response_data )
@@ -383,8 +393,11 @@ def test_tool_calling_second_round(
383393
384394
385395@pytest .mark .usefixtures ("start_services" )
396+ @pytest .mark .timeout (180 ) # ~3x measured total (~57s/test), rounded up
386397@pytest .mark .nightly
387- def test_reasoning (request , runtime_services , predownload_models ) -> None :
398+ def test_reasoning (
399+ request , dynamo_dynamic_ports : ServicePorts , predownload_models
400+ ) -> None :
388401 """Test reasoning functionality with a mathematical problem."""
389402
390403 payload = {
@@ -402,7 +415,8 @@ def test_reasoning(request, runtime_services, predownload_models) -> None:
402415 "max_tokens" : 2000 ,
403416 }
404417
405- response = _send_chat_request (payload )
418+ base_url = f"http://localhost:{ dynamo_dynamic_ports .frontend_port } "
419+ response = _send_chat_request (payload , base_url = base_url )
406420 response_data = _validate_chat_response (response )
407421
408422 logger .info ("Reasoning response: %s" , response_data )