Skip to content

Commit dcb7d1c

Browse files
committed
Retrieve URL for every model instance
1 parent 90a4a1c commit dcb7d1c

File tree

2 files changed

+39
-37
lines changed

2 files changed

+39
-37
lines changed

vec_inf/cli/_cli.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,10 +188,10 @@ def status(slurm_job_id: int, log_dir: str=None, json_mode: bool=False) -> None:
188188
status = server_status[0]
189189
slurm_job_failed_reason = server_status[1]
190190
elif server_status == "RUNNING":
191-
status = model_health_check(slurm_job_name)
191+
status = model_health_check(slurm_job_name, slurm_job_id, log_dir)
192192
if status == "READY":
193193
# Only set base_url if model is ready to serve requests
194-
base_url = get_base_url(slurm_job_name)
194+
base_url = get_base_url(slurm_job_name, slurm_job_id, log_dir)
195195
else:
196196
# If model is not ready, then status must be "FAILED"
197197
status = status[0]

vec_inf/cli/_utils.py

Lines changed: 37 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99

1010
MODEL_READY_SIGNATURE = "INFO: Uvicorn running on http://0.0.0.0:"
11+
SERVER_ADDRESS_SIGNATURE = "Server address: "
1112

1213

1314
def run_bash_command(command: str) -> str:
@@ -19,68 +20,69 @@ def run_bash_command(command: str) -> str:
1920
return stdout
2021

2122

22-
def get_model_dir(slurm_job_name: str, is_log_dir: bool=False) -> str:
23+
def read_slurm_log(
24+
slurm_job_name: str,
25+
slurm_job_id: int,
26+
slurm_log_type: str,
27+
log_dir: str
28+
) -> Union[list, str]:
2329
"""
2430
Get the directory of a model
2531
"""
26-
if is_log_dir:
27-
models_dir = os.path.join(os.path.expanduser("~"), ".vec-inf-logs")
28-
else:
29-
models_dir = os.path.join(
30-
os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
31-
"models"
32-
)
33-
model_dir = ""
34-
for dir in sorted(os.listdir(models_dir), key=len, reverse=True):
35-
if dir in slurm_job_name:
36-
model_dir = os.path.join(models_dir, dir)
37-
break
38-
39-
return model_dir
40-
41-
42-
def is_server_running(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> Union[str, tuple]:
43-
"""
44-
Check if a model is ready to serve requests
45-
"""
4632
if not log_dir:
47-
log_dir = get_model_dir(slurm_job_name, is_log_dir=True)
33+
models_dir = os.path.join(os.path.expanduser("~"), ".vec-inf-logs")
34+
35+
for dir in sorted(os.listdir(models_dir), key=len, reverse=True):
36+
if dir in slurm_job_name:
37+
log_dir = os.path.join(models_dir, dir)
38+
break
4839

4940
try:
50-
file_path = os.path.join(log_dir, f"{slurm_job_name}.{slurm_job_id}.err")
41+
file_path = os.path.join(log_dir, f"{slurm_job_name}.{slurm_job_id}.{slurm_log_type}")
5142
with open(file_path, 'r') as file:
5243
lines = file.readlines()
5344
except FileNotFoundError:
5445
print(f"Could not find file: {file_path}")
5546
return "LOG_FILE_NOT_FOUND"
47+
return lines
48+
49+
def is_server_running(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> Union[str, tuple]:
50+
"""
51+
Check if a model is ready to serve requests
52+
"""
53+
log_content = read_slurm_log(slurm_job_name, slurm_job_id, "err", log_dir)
54+
if type(log_content) is str:
55+
return log_content
5656

57-
for line in lines:
57+
for line in log_content:
5858
if "error" in line.lower():
5959
return ("FAILED", line.strip("\n"))
6060
if MODEL_READY_SIGNATURE in line:
6161
return "RUNNING"
6262
return "LAUNCHING"
6363

6464

65-
def get_base_url(slurm_job_name: str) -> str:
65+
def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> str:
6666
"""
6767
Get the base URL of a model
6868
"""
69-
model_dir = get_model_dir(slurm_job_name)
70-
try:
71-
file_path = os.path.join(model_dir, f".{slurm_job_name}_url")
72-
with open(file_path, 'r') as file:
73-
lines = file.readlines()
74-
except FileNotFoundError:
75-
return "UNAVAILABLE"
76-
return lines[0].strip().strip("\n")
69+
log_content = read_slurm_log(slurm_job_name, slurm_job_id, "out", log_dir)
70+
if type(log_content) is str:
71+
return log_content
72+
73+
for line in log_content:
74+
if SERVER_ADDRESS_SIGNATURE in line:
75+
return line.split(SERVER_ADDRESS_SIGNATURE)[1].strip("\n")
76+
return "URL_NOT_FOUND"
7777

7878

79-
def model_health_check(slurm_job_name: str) -> Union[str, tuple]:
79+
def model_health_check(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> Union[str, tuple]:
8080
"""
8181
Check the health of a running model on the cluster
8282
"""
83-
base_url = get_base_url(slurm_job_name)
83+
base_url = get_base_url(slurm_job_name, slurm_job_id, log_dir)
84+
if not base_url.startswith("http"):
85+
return ("FAILED", base_url)
8486
health_check_url = base_url.replace("v1", "health")
8587

8688
try:

0 commit comments

Comments
 (0)