88
99
1010MODEL_READY_SIGNATURE = "INFO: Uvicorn running on http://0.0.0.0:"
11+ SERVER_ADDRESS_SIGNATURE = "Server address: "
1112
1213
1314def run_bash_command (command : str ) -> str :
@@ -19,68 +20,69 @@ def run_bash_command(command: str) -> str:
1920 return stdout
2021
2122
def read_slurm_log(
    slurm_job_name: str,
    slurm_job_id: int,
    slurm_log_type: str,
    log_dir: str
) -> Union[list, str]:
    """
    Read a SLURM log file of a job and return its lines

    The file is expected at
    ``<log_dir>/<slurm_job_name>.<slurm_job_id>.<slurm_log_type>``.
    When ``log_dir`` is empty or None, the sub-directory of
    ``~/.vec-inf-logs`` whose name is contained in ``slurm_job_name`` is
    used instead.

    Returns the list of lines (newlines kept) on success, or the string
    "LOG_FILE_NOT_FOUND" when the log file does not exist.
    """
    if not log_dir:
        logs_root = os.path.join(os.path.expanduser("~"), ".vec-inf-logs")
        try:
            candidates = os.listdir(logs_root)
        except (FileNotFoundError, NotADirectoryError):
            # No log root yet: fall through so the caller gets the usual
            # "LOG_FILE_NOT_FOUND" status instead of an uncaught exception.
            candidates = []

        # Longest names first, so the most specific directory name wins
        # when one directory name is a substring of another.
        for candidate in sorted(candidates, key=len, reverse=True):
            if candidate in slurm_job_name:
                log_dir = os.path.join(logs_root, candidate)
                break

    # os.path.join rejects None; an empty string keeps the path relative.
    log_dir = log_dir or ""
    file_path = os.path.join(
        log_dir, f"{slurm_job_name}.{slurm_job_id}.{slurm_log_type}"
    )
    try:
        with open(file_path, 'r') as file:
            return file.readlines()
    except FileNotFoundError:
        print(f"Could not find file: {file_path}")
        return "LOG_FILE_NOT_FOUND"
48+
def is_server_running(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> Union[str, tuple]:
    """
    Check if a model is ready to serve requests

    Scans the job's ".err" SLURM log line by line and returns:
      - the status string from read_slurm_log (e.g. "LOG_FILE_NOT_FOUND")
        when the log could not be read,
      - ("FAILED", <line>) for the first line containing "error"
        (case-insensitive),
      - "RUNNING" once a line contains MODEL_READY_SIGNATURE,
      - "LAUNCHING" when neither was seen.
    """
    log_content = read_slurm_log(slurm_job_name, slurm_job_id, "err", log_dir)
    # read_slurm_log signals failure with a status string instead of lines.
    if isinstance(log_content, str):
        return log_content

    for line in log_content:
        # Checked before the ready signature: an error line earlier in the
        # log takes precedence over a later "ready" line.
        if "error" in line.lower():
            return ("FAILED", line.strip("\n"))
        if MODEL_READY_SIGNATURE in line:
            return "RUNNING"
    return "LAUNCHING"
6363
6464
def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> str:
    """
    Get the base URL of a model

    Looks through the job's ".out" SLURM log for the first line containing
    SERVER_ADDRESS_SIGNATURE and returns the text that follows it.

    Returns the status string from read_slurm_log (e.g.
    "LOG_FILE_NOT_FOUND") when the log could not be read, or
    "URL_NOT_FOUND" when no line carries the signature.
    """
    log_content = read_slurm_log(slurm_job_name, slurm_job_id, "out", log_dir)
    # read_slurm_log signals failure with a status string instead of lines.
    if isinstance(log_content, str):
        return log_content

    for line in log_content:
        if SERVER_ADDRESS_SIGNATURE in line:
            # maxsplit=1 keeps the whole remainder of the line even if the
            # signature text happens to occur again after the address.
            return line.split(SERVER_ADDRESS_SIGNATURE, 1)[1].strip("\n")
    return "URL_NOT_FOUND"
7777
7878
79- def model_health_check (slurm_job_name : str ) -> Union [str , tuple ]:
79+ def model_health_check (slurm_job_name : str , slurm_job_id : int , log_dir : str ) -> Union [str , tuple ]:
8080 """
8181 Check the health of a running model on the cluster
8282 """
83- base_url = get_base_url (slurm_job_name )
83+ base_url = get_base_url (slurm_job_name , slurm_job_id , log_dir )
84+ if not base_url .startswith ("http" ):
85+ return ("FAILED" , base_url )
8486 health_check_url = base_url .replace ("v1" , "health" )
8587
8688 try :
0 commit comments