Skip to content

Commit b81960d

Browse files
committed
make eval script also handle performance measurement
Summary: 1. refactors the eval script to also handle performance measurement in vllm 2. adds a simple `vllm bench latency` script to bench in vllm. The script is broken on every single recipe; we'll have to fix and enable things in future PRs, and will update the performance tables afterwards. Also, add convenience flags to skip model creation, lm_eval, vllm as needed to enable running just a single model + single step. Test Plan: ``` SKIP_MODEL_CREATE=1 SKIP_LM_EVAL=1 SKIP_VLLM=0 with-proxy ./benchmarks/quantization/measure_accuracy_and_performance.sh h100 ``` Reviewers: Subscribers: Tasks: Tags: ghstack-source-id: 15f7481 ghstack-comment-id: 3634216524 Pull-Request: #3473
1 parent 6b9b310 commit b81960d

File tree

2 files changed

+159
-108
lines changed

2 files changed

+159
-108
lines changed

benchmarks/quantization/eval_accuracy_for_readme.sh

Lines changed: 0 additions & 108 deletions
This file was deleted.
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
#!/bin/bash

# measure_accuracy_and_performance.sh - Evaluate quantization recipe accuracy
# and measure inference performance in vllm
#
# Usage: ./measure_accuracy_and_performance.sh [TAG_OR_RECIPE] [MODEL_ID] [LOG_FILE]
#
# Arguments:
#   TAG_OR_RECIPE (optional) Tag group, single recipe name, or "all" (default: all)
#                 Valid tags: all, h100
#                 Valid recipes: None, float8_rowwise,
#                   int4_groupwise_weight_float8_rowwise_activation,
#                   int4_groupwise_hqq_weight_only,
#                   int8_rowwise_weight_only, int8_rowwise
#   MODEL_ID (optional) HuggingFace model ID (default: meta-llama/Llama-3.1-8B)
#   LOG_FILE (optional) Output log file path (default: benchmarks/data/measure_accuracy_and_performance_log.txt)
#
# Environment Variables:
#   SKIP_MODEL_CREATE   If set to 1, skip creating quantized models (assumes models already exist)
#   SKIP_LM_EVAL        If set to 1, skip running lm_eval (only creates quantized models)
#   SKIP_VLLM           If set to 1, skip running vllm performance benchmarking
#
# Examples:
#   ./measure_accuracy_and_performance.sh                                               # Run all recipes with default model
#   ./measure_accuracy_and_performance.sh h100                                          # Run H100-compatible recipes only
#   ./measure_accuracy_and_performance.sh float8_rowwise                                # Run single recipe
#   ./measure_accuracy_and_performance.sh h100 meta-llama/Llama-3.2-8B                  # Custom model with H100 recipes
#   ./measure_accuracy_and_performance.sh int8_rowwise meta-llama/Llama-3.2-8B my_log.txt  # All custom args
#   SKIP_MODEL_CREATE=1 ./measure_accuracy_and_performance.sh h100                      # Skip model creation, only run eval
#   SKIP_LM_EVAL=1 ./measure_accuracy_and_performance.sh h100                           # Skip lm_eval, only create models
#   SKIP_VLLM=1 ./measure_accuracy_and_performance.sh h100                              # Skip vllm benchmarking

# Abort on errors and on unset variables. `pipefail` matters here because all
# of the heavy commands below run as `cmd 2>&1 | tee -a "$LOG_FILE"`: with
# plain `set -e` the pipeline's status is tee's, so a failing cmd would be
# silently swallowed.
set -euo pipefail
# Every quantization recipe this script knows about, in the order they are
# processed. "None" is kept first so the unquantized baseline always runs
# before the quantized variants.
QUANT_RECIPES_ALL=(
  # no quantization (baseline)
  "None"
  "float8_rowwise"
  "int4_groupwise_weight_float8_rowwise_activation"
  # note: below only works on A100
  "int4_groupwise_hqq_weight_only"
  "int8_rowwise_weight_only"
  "int8_rowwise"
)

# Subset of recipes that run on H100 (the A100-only recipe is excluded).
# TODO(future PR): add `int4_groupwise_weight_float8_rowwise_activation` here,
# need to fix https://gist.github.com/vkuzo/6b128681b628744d445c553cdeac8a85
QUANT_RECIPES_H100=(
  "None"
  "float8_rowwise"
  "int8_rowwise_weight_only"
  "int8_rowwise"
)

# Recipes whose vllm benchmarking step is currently known to fail; the main
# loop skips `vllm bench latency` for these.
VLLM_BROKEN_RECIPES=(
  # TODO(future PR): fix this
  # current stack trace: https://gist.github.com/vkuzo/eed4894c5f3434e15d70b163e6077f60
  "float8_rowwise"
  # as of this PR, this recipe is still using AQT and CUDA graph capture time
  # in vLLM is really slow (>5 mins)
  # TODO(future PR): reenable this once we migrate this recipe off of AQT
  "int8_rowwise_weight_only"
  # TODO(future PR): fix this
  # error: https://gist.github.com/vkuzo/5bf389079442bb9851ef315cdcb797b4
  "int8_rowwise"
)
# TODO(future PR): add A100 and B200 tag groups

# Positional arguments; all three are optional:
#   $1 - tag group ("all", "h100") or a single recipe name
#   $2 - HuggingFace model ID to quantize and evaluate
#   $3 - path of the combined log file for all steps
TAG_OR_RECIPE=${1:-all}
MODEL_ID=${2:-meta-llama/Llama-3.1-8B}
LOG_FILE=${3:-benchmarks/data/measure_accuracy_and_performance_log.txt}
# Resolve TAG_OR_RECIPE into the QUANT_RECIPES array to iterate over. It may
# be a tag group ("all", "h100") or a single recipe from QUANT_RECIPES_ALL;
# anything else prints the valid choices and aborts.
case "$TAG_OR_RECIPE" in
  all)
    QUANT_RECIPES=("${QUANT_RECIPES_ALL[@]}")
    ;;
  h100)
    QUANT_RECIPES=("${QUANT_RECIPES_H100[@]}")
    ;;
  *)
    # Accept a single recipe name, but only if it is one we know about.
    VALID_RECIPE=false
    for candidate in "${QUANT_RECIPES_ALL[@]}"; do
      if [ "$candidate" = "$TAG_OR_RECIPE" ]; then
        VALID_RECIPE=true
        QUANT_RECIPES=("$TAG_OR_RECIPE")
        break
      fi
    done

    if [ "$VALID_RECIPE" = false ]; then
      echo "Error: Invalid tag or recipe name: '$TAG_OR_RECIPE'"
      echo ""
      echo "Valid tags:"
      echo " - all"
      echo " - h100"
      echo ""
      echo "Valid recipe names:"
      for candidate in "${QUANT_RECIPES_ALL[@]}"; do
        echo " - $candidate"
      done
      exit 1
    fi
    ;;
esac
# Start every run with a fresh, empty log file. Quote the expansion so a
# LOG_FILE containing spaces or glob characters cannot word-split, and pass
# `--` so a path that begins with `-` is not parsed as an option.
rm -rf -- "$LOG_FILE"
touch -- "$LOG_FILE"
# Main loop: for each selected recipe, optionally (1) create the quantized
# model, (2) run lm_eval accuracy checks, and (3) run a vllm latency
# benchmark, each gated by its SKIP_* environment variable. All expansions
# are quoted (ShellCheck SC2086) so model IDs / paths containing spaces or
# glob characters cannot word-split or expand.
for quant_recipe in "${QUANT_RECIPES[@]}"; do

  echo "processing $quant_recipe"

  OUTPUT_DIR="benchmarks/data/quantized_model/$MODEL_ID-$quant_recipe/"

  # create quantized model (unless skipped via environment variable)
  if [ "${SKIP_MODEL_CREATE:-0}" != "1" ]; then
    # Note: the -u flag is to prevent python from buffering stdout and stderr
    # and make the output log file be in chronological order
    rm -rf -- "$OUTPUT_DIR" && python -u benchmarks/quantization/create_quantized_model.py \
      --model_id "$MODEL_ID" \
      --output_dir "$OUTPUT_DIR" \
      --quant_recipe_name "$quant_recipe" 2>&1 | tee -a "$LOG_FILE"
  else
    echo "Skipping model creation (SKIP_MODEL_CREATE=1), using existing model at $OUTPUT_DIR"
  fi

  # run eval (unless skipped via environment variable)
  if [ "${SKIP_LM_EVAL:-0}" != "1" ]; then
    lm_eval --model hf \
      --model_args "pretrained=$OUTPUT_DIR" \
      --tasks "wikitext,winogrande" \
      --device "cuda:0" \
      --batch_size auto \
      --output_path "$OUTPUT_DIR/lm_eval_outputs/" 2>&1 | tee -a "$LOG_FILE"
  else
    echo "Skipping lm_eval (SKIP_LM_EVAL=1)"
  fi

  # simple performance test (unless skipped via environment variable)
  if [ "${SKIP_VLLM:-0}" != "1" ]; then
    # Check if this recipe is known to be broken in vllm
    RECIPE_BROKEN_IN_VLLM=false
    for broken_recipe in "${VLLM_BROKEN_RECIPES[@]}"; do
      if [ "$quant_recipe" = "$broken_recipe" ]; then
        RECIPE_BROKEN_IN_VLLM=true
        break
      fi
    done

    if [ "$RECIPE_BROKEN_IN_VLLM" = true ]; then
      echo "Skipping vllm benchmarking for $quant_recipe (known to be broken in vllm)"
    else
      vllm bench latency \
        --input_len 256 \
        --output_len 256 \
        --model "$OUTPUT_DIR" \
        --batch_size 1 2>&1 | tee -a "$LOG_FILE"
    fi
  else
    echo "Skipping vllm benchmarking (SKIP_VLLM=1)"
  fi

done

# TODO(future PR): script to parse the log file instead of manual copy-paste

0 commit comments

Comments
 (0)