#!/bin/bash

# measure_accuracy_and_performance.sh - Evaluate quantization recipe accuracy
#
# Usage: ./measure_accuracy_and_performance.sh [TAG_OR_RECIPE] [MODEL_ID] [LOG_FILE]
#
# Arguments:
#   TAG_OR_RECIPE  (optional) Tag group, single recipe name, or "all" (default: all)
#                  Valid tags: all, h100
#                  Valid recipes: None, float8_rowwise,
#                    int4_groupwise_weight_float8_rowwise_activation,
#                    int4_groupwise_hqq_weight_only,
#                    int8_rowwise_weight_only, int8_rowwise
#   MODEL_ID       (optional) HuggingFace model ID (default: meta-llama/Llama-3.1-8B)
#   LOG_FILE       (optional) Output log file path (default: benchmarks/data/measure_accuracy_and_performance_log.txt)
#
# Environment Variables:
#   SKIP_MODEL_CREATE  If set to 1, skip creating quantized models (assumes models already exist)
#   SKIP_LM_EVAL       If set to 1, skip running lm_eval (only creates quantized models)
#   SKIP_VLLM          If set to 1, skip running vllm performance benchmarking
#
# Examples:
#   ./measure_accuracy_and_performance.sh                                   # Run all recipes with default model
#   ./measure_accuracy_and_performance.sh h100                              # Run H100-compatible recipes only
#   ./measure_accuracy_and_performance.sh float8_rowwise                    # Run single recipe
#   ./measure_accuracy_and_performance.sh h100 meta-llama/Llama-3.2-8B      # Custom model with H100 recipes
#   ./measure_accuracy_and_performance.sh int8_rowwise meta-llama/Llama-3.2-8B my_log.txt  # All custom args
#   SKIP_MODEL_CREATE=1 ./measure_accuracy_and_performance.sh h100          # Skip model creation, only run eval
#   SKIP_LM_EVAL=1 ./measure_accuracy_and_performance.sh h100               # Skip lm_eval, only create models
#   SKIP_VLLM=1 ./measure_accuracy_and_performance.sh h100                  # Skip vllm benchmarking

# -e: abort on any command failure
# -u: treat expansion of an unset variable as an error
# -o pipefail: a pipeline fails if any stage fails. This is required here
#   because the heavy commands below are piped through `tee -a`, which always
#   succeeds; with plain `set -e`, a failed python/lm_eval/vllm step would be
#   masked and the script would silently keep going.
set -euo pipefail
| 33 | + |
# Every quantization recipe this script knows about. "None" is the
# unquantized baseline.
QUANT_RECIPES_ALL=(
  "None"                                              # no quantization (baseline)
  "float8_rowwise"
  "int4_groupwise_weight_float8_rowwise_activation"
  "int4_groupwise_hqq_weight_only"                    # note: only works on A100
  "int8_rowwise_weight_only"
  "int8_rowwise"
)

# Recipes known to work on H100 (excludes A100-only recipes).
# TODO(future PR): add `int4_groupwise_weight_float8_rowwise_activation` here,
# need to fix https://gist.github.com/vkuzo/6b128681b628744d445c553cdeac8a85
QUANT_RECIPES_H100=(
  "None"
  "float8_rowwise"
  "int8_rowwise_weight_only"
  "int8_rowwise"
)

# Recipes currently broken when served through vllm; each entry links to the
# failure it hits.
VLLM_BROKEN_RECIPES=(
  # TODO(future PR): fix this
  # current stack trace: https://gist.github.com/vkuzo/eed4894c5f3434e15d70b163e6077f60
  "float8_rowwise"
  # as of this PR, this recipe is still using AQT and CUDA graph capture time
  # in vLLM is really slow (>5 mins)
  # TODO(future PR): reenable this once we migrate this recipe off of AQT
  "int8_rowwise_weight_only"
  # TODO(future PR): fix this
  # error: https://gist.github.com/vkuzo/5bf389079442bb9851ef315cdcb797b4
  "int8_rowwise"
)
| 69 | + |
# TODO(future PR): add A100 and B200 tag groups

# Positional arguments, all optional:
#   $1 tag group / recipe name, $2 HuggingFace model ID, $3 log file path.
TAG_OR_RECIPE="${1:-all}"
MODEL_ID="${2:-meta-llama/Llama-3.1-8B}"
LOG_FILE="${3:-benchmarks/data/measure_accuracy_and_performance_log.txt}"

# Resolve the tag/recipe argument into the list of recipes to run.
case "$TAG_OR_RECIPE" in
  all)
    QUANT_RECIPES=("${QUANT_RECIPES_ALL[@]}")
    ;;
  h100)
    QUANT_RECIPES=("${QUANT_RECIPES_H100[@]}")
    ;;
  *)
    # Not a known tag: accept it only if it names a single known recipe.
    QUANT_RECIPES=()
    for candidate in "${QUANT_RECIPES_ALL[@]}"; do
      if [ "$candidate" = "$TAG_OR_RECIPE" ]; then
        QUANT_RECIPES=("$TAG_OR_RECIPE")
        break
      fi
    done

    # Nothing matched: print the valid choices and bail out.
    if [ "${#QUANT_RECIPES[@]}" -eq 0 ]; then
      echo "Error: Invalid tag or recipe name: '$TAG_OR_RECIPE'"
      echo ""
      echo "Valid tags:"
      echo "  - all"
      echo "  - h100"
      echo ""
      echo "Valid recipe names:"
      for candidate in "${QUANT_RECIPES_ALL[@]}"; do
        echo "  - $candidate"
      done
      exit 1
    fi
    ;;
esac
| 111 | + |
# Start each run with a fresh, empty log file. `mkdir -p` so a custom
# LOG_FILE path in a not-yet-existing directory doesn't fail on `touch`;
# quote all expansions so paths with spaces don't word-split (especially
# important on the `rm` calls).
mkdir -p "$(dirname "$LOG_FILE")"
rm -rf -- "$LOG_FILE"
touch "$LOG_FILE"

for quant_recipe in "${QUANT_RECIPES[@]}"; do

  echo "processing $quant_recipe"

  # Per-recipe output directory for the quantized model and eval results.
  OUTPUT_DIR="benchmarks/data/quantized_model/$MODEL_ID-$quant_recipe/"

  # create quantized model (unless skipped via environment variable)
  if [ "${SKIP_MODEL_CREATE:-0}" != "1" ]; then
    # Note: the -u flag is to prevent python from buffering stdout and stderr
    # and make the output log file be in chronological order
    rm -rf -- "$OUTPUT_DIR" && python -u benchmarks/quantization/create_quantized_model.py \
      --model_id "$MODEL_ID" \
      --output_dir "$OUTPUT_DIR" \
      --quant_recipe_name "$quant_recipe" 2>&1 | tee -a "$LOG_FILE"
  else
    echo "Skipping model creation (SKIP_MODEL_CREATE=1), using existing model at $OUTPUT_DIR"
  fi

  # run eval (unless skipped via environment variable)
  if [ "${SKIP_LM_EVAL:-0}" != "1" ]; then
    lm_eval --model hf --model_args "pretrained=$OUTPUT_DIR" \
      --tasks "wikitext,winogrande" --device "cuda:0" --batch_size auto \
      --output_path "$OUTPUT_DIR/lm_eval_outputs/" 2>&1 | tee -a "$LOG_FILE"
  else
    echo "Skipping lm_eval (SKIP_LM_EVAL=1)"
  fi

  # simple performance test (unless skipped via environment variable)
  if [ "${SKIP_VLLM:-0}" != "1" ]; then
    # Check if this recipe is known to be broken in vllm
    RECIPE_BROKEN_IN_VLLM=false
    for broken_recipe in "${VLLM_BROKEN_RECIPES[@]}"; do
      if [ "$quant_recipe" = "$broken_recipe" ]; then
        RECIPE_BROKEN_IN_VLLM=true
        break
      fi
    done

    if [ "$RECIPE_BROKEN_IN_VLLM" = true ]; then
      echo "Skipping vllm benchmarking for $quant_recipe (known to be broken in vllm)"
    else
      vllm bench latency --input_len 256 --output_len 256 \
        --model "$OUTPUT_DIR" --batch_size 1 2>&1 | tee -a "$LOG_FILE"
    fi
  else
    echo "Skipping vllm benchmarking (SKIP_VLLM=1)"
  fi

done
| 158 | + |
| 159 | +# TODO(future PR): script to parse the log file instead of manual copy-paste |