
Commit be272a6

simplify accuracy eval
Summary:
Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:
ghstack-source-id: 10d9979
ghstack-comment-id: 3632556050
Pull-Request: #3470
1 parent 2ae2994 commit be272a6

File tree

2 files changed: +36 -83 lines changed


benchmarks/quantization/eval_accuracy_for_readme.py renamed to benchmarks/quantization/create_quantized_model.py

Lines changed: 6 additions & 64 deletions
@@ -21,7 +21,7 @@
 
 
 def string_to_config(s):
-    if s is None:
+    if s == "None":
         return None
     elif s == "float8_rowwise":
         return Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
@@ -41,7 +41,7 @@ def string_to_config(s):
     raise AssertionError(f"unsupported {s}")
 
 
-def quantize_model_and_save(model_id, quant_config, output_dir="results"):
+def quantize_model_and_save(model_id, quant_config, output_dir):
     """Quantize the model and save it to the output directory."""
     print("Quantizing model with config: ", quant_config)
     if quant_config is None:
@@ -60,27 +60,6 @@ def quantize_model_and_save(model_id, quant_config, output_dir="results"):
     return quantized_model, tokenizer
 
 
-def run_lm_eval(model_dir, tasks_list=["hellaswag"], device="cuda:0", batch_size=8):
-    """Run the lm_eval command using subprocess."""
-    tasks_str = ",".join(tasks_list)
-    command = [
-        "lm_eval",
-        "--model",
-        "hf",
-        "--model_args",
-        f"pretrained={model_dir}",
-        "--tasks",
-        f"{tasks_str}",
-        "--device",
-        f"{device}",
-        "--batch_size",
-        f"{batch_size}",
-        "--output_path",
-        f"{model_dir}/lm_eval_outputs/",
-    ]
-    subprocess.run(command, check=True)
-
-
 def get_size_of_dir(model_output_dir):
     # get dir size from shell, to skip complexity of dealing with tensor
     # subclasses
@@ -94,43 +73,23 @@ def get_size_of_dir(model_output_dir):
 def run(
     model_id: str,
     quant_recipe_name: str | None,
-    tasks,
-    device,
-    batch_size,
     model_output_dir,
 ):
     print(f"\nRunning {model_id=} with {quant_recipe_name=}\n")
-    model_name = model_id.split("/")[-1]
-    model_output_dir = (
-        f"benchmarks/data/quantized_model/{model_name}-{quant_recipe_name}"
-    )
     quant_config = string_to_config(quant_recipe_name)
     quantized_model, tokenizer = quantize_model_and_save(
         model_id, quant_config=quant_config, output_dir=model_output_dir
     )
     print(quantized_model)
-
+    print(f"saved {model_id=}, {quant_recipe_name=} to {model_output_dir=}")
     model_size = get_size_of_dir(model_output_dir) / 1e9
     print(f"checkpoint size: {model_size} GB")
 
-    run_lm_eval(
-        model_output_dir, tasks_list=tasks, device=device, batch_size=batch_size
-    )
-    print("done\n")
-
 
 if __name__ == "__main__":
-    try:
-        import lm_eval  # noqa: F401
-    except:
-        print(
-            "lm_eval is required to run this script. Please install it using pip install lm-eval."
-        )
-        exit(0)
-
     # Set up argument parser
     parser = argparse.ArgumentParser(
-        description="Quantize a model and evaluate its throughput."
+        description="Load a model from HuggingFace, quantize it and save it to disk."
     )
     parser.add_argument(
         "--model_id",
@@ -141,26 +100,12 @@ def run(
     parser.add_argument(
         "--quant_recipe_name",
         type=str,
-        default=None,
-        help="The quantization recipe to use.",
-    )
-    parser.add_argument(
-        "--tasks",
-        nargs="+",
-        type=str,
-        default=["wikitext"],
-        help="List of lm-eluther tasks to evaluate usage: --tasks task1 task2",
-    )
-    parser.add_argument(
-        "--device", type=str, default="cuda:0", help="Device to run the model on."
-    )
-    parser.add_argument(
-        "--batch_size", type=str, default="auto", help="Batch size for lm_eval."
+        help="The quantization recipe to use, 'None' means no quantization",
     )
     parser.add_argument(
         "--output_dir",
         type=str,
-        default="quantized_models",
+        default="benchmarks/data/quantized_model/test",
         help="Output directory for quantized model.",
    )
     args = parser.parse_args()
@@ -169,8 +114,5 @@ def run(
     run(
         model_id=args.model_id,
         quant_recipe_name=args.quant_recipe_name,
-        tasks=args.tasks,
-        device=args.device,
-        batch_size=args.batch_size,
         model_output_dir=args.output_dir,
     )
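With the eval logic removed, the renamed script only creates and saves a quantized checkpoint. A hypothetical standalone invocation, using only flags and defaults visible in this diff (the model id and recipe name are illustrative choices, not prescribed by the commit):

    # illustrative invocation; model id and recipe are examples
    python -u benchmarks/quantization/create_quantized_model.py \
        --model_id meta-llama/Llama-3.1-8B \
        --quant_recipe_name float8_rowwise \
        --output_dir benchmarks/data/quantized_model/test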

benchmarks/quantization/eval_accuracy_for_readme.sh

Lines changed: 30 additions & 19 deletions
@@ -2,29 +2,40 @@
 
 set -e
 
-# Get model_id as positional argument (optional)
+# Get model_id as first positional argument (optional)
 MODEL_ID="${1:-meta-llama/Llama-3.1-8B}"
 
-# Get log file as first positional argument (optional)
+# Get log file as second positional argument (optional)
 LOG_FILE="${2:-benchmarks/data/eval_accuracy_for_readme_log.txt}"
+rm $LOG_FILE
+touch $LOG_FILE
 
-# Build the base command arguments
-BASE_ARGS="--tasks wikitext winogrande"
-if [[ -n "$MODEL_ID" ]]; then
-    BASE_ARGS="--model_id $MODEL_ID $BASE_ARGS"
-fi
-
-# baseline
-# note: the -u flag is to prevent python from buffering stdout and stderr
-# and make the output log file be in chronological order
-time python -u benchmarks/quantization/eval_accuracy_for_readme.py $BASE_ARGS 2>&1 | tee "$LOG_FILE"
-
-# quantized recipes
-# note:
-# * `int4_groupwise_hqq_weight_float8_rowwise_activation` doesn't work with dtype_map auto: https://gist.github.com/vkuzo/6b128681b628744d445c553cdeac8a85
-# * `int4_groupwise_hqq_weight_only` only works on A100
-for quant_recipe in float8_rowwise int4_groupwise_weight_float8_rowwise_activation int4_groupwise_hqq_weight_only int8_rowwise_weight_only int8_rowwise; do
-    time python -u benchmarks/quantization/eval_accuracy_for_readme.py $BASE_ARGS --quant_recipe_name $quant_recipe 2>&1 | tee -a "$LOG_FILE"
+QUANT_RECIPES=(
+    # no quantization (baseline)
+    "None"
+    "float8_rowwise"
+    # note: below doesn't work with dtype_map auto: https://gist.github.com/vkuzo/6b128681b628744d445c553cdeac8a85
+    "int4_groupwise_weight_float8_rowwise_activation"
+    # note: below only works on A100
+    "int4_groupwise_hqq_weight_only"
+    "int8_rowwise_weight_only"
+    "int8_rowwise"
+)
+
+for quant_recipe in "${QUANT_RECIPES[@]}"; do
+
+    echo "processing $quant_recipe"
+
+    OUTPUT_DIR="benchmarks/data/quantized_model/$MODEL_ID-$quant_recipe/"
+    rm -rf $OUTPUT_DIR
+
+    # create quantized model
+    # Note: the -u flag is to prevent python from buffering stdout and stderr
+    # and make the output log file be in chronological order
+    python -u benchmarks/quantization/create_quantized_model.py --model_id $MODEL_ID --output_dir $OUTPUT_DIR --quant_recipe_name $quant_recipe 2>&1 | tee -a "$LOG_FILE"
+
+    # run eval
+    lm_eval --model hf --model_args "pretrained=$OUTPUT_DIR" --tasks "wikitext,winogrande" --device "cuda:0" --batch_size auto --output_path "$OUTPUT_DIR/lm_eval_outputs/" 2>&1 | tee -a "$LOG_FILE"
 done
 
 # TODO(future PR): script to parse the log file instead of manual copy-paste
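As a usage sketch, the updated driver can be run end to end as below; both positional arguments are optional and fall back to the `${1:-...}`/`${2:-...}` defaults above, and the values shown here simply restate those defaults:

    # hypothetical run; omit either argument to use the script's defaults
    bash benchmarks/quantization/eval_accuracy_for_readme.sh \
        meta-llama/Llama-3.1-8B \
        benchmarks/data/eval_accuracy_for_readme_log.txt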
