diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
index 57dff91b799..267ccda454b 100644
--- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
+++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
@@ -7,14 +7,14 @@ This example quantizes and validates the accuracy of Llama4.
 ## 1. Environment
 
 ```shell
-docker run -d --gpus all -v ... --shm-size=100g --name llama4 -it nvcr.io/nvidia/pytorch:25.05-py3 /bin/bash
+docker run -d --gpus all -v ... --shm-size=100g --name llama4 -it nvcr.io/nvidia/pytorch:25.08-py3 /bin/bash
 docker exec -it llama4 bash
 git clone https://github.com/intel/neural-compressor.git
 cd neural-compressor/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4
-# Use `INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@v3.6rc` for the latest updates before neural-compressor v3.6 release
-pip install neural-compressor-pt==3.6
-# Use `pip install git+https://github.com/intel/auto-round.git@v0.8.0rc2` for the latest updates before auto-round v0.8.0 release
-pip install auto-round==0.8.0
+# Use `INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@master` for the latest updates before neural-compressor v3.7 release
+pip install neural-compressor-pt==3.7
+# Use `pip install git+https://github.com/intel/auto-round.git@main` for the latest updates before auto-round v0.9.3 release
+pip install auto-round==0.9.3
 bash setup.sh
 ```
 
@@ -36,5 +36,5 @@ CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=L
 ## 2. Benchmark
 
 ```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --topology=llama4_mxfp4 --input_model=saved_results/Llama-4-Scout-17B-16E-Instruct-w4g32/ --tasks=piqa --batch_size=1 --tp_size=4
+CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --topology=llama4_mxfp4 --input_model=saved_results --tasks=piqa --batch_size=1 --tp_size=4
 ```
diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/main.py b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/main.py
index b3171ddee2e..a848a8bf4a7 100644
--- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/main.py
+++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/main.py
@@ -85,7 +85,7 @@ def tune(args):
         iters=args.iters,
         scheme=args.scheme,
         layer_config=layer_config,
-        export_format="llm_compressor",
+        export_format=args.export_format,
         output_dir=args.output_dir,
         processor=processor,
     )
diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
index 0019f164bd7..9388b7f3146 100644
--- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
+++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
@@ -40,6 +40,12 @@ function run_benchmark {
     batch_size=${batch_size:=1}
 
     if [ "${topology}" = "llama4_mxfp4" ]; then
+        export VLLM_AR_MXFP4_MODULAR_MOE=1
+        export VLLM_MXFP4_PRE_UNPACK_TO_FP8=1
+        export VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0
+        export VLLM_ENABLE_STATIC_MOE=0
+        export VLLM_USE_DEEP_GEMM=0
+        export VLLM_ENABLE_AR_EXT=1
         extra_model_args="max_model_len=8192,max_num_seqs=1024,max_gen_toks=2048,kv_cache_dtype=auto,gpu_memory_utilization=0.7"
         extra_cmd="--gen_kwargs max_gen_toks=2048"
     fi
diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh
index fa41efd10b9..25c2b28b5ac 100644
--- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh
+++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh
@@ -44,7 +44,7 @@ function run_tuning {
    iters=${iters:=0}

    if [ "${topology}" = "llama4_mxfp4" ]; then
-        extra_cmd="--fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert --scheme MXFP4"
+        extra_cmd="--fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert --scheme MXFP4 --export_format auto_round"
    fi

    python3 main.py \
diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh
index 629d056eba3..6f5e90e386f 100644
--- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh
+++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh
@@ -2,7 +2,6 @@ pip install -r requirements.txt
 pip install setuptools --upgrade
 pip install packaging --upgrade
 pip install -U "huggingface_hub[cli]"
-git clone -b mxfp4 https://github.com/mengniwang95/vllm-fork.git
-cd vllm-fork
-VLLM_USE_PRECOMPILED=1 pip install . -vvv --no-build-isolation
-cd ..
+git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
+VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv
+pip uninstall flash_attn -y
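
Note on the `--export_format` change: the main.py hunk swaps the hard-coded `export_format="llm_compressor"` for `args.export_format`, and run_quant.sh now passes `--export_format auto_round`. This presumes main.py's argument parser exposes a matching flag; the parser itself is outside the hunks shown, so the following is only a minimal sketch of the assumed wiring, with the old hard-coded value kept as the default.

```python
# Sketch only: illustrates the argparse flag that `args.export_format` assumes;
# the actual parser definition in main.py is not part of the diff above.
import argparse

parser = argparse.ArgumentParser(description="Llama4 AutoRound quantization example")
parser.add_argument(
    "--export_format",
    type=str,
    default="llm_compressor",  # previous hard-coded value, kept as the fallback (assumption)
    help="Serialization format handed to AutoRound, e.g. llm_compressor or auto_round",
)
args = parser.parse_args()

# run_quant.sh now invokes: python3 main.py ... --export_format auto_round
print(f"Exporting quantized model with format: {args.export_format}")
```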