From 8f8967f7ee6866caed052a91af4ef0bfad130c31 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Mon, 8 Dec 2025 22:08:32 +0800 Subject: [PATCH 1/8] Update setup.sh --- .../quantization/auto_round/llama4/setup.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh index 629d056eba3..72fa5a6c76e 100644 --- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh +++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh @@ -2,7 +2,5 @@ pip install -r requirements.txt pip install setuptools --upgrade pip install packaging --upgrade pip install -U "huggingface_hub[cli]" -git clone -b mxfp4 https://github.com/mengniwang95/vllm-fork.git -cd vllm-fork -VLLM_USE_PRECOMPILED=1 pip install . -vvv --no-build-isolation -cd .. +git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork +VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv From 09c07f4f32bec223268cd90ee798def1ea80be14 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Tue, 9 Dec 2025 15:11:31 +0800 Subject: [PATCH 2/8] Update README.md --- .../quantization/auto_round/llama4/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md index 57dff91b799..35e5c7958ca 100644 --- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md +++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md @@ -36,5 +36,5 @@ CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=L ## 2. Benchmark ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --topology=llama4_mxfp4 --input_model=saved_results/Llama-4-Scout-17B-16E-Instruct-w4g32/ --tasks=piqa --batch_size=1 --tp_size=4 +CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --topology=llama4_mxfp4 --input_model=saved_results --tasks=piqa --batch_size=1 --tp_size=4 ``` From bdd59e96927ba919f4679c2634e00fc0144f0e5e Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Tue, 9 Dec 2025 16:30:49 +0800 Subject: [PATCH 3/8] Update main.py --- .../multimodal-modeling/quantization/auto_round/llama4/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/main.py b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/main.py index b3171ddee2e..a848a8bf4a7 100644 --- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/main.py +++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/main.py @@ -85,7 +85,7 @@ def tune(args): iters=args.iters, scheme=args.scheme, layer_config=layer_config, - export_format="llm_compressor", + export_format=args.export_format, output_dir=args.output_dir, processor=processor, ) From f4be90fa262f9b5c6b082259a98f510b7c5ae0a1 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Tue, 9 Dec 2025 16:32:37 +0800 Subject: [PATCH 4/8] Update run_quant.sh --- .../quantization/auto_round/llama4/run_quant.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh index fa41efd10b9..25c2b28b5ac 100644 --- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh +++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh @@ -44,7 +44,7 @@ function run_tuning { iters=${iters:=0} if [ "${topology}" = "llama4_mxfp4" ]; then - extra_cmd="--fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert --scheme MXFP4" + extra_cmd="--fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert --scheme MXFP4 --export_format auto_round" fi python3 main.py \ From 6719b4fa53857bf37d1b78dd8d83c3dd212cc522 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Tue, 9 Dec 2025 16:35:41 +0800 Subject: [PATCH 5/8] Update run_benchmark.sh --- .../quantization/auto_round/llama4/run_benchmark.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh index 0019f164bd7..9388b7f3146 100644 --- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh +++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh @@ -40,6 +40,12 @@ function run_benchmark { batch_size=${batch_size:=1} if [ "${topology}" = "llama4_mxfp4" ]; then + export VLLM_AR_MXFP4_MODULAR_MOE=1 + export VLLM_MXFP4_PRE_UNPACK_TO_FP8=1 + export VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0 + export VLLM_ENABLE_STATIC_MOE=0 + export VLLM_USE_DEEP_GEMM=0 + export VLLM_ENABLE_AR_EXT=1 extra_model_args="max_model_len=8192,max_num_seqs=1024,max_gen_toks=2048,kv_cache_dtype=auto,gpu_memory_utilization=0.7" extra_cmd="--gen_kwargs max_gen_toks=2048" fi From d7aaf912673d49fac172aee1408026de99317cde Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Tue, 9 Dec 2025 16:37:28 +0800 Subject: [PATCH 6/8] Update README.md --- .../quantization/auto_round/llama4/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md index 35e5c7958ca..a948cbc6894 100644 --- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md +++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md @@ -7,14 +7,14 @@ This example quantizes and validates the accuracy of Llama4. ## 1. Environment ```shell -docker run -d --gpus all -v ... --shm-size=100g --name llama4 -it nvcr.io/nvidia/pytorch:25.05-py3 /bin/bash +docker run -d --gpus all -v ... --shm-size=100g --name llama4 -it nvcr.io/nvidia/pytorch:25.08-py3 /bin/bash docker exec -it llama4 bash git clone https://github.com/intel/neural-compressor.git cd neural-compressor/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4 -# Use `INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@v3.6rc` for the latest updates before neural-compressor v3.6 release -pip install neural-compressor-pt==3.6 -# Use `pip install git+https://github.com/intel/auto-round.git@v0.8.0rc2` for the latest updates before auto-round v0.8.0 release -pip install auto-round==0.8.0 +# Use `INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@v3.6rc` for the latest updates before neural-compressor v3.7 release +pip install neural-compressor-pt==3.7 +# Use `pip install git+https://github.com/intel/auto-round.git@v0.8.0rc2` for the latest updates before auto-round v0.9.3 release +pip install auto-round==0.9.3 bash setup.sh ``` From 346fc618ad70b7f31fcbde05e551de0483ab4198 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Tue, 9 Dec 2025 16:37:53 +0800 Subject: [PATCH 7/8] Update README.md --- .../quantization/auto_round/llama4/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md index a948cbc6894..267ccda454b 100644 --- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md +++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md @@ -11,9 +11,9 @@ docker run -d --gpus all -v ... --shm-size=100g --name llama4 -it nvcr.io/nvidia docker exec -it llama4 bash git clone https://github.com/intel/neural-compressor.git cd neural-compressor/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4 -# Use `INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@v3.6rc` for the latest updates before neural-compressor v3.7 release +# Use `INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@master` for the latest updates before neural-compressor v3.7 release pip install neural-compressor-pt==3.7 -# Use `pip install git+https://github.com/intel/auto-round.git@v0.8.0rc2` for the latest updates before auto-round v0.9.3 release +# Use `pip install git+https://github.com/intel/auto-round.git@main` for the latest updates before auto-round v0.9.3 release pip install auto-round==0.9.3 bash setup.sh ``` From 738b977522927bdbbb439a80974e72c5a49d0d8a Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Fri, 12 Dec 2025 13:35:23 +0800 Subject: [PATCH 8/8] Update setup.sh --- .../multimodal-modeling/quantization/auto_round/llama4/setup.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh index 72fa5a6c76e..6f5e90e386f 100644 --- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh +++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh @@ -4,3 +4,4 @@ pip install packaging --upgrade pip install -U "huggingface_hub[cli]" git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv +pip uninstall flash_attn -y