From 8f8967f7ee6866caed052a91af4ef0bfad130c31 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni" <mengni.wang@intel.com>
Date: Mon, 8 Dec 2025 22:08:32 +0800
Subject: [PATCH 1/8] llama4: install vllm-fork from fused-moe-ar branch in setup.sh

---
 .../quantization/auto_round/llama4/setup.sh                 | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh
index 629d056eba3..72fa5a6c76e 100644
--- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh
+++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh
@@ -2,7 +2,5 @@ pip install -r requirements.txt
 pip install setuptools --upgrade
 pip install packaging --upgrade
 pip install -U "huggingface_hub[cli]"
-git clone -b mxfp4 https://github.com/mengniwang95/vllm-fork.git
-cd vllm-fork
-VLLM_USE_PRECOMPILED=1 pip install . -vvv --no-build-isolation
-cd ..
+git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
+VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv

From 09c07f4f32bec223268cd90ee798def1ea80be14 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni" <mengni.wang@intel.com>
Date: Tue, 9 Dec 2025 15:11:31 +0800
Subject: [PATCH 2/8] llama4: use saved_results as benchmark --input_model in README

---
 .../quantization/auto_round/llama4/README.md                    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
index 57dff91b799..35e5c7958ca 100644
--- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
+++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
@@ -36,5 +36,5 @@ CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=L
 ## 2. Benchmark
 
 ```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --topology=llama4_mxfp4 --input_model=saved_results/Llama-4-Scout-17B-16E-Instruct-w4g32/ --tasks=piqa --batch_size=1 --tp_size=4
+CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --topology=llama4_mxfp4 --input_model=saved_results --tasks=piqa --batch_size=1 --tp_size=4
 ```

From bdd59e96927ba919f4679c2634e00fc0144f0e5e Mon Sep 17 00:00:00 2001
From: "Wang, Mengni" <mengni.wang@intel.com>
Date: Tue, 9 Dec 2025 16:30:49 +0800
Subject: [PATCH 3/8] llama4: read export_format from args in main.py

---
 .../multimodal-modeling/quantization/auto_round/llama4/main.py  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/main.py b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/main.py
index b3171ddee2e..a848a8bf4a7 100644
--- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/main.py
+++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/main.py
@@ -85,7 +85,7 @@ def tune(args):
         iters=args.iters,
         scheme=args.scheme,
         layer_config=layer_config,
-        export_format="llm_compressor",
+        export_format=args.export_format,
         output_dir=args.output_dir,
         processor=processor,
     )

From f4be90fa262f9b5c6b082259a98f510b7c5ae0a1 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni" <mengni.wang@intel.com>
Date: Tue, 9 Dec 2025 16:32:37 +0800
Subject: [PATCH 4/8] llama4: pass --export_format auto_round in run_quant.sh

---
 .../quantization/auto_round/llama4/run_quant.sh                 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh
index fa41efd10b9..25c2b28b5ac 100644
--- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh
+++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh
@@ -44,7 +44,7 @@ function run_tuning {
     iters=${iters:=0}
 
     if [ "${topology}" = "llama4_mxfp4" ]; then
-        extra_cmd="--fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert --scheme MXFP4"
+        extra_cmd="--fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert --scheme MXFP4 --export_format auto_round"
     fi
 
     python3 main.py \

From 6719b4fa53857bf37d1b78dd8d83c3dd212cc522 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni" <mengni.wang@intel.com>
Date: Tue, 9 Dec 2025 16:35:41 +0800
Subject: [PATCH 5/8] llama4: export VLLM_* env vars for llama4_mxfp4 in run_benchmark.sh

---
 .../quantization/auto_round/llama4/run_benchmark.sh         | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
index 0019f164bd7..9388b7f3146 100644
--- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
+++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh
@@ -40,6 +40,12 @@ function run_benchmark {
     batch_size=${batch_size:=1}
 
     if [ "${topology}" = "llama4_mxfp4" ]; then
+        export VLLM_AR_MXFP4_MODULAR_MOE=1
+        export VLLM_MXFP4_PRE_UNPACK_TO_FP8=1
+        export VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0
+        export VLLM_ENABLE_STATIC_MOE=0
+        export VLLM_USE_DEEP_GEMM=0
+        export VLLM_ENABLE_AR_EXT=1
         extra_model_args="max_model_len=8192,max_num_seqs=1024,max_gen_toks=2048,kv_cache_dtype=auto,gpu_memory_utilization=0.7"
         extra_cmd="--gen_kwargs max_gen_toks=2048"
     fi

From d7aaf912673d49fac172aee1408026de99317cde Mon Sep 17 00:00:00 2001
From: "Wang, Mengni" <mengni.wang@intel.com>
Date: Tue, 9 Dec 2025 16:37:28 +0800
Subject: [PATCH 6/8] llama4: bump README to pytorch 25.08, neural-compressor-pt 3.7, auto-round 0.9.3

---
 .../quantization/auto_round/llama4/README.md           | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
index 35e5c7958ca..a948cbc6894 100644
--- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
+++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
@@ -7,14 +7,14 @@ This example quantizes and validates the accuracy of Llama4.
 ## 1. Environment
 
 ```shell
-docker run -d --gpus all -v ... --shm-size=100g --name llama4 -it nvcr.io/nvidia/pytorch:25.05-py3 /bin/bash
+docker run -d --gpus all -v ... --shm-size=100g --name llama4 -it nvcr.io/nvidia/pytorch:25.08-py3 /bin/bash
 docker exec -it llama4 bash
 git clone https://github.com/intel/neural-compressor.git
 cd neural-compressor/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4
-# Use `INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@v3.6rc` for the latest updates before neural-compressor v3.6 release
-pip install neural-compressor-pt==3.6
-# Use `pip install git+https://github.com/intel/auto-round.git@v0.8.0rc2` for the latest updates before auto-round v0.8.0 release
-pip install auto-round==0.8.0
+# Use `INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@v3.6rc` for the latest updates before neural-compressor v3.7 release
+pip install neural-compressor-pt==3.7
+# Use `pip install git+https://github.com/intel/auto-round.git@v0.8.0rc2` for the latest updates before auto-round v0.9.3 release
+pip install auto-round==0.9.3
 bash setup.sh
 ```
 

From 346fc618ad70b7f31fcbde05e551de0483ab4198 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni" <mengni.wang@intel.com>
Date: Tue, 9 Dec 2025 16:37:53 +0800
Subject: [PATCH 7/8] llama4: point README pre-release install hints at master/main

---
 .../quantization/auto_round/llama4/README.md                  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
index a948cbc6894..267ccda454b 100644
--- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
+++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md
@@ -11,9 +11,9 @@ docker run -d --gpus all -v ... --shm-size=100g --name llama4 -it nvcr.io/nvidia
 docker exec -it llama4 bash
 git clone https://github.com/intel/neural-compressor.git
 cd neural-compressor/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4
-# Use `INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@v3.6rc` for the latest updates before neural-compressor v3.7 release
+# Use `INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@master` for the latest updates before neural-compressor v3.7 release
 pip install neural-compressor-pt==3.7
-# Use `pip install git+https://github.com/intel/auto-round.git@v0.8.0rc2` for the latest updates before auto-round v0.9.3 release
+# Use `pip install git+https://github.com/intel/auto-round.git@main` for the latest updates before auto-round v0.9.3 release
 pip install auto-round==0.9.3
 bash setup.sh
 ```

From 738b977522927bdbbb439a80974e72c5a49d0d8a Mon Sep 17 00:00:00 2001
From: "Wang, Mengni" <mengni.wang@intel.com>
Date: Fri, 12 Dec 2025 13:35:23 +0800
Subject: [PATCH 8/8] llama4: uninstall flash_attn at the end of setup.sh

---
 .../multimodal-modeling/quantization/auto_round/llama4/setup.sh  | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh
index 72fa5a6c76e..6f5e90e386f 100644
--- a/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh
+++ b/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/setup.sh
@@ -4,3 +4,4 @@ pip install packaging --upgrade
 pip install -U "huggingface_hub[cli]"
 git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
 VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv
+pip uninstall flash_attn -y