From b21b4f68279308eb781aa78b6136cdc193a3a272 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Mon, 22 Jun 2026 13:45:34 +0200
Subject: [PATCH 01/80] expose latencies with the speedup in
 OnnxDiscrepancyCheck

---
 olive/passes/onnx/discrepancy_check.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 661100d1c..f4ce31210 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -293,7 +293,7 @@ def _run_for_config(
 
         # Measure inference speedup (ONNX vs PyTorch) on the target device
         if config.timing_iterations > 0:
-            self._measure_speedup(
+            timing = self._measure_speedup(
                 ref_model,
                 session,
                 dataloader,
@@ -302,6 +302,11 @@ def _run_for_config(
                 config.warmup_iterations,
                 config.timing_iterations,
             )
+            if timing is not None:
+                pytorch_time, onnx_time, speedup = timing
+                results["pytorch_latency_s"] = pytorch_time
+                results["onnx_latency_s"] = onnx_time
+                results["speedup"] = speedup
         else:
             logger.info(
                 "OnnxDiscrepancyCheck speedup measurement skipped because timing_iterations=%d.",
@@ -357,7 +362,12 @@ def _run_for_config(
     def _measure_speedup(
         self, ref_model, session, dataloader, io_config, torch_device, warmup_iterations, timing_iterations
     ):
-        """Measure inference speedup of ONNX over PyTorch on the target device."""
+        """Measure inference latencies and speedup of ONNX over PyTorch on the target device.
+
+        Returns a tuple ``(pytorch_time, onnx_time, speedup)`` of the average PyTorch and ONNX
+        per-iteration latencies (in seconds) and the ONNX-over-PyTorch speedup, or ``None`` when
+        measurement is skipped.
+        """
         if timing_iterations <= 0:
             logger.info(
                 "OnnxDiscrepancyCheck speedup measurement skipped because timing_iterations=%d.",
@@ -419,7 +429,7 @@ def _measure_speedup(
             torch_device,
         )
 
-        return speedup
+        return pytorch_time, onnx_time, speedup
 
     def compare_generation(self, config: type[BasePassConfig], ref_model) -> int:
         """Run generation on both transformers and GenAI, return longest common token sequence length."""

From 72c5d4e40584ef808c977694dfd5da542cae8ef0 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 11:58:34 +0000
Subject: [PATCH 02/80] Extend discrepancy check unit test for latency tuple

---
 test/passes/onnx/test_discrepancy_check.py | 29 ++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/test/passes/onnx/test_discrepancy_check.py b/test/passes/onnx/test_discrepancy_check.py
index c6a2f83eb..31e738f6f 100644
--- a/test/passes/onnx/test_discrepancy_check.py
+++ b/test/passes/onnx/test_discrepancy_check.py
@@ -192,3 +192,32 @@ def test_measure_speedup_skips_when_timing_iterations_is_zero(self):
         assert result is None
         ref_model.assert_not_called()
         session.run.assert_not_called()
+
+    def test_measure_speedup_returns_latencies_and_speedup(self):
+        import torch
+
+        from olive.passes.onnx.discrepancy_check import OnnxDiscrepancyCheck
+
+        pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck)
+        ref_model = MagicMock()
+        session = MagicMock()
+        input_data = {"input_ids": torch.tensor([[1, 2, 3]], dtype=torch.int64)}
+        dataloader = [(input_data, None)]
+
+        with (
+            patch("olive.common.utils.format_data", return_value={"input_ids": [1, 2, 3]}),
+            patch("olive.passes.onnx.discrepancy_check.time.perf_counter", side_effect=[10.0, 14.0, 20.0, 22.0]),
+        ):
+            result = pass_instance._measure_speedup(
+                ref_model=ref_model,
+                session=session,
+                dataloader=dataloader,
+                io_config=MagicMock(),
+                torch_device=torch.device("cpu"),
+                warmup_iterations=1,
+                timing_iterations=2,
+            )
+
+        assert result == (2.0, 1.0, 2.0)
+        assert ref_model.call_count == 3
+        assert session.run.call_count == 3

From bf0a9781d113cdc2b088f305f123ab3e8f4f3a4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Mon, 22 Jun 2026 14:02:13 +0200
Subject: [PATCH 03/80] add time to first token in OnnxDiscrepancyCheck

---
 olive/passes/onnx/discrepancy_check.py     | 89 ++++++++++++++++++----
 test/passes/onnx/test_discrepancy_check.py | 16 +++-
 2 files changed, 89 insertions(+), 16 deletions(-)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 661100d1c..d4fd2e194 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -56,6 +56,11 @@ def _longest_common_token_sequence(seq_a: list[int], seq_b: list[int]) -> int:
     return length
 
 
+def _format_seconds(value: Optional[float]) -> str:
+    """Format an optional latency value (in seconds) for logging."""
+    return "n/a" if value is None else f"{value:.4f}s"
+
+
 class OnnxDiscrepancyCheck(Pass):
     """Validates ONNX model outputs against a reference PyTorch model.
 
@@ -68,6 +73,8 @@ class OnnxDiscrepancyCheck(Pass):
     - Inference speedup of ONNX over PyTorch on the target device (or CPU fallback)
     - Longest common token sequence from the beginning between transformers
       generate and ONNX Runtime GenAI generate (when enabled)
+    - Time-to-first-token and time-to-first-N-tokens latencies for both transformers
+      and ONNX Runtime GenAI generation (when enabled)
 
     The pass status is marked as failed if any configured threshold is exceeded.
     """
@@ -151,6 +158,14 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
                 default_value=32,
                 description="Maximum number of new tokens to generate for the token sequence comparison.",
             ),
+            "time_to_first_n_tokens": PassConfigParam(
+                type_=int,
+                default_value=5,
+                description=(
+                    "Number of leading generated tokens used for the time-to-first-N-tokens latency "
+                    "measurement reported for both transformers and ONNX Runtime GenAI."
+                ),
+            ),
             "min_longest_common_tokens": PassConfigParam(
                 type_=Optional[int],
                 default_value=None,
@@ -331,8 +346,9 @@ def _run_for_config(
 
         # Generation token sequence comparison (transformers vs ONNX Runtime GenAI)
         if config.genai_model_path:
-            longest_common = self.compare_generation(config, ref_model)
-            results["longest_common_token_sequence"] = longest_common
+            gen_results = self.compare_generation(config, ref_model)
+            longest_common = gen_results["longest_common_token_sequence"]
+            results.update(gen_results)
             results["genai_model_path"] = config.genai_model_path
             if config.min_longest_common_tokens is not None and longest_common < config.min_longest_common_tokens:
                 results["status"] = "failed"
@@ -421,8 +437,13 @@ def _measure_speedup(
 
         return speedup
 
-    def compare_generation(self, config: type[BasePassConfig], ref_model) -> int:
-        """Run generation on both transformers and GenAI, return longest common token sequence length."""
+    def compare_generation(self, config: type[BasePassConfig], ref_model) -> dict:
+        """Run generation on both transformers and GenAI and compare them.
+
+        Returns a dict with the longest common token sequence length and the time-to-first-token
+        and time-to-first-N-tokens latencies (in seconds) for both transformers and ONNX Runtime
+        GenAI, where N is ``config.time_to_first_n_tokens``.
+        """
         try:
             import onnxruntime_genai as og
         except ImportError as exc:
@@ -431,17 +452,35 @@ def compare_generation(self, config: type[BasePassConfig], ref_model) -> int:
 
         tokenizer = AutoTokenizer.from_pretrained(config.reference_model_path)
 
+        max_new_tokens = config.generate_max_new_tokens
+        first_n = max(1, min(config.time_to_first_n_tokens, max_new_tokens))
+
         # Transformers generation
         input_ids = tokenizer(config.generate_prompt, return_tensors="pt").input_ids
-        input_ids = input_ids.to(ref_model.device)
         import torch
 
-        with torch.no_grad():
-            transformers_output = ref_model.generate(
-                input_ids,
-                max_new_tokens=config.generate_max_new_tokens,
-                do_sample=False,
-            )
+        input_ids = input_ids.to(ref_model.device)
+        use_cuda_sync = ref_model.device.type == "cuda"
+
+        def _time_transformers_generate(num_new_tokens):
+            with torch.no_grad():
+                if use_cuda_sync:
+                    torch.cuda.synchronize()
+                start = time.perf_counter()
+                output = ref_model.generate(
+                    input_ids,
+                    max_new_tokens=num_new_tokens,
+                    do_sample=False,
+                )
+                if use_cuda_sync:
+                    torch.cuda.synchronize()
+                elapsed = time.perf_counter() - start
+            return output, elapsed
+
+        # Time to first token and time to first N tokens (separate timed runs).
+        _, transformers_ttft = _time_transformers_generate(1)
+        _, transformers_ttfn = _time_transformers_generate(first_n)
+        transformers_output, _ = _time_transformers_generate(max_new_tokens)
         transformers_tokens = transformers_output[0].cpu().tolist()
 
         # ONNX Runtime GenAI generation
@@ -450,26 +489,48 @@ def compare_generation(self, config: type[BasePassConfig], ref_model) -> int:
         genai_input_ids = genai_tokenizer.encode(config.generate_prompt)
 
         params = og.GeneratorParams(genai_model)
-        params.set_search_options(max_length=len(genai_input_ids) + config.generate_max_new_tokens, do_sample=False)
+        params.set_search_options(max_length=len(genai_input_ids) + max_new_tokens, do_sample=False)
 
         generator = og.Generator(genai_model, params)
         generator.append_tokens([genai_input_ids])
         genai_tokens = list(genai_input_ids)
+        genai_ttft = None
+        genai_ttfn = None
+        num_generated = 0
+        start = time.perf_counter()
         while not generator.is_done():
             generator.generate_next_token()
             genai_tokens.append(generator.get_next_tokens()[0])
+            num_generated += 1
+            if num_generated == 1:
+                genai_ttft = time.perf_counter() - start
+            if num_generated == first_n:
+                genai_ttfn = time.perf_counter() - start
         del generator
 
         longest_common = _longest_common_token_sequence(transformers_tokens, genai_tokens)
 
+        gen_results = {
+            "longest_common_token_sequence": longest_common,
+            "time_to_first_n_tokens": first_n,
+            "transformers_time_to_first_token_s": transformers_ttft,
+            "transformers_time_to_first_n_tokens_s": transformers_ttfn,
+            "genai_time_to_first_token_s": genai_ttft,
+            "genai_time_to_first_n_tokens_s": genai_ttfn,
+        }
+
         gen_summary = (
             f"OnnxDiscrepancyCheck generation comparison: "
             f"transformers_len={len(transformers_tokens)}, genai_len={len(genai_tokens)}, "
-            f"longest_common_token_sequence={longest_common}"
+            f"longest_common_token_sequence={longest_common}, "
+            f"transformers_ttft={transformers_ttft:.4f}s, "
+            f"transformers_time_to_first_{first_n}_tokens={transformers_ttfn:.4f}s, "
+            f"genai_ttft={_format_seconds(genai_ttft)}, "
+            f"genai_time_to_first_{first_n}_tokens={_format_seconds(genai_ttfn)}"
         )
         logger.info(gen_summary)
 
-        return longest_common
+        return gen_results
 
     def _export_reference_model(self, ref_model, output_model_path: str):
         """Save the reference PyTorch model weights for direct comparison."""
diff --git a/test/passes/onnx/test_discrepancy_check.py b/test/passes/onnx/test_discrepancy_check.py
index c6a2f83eb..59a1867fd 100644
--- a/test/passes/onnx/test_discrepancy_check.py
+++ b/test/passes/onnx/test_discrepancy_check.py
@@ -60,6 +60,7 @@ def test_compare_generation_returns_common_prefix_length(self):
         config.genai_model_path = "mock_genai_model"
         config.generate_prompt = "Hello world"
         config.generate_max_new_tokens = 10
+        config.time_to_first_n_tokens = 5
 
         # Mock transformers tokenizer and model
         mock_tokenizer = MagicMock()
@@ -107,7 +108,17 @@ def get_next_tokens_side_effect():
 
         mock_generator.append_tokens.assert_called_once_with([[1, 2, 3]])
         # Common prefix: [1, 2, 3, 10, 11] = 5 tokens before divergence
-        assert result == 5
+        assert result["longest_common_token_sequence"] == 5
+        # Latency metrics are exposed for both transformers and ONNX Runtime GenAI.
+        assert result["time_to_first_n_tokens"] == 5
+        for key in (
+            "transformers_time_to_first_token_s",
+            "transformers_time_to_first_n_tokens_s",
+        ):
+            assert key in result
+            assert isinstance(result[key], float)
+        for key in ("genai_time_to_first_token_s", "genai_time_to_first_n_tokens_s"):
+            assert key in result
 
     def test_compare_generation_fully_matching(self):
         """Test when both outputs are identical."""
@@ -120,6 +131,7 @@ def test_compare_generation_fully_matching(self):
         config.genai_model_path = "mock_genai_model"
         config.generate_prompt = "Test"
         config.generate_max_new_tokens = 5
+        config.time_to_first_n_tokens = 5
 
         mock_tokenizer = MagicMock()
         mock_tokenizer.return_value = MagicMock(input_ids=torch.tensor([[10, 20]]))
@@ -162,7 +174,7 @@ def get_next_tokens_side_effect():
 
         mock_generator.append_tokens.assert_called_once_with([[10, 20]])
         # All 5 tokens match
-        assert result == 5
+        assert result["longest_common_token_sequence"] == 5
 
 
 class TestSpeedupSettings:

From 804bb92762c4d8de35064e084cf16c9af09e272a Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 12:43:04 +0000
Subject: [PATCH 04/80] Add return type annotation to _measure_speedup

---
 olive/passes/onnx/discrepancy_check.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index f4ce31210..4e0a603f4 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -361,7 +361,7 @@ def _run_for_config(
 
     def _measure_speedup(
         self, ref_model, session, dataloader, io_config, torch_device, warmup_iterations, timing_iterations
-    ):
+    ) -> tuple[float, float, float] | None:
         """Measure inference latencies and speedup of ONNX over PyTorch on the target device.
 
         Returns a tuple ``(pytorch_time, onnx_time, speedup)`` of the average PyTorch and ONNX

From 1bdee25ff539c74d8af4298a2c217f1f0476aeb6 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:02:53 +0000
Subject: [PATCH 05/80] Add latency key assertions to fully matching
 discrepancy test

---
 test/passes/onnx/test_discrepancy_check.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/test/passes/onnx/test_discrepancy_check.py b/test/passes/onnx/test_discrepancy_check.py
index 59a1867fd..cad91f434 100644
--- a/test/passes/onnx/test_discrepancy_check.py
+++ b/test/passes/onnx/test_discrepancy_check.py
@@ -175,6 +175,19 @@ def get_next_tokens_side_effect():
         mock_generator.append_tokens.assert_called_once_with([[10, 20]])
         # All 5 tokens match
         assert result["longest_common_token_sequence"] == 5
+        assert result["time_to_first_n_tokens"] == 5
+        for key in (
+            "transformers_time_to_first_token_s",
+            "transformers_time_to_first_n_tokens_s",
+        ):
+            assert key in result
+            assert isinstance(result[key], float)
+        assert "genai_time_to_first_token_s" in result
+        assert isinstance(result["genai_time_to_first_token_s"], float)
+        assert "genai_time_to_first_n_tokens_s" in result
+        assert result["genai_time_to_first_n_tokens_s"] is None or isinstance(
+            result["genai_time_to_first_n_tokens_s"], float
+        )
 
 
 class TestSpeedupSettings:

From 142ddea487a74d8cfb7c87f504767433af806e5c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:43:51 +0000
Subject: [PATCH 06/80] Handle zero max_new_tokens in generation metrics

---
 olive/passes/onnx/discrepancy_check.py     | 14 ++++---
 test/passes/onnx/test_discrepancy_check.py | 44 ++++++++++++++++++++++
 2 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index d4fd2e194..01c501539 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -453,7 +453,7 @@ def compare_generation(self, config: type[BasePassConfig], ref_model) -> dict:
         tokenizer = AutoTokenizer.from_pretrained(config.reference_model_path)
 
         max_new_tokens = config.generate_max_new_tokens
-        first_n = max(1, min(config.time_to_first_n_tokens, max_new_tokens))
+        first_n = max(1, min(config.time_to_first_n_tokens, max_new_tokens)) if max_new_tokens > 0 else 0
 
         # Transformers generation
         input_ids = tokenizer(config.generate_prompt, return_tensors="pt").input_ids
@@ -478,8 +478,12 @@ def _time_transformers_generate(num_new_tokens):
             return output, elapsed
 
         # Time to first token and time to first N tokens (separate timed runs).
-        _, transformers_ttft = _time_transformers_generate(1)
-        _, transformers_ttfn = _time_transformers_generate(first_n)
+        if max_new_tokens > 0:
+            _, transformers_ttft = _time_transformers_generate(1)
+            _, transformers_ttfn = _time_transformers_generate(first_n)
+        else:
+            transformers_ttft = None
+            transformers_ttfn = None
         transformers_output, _ = _time_transformers_generate(max_new_tokens)
         transformers_tokens = transformers_output[0].cpu().tolist()
 
@@ -523,8 +527,8 @@ def _time_transformers_generate(num_new_tokens):
             f"OnnxDiscrepancyCheck generation comparison: "
             f"transformers_len={len(transformers_tokens)}, genai_len={len(genai_tokens)}, "
             f"longest_common_token_sequence={longest_common}, "
-            f"transformers_ttft={transformers_ttft:.4f}s, "
-            f"transformers_time_to_first_{first_n}_tokens={transformers_ttfn:.4f}s, "
+            f"transformers_ttft={_format_seconds(transformers_ttft)}, "
+            f"transformers_time_to_first_{first_n}_tokens={_format_seconds(transformers_ttfn)}, "
             f"genai_ttft={_format_seconds(genai_ttft)}, "
             f"genai_time_to_first_{first_n}_tokens={_format_seconds(genai_ttfn)}"
         )
diff --git a/test/passes/onnx/test_discrepancy_check.py b/test/passes/onnx/test_discrepancy_check.py
index cad91f434..9608182ff 100644
--- a/test/passes/onnx/test_discrepancy_check.py
+++ b/test/passes/onnx/test_discrepancy_check.py
@@ -189,6 +189,50 @@ def get_next_tokens_side_effect():
             result["genai_time_to_first_n_tokens_s"], float
         )
 
+    def test_compare_generation_with_zero_max_new_tokens(self):
+        """Test that latency metrics are skipped when max_new_tokens is zero."""
+        import torch
+
+        from olive.passes.onnx.discrepancy_check import OnnxDiscrepancyCheck
+
+        config = MagicMock()
+        config.reference_model_path = "mock_model"
+        config.genai_model_path = "mock_genai_model"
+        config.generate_prompt = "Test"
+        config.generate_max_new_tokens = 0
+        config.time_to_first_n_tokens = 5
+
+        mock_tokenizer = MagicMock()
+        mock_tokenizer.return_value = MagicMock(input_ids=torch.tensor([[10, 20]]))
+
+        mock_ref_model = MagicMock()
+        mock_ref_model.device = torch.device("cpu")
+        mock_ref_model.generate.return_value = torch.tensor([[10, 20]])
+
+        mock_og = MagicMock()
+        mock_og.Model.return_value = MagicMock()
+        mock_genai_tokenizer = MagicMock()
+        mock_og.Tokenizer.return_value = mock_genai_tokenizer
+        mock_genai_tokenizer.encode.return_value = [10, 20]
+        mock_og.GeneratorParams.return_value = MagicMock()
+
+        mock_generator = MagicMock()
+        mock_generator.is_done.return_value = True
+        mock_og.Generator.return_value = mock_generator
+
+        with (
+            patch.dict(sys.modules, {"onnxruntime_genai": mock_og}),
+            patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer),
+        ):
+            pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck)
+            result = pass_instance.compare_generation(config, mock_ref_model)
+
+        assert mock_ref_model.generate.call_count == 1
+        assert mock_ref_model.generate.call_args.kwargs["max_new_tokens"] == 0
+        assert result["time_to_first_n_tokens"] == 0
+        assert result["transformers_time_to_first_token_s"] is None
+        assert result["transformers_time_to_first_n_tokens_s"] is None
+
 
 class TestSpeedupSettings:
     def test_timing_iterations_default_is_5(self):

From 39cac1cebb4910a9c586cc6b5f62292c28499998 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:45:07 +0000
Subject: [PATCH 07/80] Use single measured transformers generation for latency
 metrics

---
 olive/passes/onnx/discrepancy_check.py | 47 +++++++++++++++-----------
 1 file changed, 28 insertions(+), 19 deletions(-)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 01c501539..ea9a9bffe 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -448,7 +448,7 @@ def compare_generation(self, config: type[BasePassConfig], ref_model) -> dict:
             import onnxruntime_genai as og
         except ImportError as exc:
             raise ImportError("Please install `onnxruntime-genai` to enable generation comparison.") from exc
-        from transformers import AutoTokenizer
+        from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList
 
         tokenizer = AutoTokenizer.from_pretrained(config.reference_model_path)
 
@@ -462,29 +462,38 @@ def compare_generation(self, config: type[BasePassConfig], ref_model) -> dict:
         input_ids = input_ids.to(ref_model.device)
         use_cuda_sync = ref_model.device.type == "cuda"
 
-        def _time_transformers_generate(num_new_tokens):
-            with torch.no_grad():
-                if use_cuda_sync:
-                    torch.cuda.synchronize()
-                start = time.perf_counter()
-                output = ref_model.generate(
-                    input_ids,
-                    max_new_tokens=num_new_tokens,
-                    do_sample=False,
-                )
-                if use_cuda_sync:
-                    torch.cuda.synchronize()
-                elapsed = time.perf_counter() - start
-            return output, elapsed
+        prompt_token_count = input_ids.shape[-1]
+        transformers_latency = {"start": None, "ttft": None, "ttfn": None}
+
+        class _TransformersLatencyStopCriteria(StoppingCriteria):
+            def __call__(self, generated_ids, scores, **kwargs) -> bool:
+                generated_token_count = generated_ids.shape[-1] - prompt_token_count
+                if generated_token_count >= 1 and transformers_latency["ttft"] is None:
+                    transformers_latency["ttft"] = time.perf_counter() - transformers_latency["start"]
+                if generated_token_count >= first_n and transformers_latency["ttfn"] is None:
+                    transformers_latency["ttfn"] = time.perf_counter() - transformers_latency["start"]
+                return False
 
-        # Time to first token and time to first N tokens (separate timed runs).
+        with torch.no_grad():
+            if use_cuda_sync:
+                torch.cuda.synchronize()
+            start = time.perf_counter()
+            transformers_latency["start"] = start
+            transformers_output = ref_model.generate(
+                input_ids,
+                max_new_tokens=max_new_tokens,
+                do_sample=False,
+                stopping_criteria=StoppingCriteriaList([_TransformersLatencyStopCriteria()]),
+            )
+            if use_cuda_sync:
+                torch.cuda.synchronize()
+            transformers_elapsed = time.perf_counter() - start
         if max_new_tokens > 0:
-            _, transformers_ttft = _time_transformers_generate(1)
-            _, transformers_ttfn = _time_transformers_generate(first_n)
+            transformers_ttft = transformers_latency["ttft"] or transformers_elapsed
+            transformers_ttfn = transformers_latency["ttfn"] or transformers_elapsed
         else:
             transformers_ttft = None
             transformers_ttfn = None
-        transformers_output, _ = _time_transformers_generate(max_new_tokens)
         transformers_tokens = transformers_output[0].cpu().tolist()
 
         # ONNX Runtime GenAI generation

From 6b5b65236fc6c1b5681dfb72c9427d6815f63db3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Mon, 22 Jun 2026 16:35:33 +0200
Subject: [PATCH 08/80] extend command line --test to trigger speedup measurement

---
 olive/cli/base.py     | 45 +++++++++++++++++++++++++++++----
 olive/cli/run.py      |  4 ++-
 test/cli/test_base.py | 59 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 102 insertions(+), 6 deletions(-)

diff --git a/olive/cli/base.py b/olive/cli/base.py
index 50f1e55bf..fda595367 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -22,6 +22,9 @@
 
 TEST_OUTPUT_MARKER_FILE = "olive_test_output.json"
 
+# Metrics that --test can evaluate via the injected OnnxDiscrepancyCheck pass.
+TEST_METRICS = ("mae", "speedup")
+
 
 def _get_test_output_marker_path(output_path: str) -> Path:
     return Path(output_path) / TEST_OUTPUT_MARKER_FILE
@@ -67,8 +70,19 @@ def mark_test_output_path(output_path: Optional[str]) -> None:
     _get_test_output_marker_path(output_path).write_text(json.dumps({"type": "olive_hf_test_output"}, indent=2))
 
 
-def add_discrepancy_check_pass(run_config: dict) -> dict:
-    """Inject OnnxDiscrepancyCheck pass when --test is active and not already configured."""
+def warn_unused_test_metrics(test, metrics: Optional[list]) -> None:
+    """Warn when --test_metrics is provided without --test, since it has no effect."""
+    if metrics and test in (None, False):
+        logger.warning("--test_metrics is ignored because --test is not enabled.")
+
+
+def add_discrepancy_check_pass(run_config: dict, metrics: Optional[list] = None) -> dict:
+    """Inject OnnxDiscrepancyCheck pass when --test is active and not already configured.
+
+    ``metrics`` selects which test metrics to evaluate. Supported values are defined in
+    ``TEST_METRICS`` (``"mae"`` for the max-absolute-error accuracy check and ``"speedup"`` for the
+    ONNX-vs-PyTorch latency measurement). When ``None``, all metrics are evaluated.
+    """
     passes = run_config.get("passes", {})
     # Skip if already configured
     for pass_config in passes.values():
@@ -86,12 +100,21 @@ def add_discrepancy_check_pass(run_config: dict) -> dict:
     if report_dir and Path(report_dir).suffix and not Path(report_dir).is_dir():
         report_dir = str(Path(report_dir).parent)
     logger.debug("Adding OnnxDiscrepancyCheck pass with reference_model_path=%s", reference_model_path)
-    passes["discrepancy_check"] = {
+
+    selected_metrics = set(metrics) if metrics else set(TEST_METRICS)
+    pass_config = {
         "type": "OnnxDiscrepancyCheck",
         "reference_model_path": reference_model_path,
-        "max_mae": 0.1,
         "report_output_dir": report_dir,
     }
+    # Enforce the max-absolute-error threshold only when the accuracy metric is requested.
+    if "mae" in selected_metrics:
+        pass_config["max_mae"] = 0.1
+    # Disable the latency/speedup measurement when the speedup metric is not requested.
+    if "speedup" not in selected_metrics:
+        pass_config["timing_iterations"] = 0
+
+    passes["discrepancy_check"] = pass_config
     run_config["passes"] = passes
     return run_config
 
@@ -135,12 +158,13 @@ def _run_workflow(self):
         from olive.workflows import run as olive_run
 
         validate_test_output_path(self.args.output_path, getattr(self.args, "test", None))
+        warn_unused_test_metrics(getattr(self.args, "test", None), getattr(self.args, "test_metrics", None))
         Path(self.args.output_path).mkdir(parents=True, exist_ok=True)
 
         with tempfile.TemporaryDirectory(prefix="olive-cli-tmp-", dir=self.args.output_path) as tempdir:
             run_config = self._get_run_config(tempdir)
             if getattr(self.args, "test", None) not in (None, False):
-                run_config = add_discrepancy_check_pass(run_config)
+                run_config = add_discrepancy_check_pass(run_config, getattr(self.args, "test_metrics", None))
             if self.args.save_config_file or self.args.dry_run:
                 self._save_config_file(run_config)
             if self.args.dry_run:
@@ -505,6 +529,17 @@ def add_input_model_options(
                 "Optionally provide a folder where the generated test model should be saved and reused."
             ),
         )
+        model_group.add_argument(
+            "--test_metrics",
+            type=str,
+            nargs="+",
+            choices=list(TEST_METRICS),
+            help=(
+                "Metrics to evaluate during a --test run: 'mae' enforces the max absolute error between the "
+                "ONNX and reference model outputs, and 'speedup' measures ONNX-vs-PyTorch inference latency. "
+                "Defaults to all metrics. Only used together with --test."
+            ),
+        )
 
     if enable_hf_adapter:
         assert enable_hf, "enable_hf must be True when enable_hf_adapter is True."
diff --git a/olive/cli/run.py b/olive/cli/run.py
index 8554ddbe4..7d173f3c0 100644
--- a/olive/cli/run.py
+++ b/olive/cli/run.py
@@ -15,6 +15,7 @@
     mark_test_output_path,
     save_discrepancy_check_results,
     validate_test_output_path,
+    warn_unused_test_metrics,
 )
 from olive.telemetry import action
 
@@ -83,8 +84,9 @@ def run(self):
 
         output_path = run_config.get("output_dir") or run_config.get("engine", {}).get("output_dir")
         validate_test_output_path(output_path, self.args.test)
+        warn_unused_test_metrics(self.args.test, getattr(self.args, "test_metrics", None))
         if self.args.test not in (None, False):
-            run_config = add_discrepancy_check_pass(run_config)
+            run_config = add_discrepancy_check_pass(run_config, getattr(self.args, "test_metrics", None))
         workflow_output = olive_run(
             run_config,
             list_required_packages=self.args.list_required_packages,
diff --git a/test/cli/test_base.py b/test/cli/test_base.py
index bb34cef3f..b09fa96d5 100644
--- a/test/cli/test_base.py
+++ b/test/cli/test_base.py
@@ -338,3 +338,62 @@ def test_get_input_model_config_no_crash_without_onnx_file_name(tmp_path):
 
     # model_path should remain unchanged since no onnx_file_name to guide rewriting
     assert config["config"]["model_path"] == stale_model_path
+
+
+def _discrepancy_run_config():
+    return {
+        "input_model": {"type": "HfModel", "test_model_path": "ref_model"},
+        "output_dir": "out_dir",
+    }
+
+
+def test_add_discrepancy_check_pass_default_enables_all_metrics():
+    from olive.cli.base import add_discrepancy_check_pass
+
+    run_config = add_discrepancy_check_pass(_discrepancy_run_config())
+
+    pass_config = run_config["passes"]["discrepancy_check"]
+    assert pass_config["type"] == "OnnxDiscrepancyCheck"
+    assert pass_config["reference_model_path"] == "ref_model"
+    # mae metric -> threshold enforced; speedup metric -> timing not disabled
+    assert pass_config["max_mae"] == 0.1
+    assert "timing_iterations" not in pass_config
+
+
+def test_add_discrepancy_check_pass_speedup_only_disables_mae():
+    from olive.cli.base import add_discrepancy_check_pass
+
+    run_config = add_discrepancy_check_pass(_discrepancy_run_config(), metrics=["speedup"])
+
+    pass_config = run_config["passes"]["discrepancy_check"]
+    assert "max_mae" not in pass_config
+    assert "timing_iterations" not in pass_config
+
+
+def test_add_discrepancy_check_pass_mae_only_disables_speedup():
+    from olive.cli.base import add_discrepancy_check_pass
+
+    run_config = add_discrepancy_check_pass(_discrepancy_run_config(), metrics=["mae"])
+
+    pass_config = run_config["passes"]["discrepancy_check"]
+    assert pass_config["max_mae"] == 0.1
+    assert pass_config["timing_iterations"] == 0
+
+
+def test_warn_unused_test_metrics_logs_when_test_disabled():
+    from olive.cli.base import warn_unused_test_metrics
+
+    with patch("olive.cli.base.logger") as mock_logger:
+        warn_unused_test_metrics(test=None, metrics=["speedup"])
+
+    mock_logger.warning.assert_called_once()
+    assert "--test_metrics is ignored" in mock_logger.warning.call_args[0][0]
+
+
+def test_warn_unused_test_metrics_silent_when_test_enabled():
+    from olive.cli.base import warn_unused_test_metrics
+
+    with patch("olive.cli.base.logger") as mock_logger:
+        warn_unused_test_metrics(test=True, metrics=["speedup"])
+
+    mock_logger.warning.assert_not_called()

From 15287d831edc19e7e1529340cbcdb2f53fbc7de9 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 14:47:04 +0000
Subject: [PATCH 09/80] Document --test_metrics speedup usage

---
 docs/source/how-to/cli/cli-fast-test.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/docs/source/how-to/cli/cli-fast-test.md b/docs/source/how-to/cli/cli-fast-test.md
index 49fa13055..9557e5253 100644
--- a/docs/source/how-to/cli/cli-fast-test.md
+++ b/docs/source/how-to/cli/cli-fast-test.md
@@ -50,6 +50,23 @@ This is a quick way to confirm that:
 
 If you omit the folder and just pass `--test`, `olive run` will save the reduced model under `<output_path>/test_model`.
 
+### Optional: choose which `--test` metrics to run
+
+By default, `--test` evaluates both:
+
+- `mae`: maximum absolute error between the ONNX and reference model outputs
+- `speedup`: ONNX-vs-PyTorch latency measurement
+
+You can select a subset with `--test_metrics`. For example, to run only speedup checks:
+
+```bash
+olive run \
+    --config out/qwen/config.json \
+    --test out/qwen-test-model \
+    --test_metrics speedup \
+    --output_path out/qwen-test-run
+```
+
 ## Step 3: run the full conversion
 
 Once the smoke test succeeds, rerun the conversion on the full Qwen checkpoint by removing `--test`.

From 89d98c43fbadbe1fa5cda302789ba0f4802ebc77 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 15:24:19 +0000
Subject: [PATCH 10/80] Fix default test metrics to be mae-only, make speedup
 opt-in

---
 olive/cli/base.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/olive/cli/base.py b/olive/cli/base.py
index fda595367..82632ca1d 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -81,7 +81,8 @@ def add_discrepancy_check_pass(run_config: dict, metrics: Optional[list] = None)
 
     ``metrics`` selects which test metrics to evaluate. Supported values are defined in
     ``TEST_METRICS`` (``"mae"`` for the max-absolute-error accuracy check and ``"speedup"`` for the
-    ONNX-vs-PyTorch latency measurement). When ``None``, all metrics are evaluated.
+    ONNX-vs-PyTorch latency measurement). When ``None``, only ``"mae"`` is evaluated; pass
+    ``["speedup"]`` or ``["mae", "speedup"]`` explicitly to enable timing.
     """
     passes = run_config.get("passes", {})
     # Skip if already configured
@@ -101,7 +102,7 @@ def add_discrepancy_check_pass(run_config: dict, metrics: Optional[list] = None)
         report_dir = str(Path(report_dir).parent)
     logger.debug("Adding OnnxDiscrepancyCheck pass with reference_model_path=%s", reference_model_path)
 
-    selected_metrics = set(metrics) if metrics else set(TEST_METRICS)
+    selected_metrics = set(metrics) if metrics else {"mae"}
     pass_config = {
         "type": "OnnxDiscrepancyCheck",
         "reference_model_path": reference_model_path,

From 7490e559c01f4aad3e8511c9e4acb6dfd897d085 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 15:58:38 +0000
Subject: [PATCH 11/80] Fix test to match new default mae-only behavior

---
 test/cli/test_base.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/cli/test_base.py b/test/cli/test_base.py
index b09fa96d5..d4ddffa89 100644
--- a/test/cli/test_base.py
+++ b/test/cli/test_base.py
@@ -347,7 +347,7 @@ def _discrepancy_run_config():
     }
 
 
-def test_add_discrepancy_check_pass_default_enables_all_metrics():
+def test_add_discrepancy_check_pass_default_enables_mae_only():
     from olive.cli.base import add_discrepancy_check_pass
 
     run_config = add_discrepancy_check_pass(_discrepancy_run_config())
@@ -355,9 +355,9 @@ def test_add_discrepancy_check_pass_default_enables_all_metrics():
     pass_config = run_config["passes"]["discrepancy_check"]
     assert pass_config["type"] == "OnnxDiscrepancyCheck"
     assert pass_config["reference_model_path"] == "ref_model"
-    # mae metric -> threshold enforced; speedup metric -> timing not disabled
+    # default: mae only -> threshold enforced, timing disabled
     assert pass_config["max_mae"] == 0.1
-    assert "timing_iterations" not in pass_config
+    assert pass_config["timing_iterations"] == 0
 
 
 def test_add_discrepancy_check_pass_speedup_only_disables_mae():

From bf96e3f5f6b3961e1f2a284a47e1e004a160bc1f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 17:15:56 +0000
Subject: [PATCH 12/80] Fix test_cli.py expected pass config to include
 timing_iterations=0

---
 test/cli/test_cli.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/cli/test_cli.py b/test/cli/test_cli.py
index 59817d830..5e666b94f 100644
--- a/test/cli/test_cli.py
+++ b/test/cli/test_cli.py
@@ -189,8 +189,9 @@ def test_workflow_run_command_with_test_override(mock_run, tmp_path):
                 "discrepancy_check": {
                     "type": "OnnxDiscrepancyCheck",
                     "reference_model_path": test_model_path,
-                    "max_mae": 0.1,
                     "report_output_dir": output_dir,
+                    "max_mae": 0.1,
+                    "timing_iterations": 0,
                 }
             },
         },

From fc5c372a69ea7dee214c06d4c13ae0a32ae1f23e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@users.noreply.github.com>
Date: Mon, 29 Jun 2026 15:29:35 +0200
Subject: [PATCH 13/80] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 olive/cli/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/olive/cli/base.py b/olive/cli/base.py
index 82632ca1d..ea4496a21 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -538,7 +538,7 @@ def add_input_model_options(
             help=(
                 "Metrics to evaluate during a --test run: 'mae' enforces the max absolute error between the "
                 "ONNX and reference model outputs, and 'speedup' measures ONNX-vs-PyTorch inference latency. "
-                "Defaults to all metrics. Only used together with --test."
+                "Defaults to 'mae'. Only used together with --test."
             ),
         )
 

From f1077b7f0017c437fb04878fa36f5bcb65ca990e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@users.noreply.github.com>
Date: Mon, 29 Jun 2026 15:29:51 +0200
Subject: [PATCH 14/80] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 olive/passes/onnx/discrepancy_check.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 3ee38a67d..f79934bd2 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -574,8 +574,8 @@ def __call__(self, generated_ids, scores, **kwargs) -> bool:
                 torch.cuda.synchronize()
             transformers_elapsed = time.perf_counter() - start
         if max_new_tokens > 0:
-            transformers_ttft = transformers_latency["ttft"] or transformers_elapsed
-            transformers_ttfn = transformers_latency["ttfn"] or transformers_elapsed
+            transformers_ttft = transformers_latency["ttft"] if transformers_latency["ttft"] is not None else transformers_elapsed
+            transformers_ttfn = transformers_latency["ttfn"] if transformers_latency["ttfn"] is not None else transformers_elapsed
         else:
             transformers_ttft = None
             transformers_ttfn = None

From c1e75e6832b60381913f21334afda09ce075581e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@users.noreply.github.com>
Date: Mon, 29 Jun 2026 15:30:13 +0200
Subject: [PATCH 15/80] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 docs/source/how-to/cli/cli-fast-test.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/how-to/cli/cli-fast-test.md b/docs/source/how-to/cli/cli-fast-test.md
index 9557e5253..6e50e2387 100644
--- a/docs/source/how-to/cli/cli-fast-test.md
+++ b/docs/source/how-to/cli/cli-fast-test.md
@@ -52,11 +52,11 @@ If you omit the folder and just pass `--test`, `olive run` will save the reduced
 
 ### Optional: choose which `--test` metrics to run
 
-By default, `--test` evaluates both:
+By default, `--test` evaluates:
 
 - `mae`: maximum absolute error between the ONNX and reference model outputs
-- `speedup`: ONNX-vs-PyTorch latency measurement
 
+Pass `--test_metrics mae speedup` to also run the latency measurement, or `--test_metrics speedup` to run it on its own.
 You can select a subset with `--test_metrics`. For example, to run only speedup checks:
 
 ```bash

From 4c73d0ac4dba8deef8fe468ca8eac7f17677c1b3 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 29 Jun 2026 13:31:10 +0000
Subject: [PATCH 16/80] feat: add llama-cpp integration to OnnxDiscrepancyCheck
 and llama_env CI step

---
 .github/workflows/test-model-fast.yml      |   6 +
 olive/passes/onnx/discrepancy_check.py     | 354 +++++++++++++++++++++
 test/passes/onnx/test_discrepancy_check.py | 171 ++++++++++
 3 files changed, 531 insertions(+)

diff --git a/.github/workflows/test-model-fast.yml b/.github/workflows/test-model-fast.yml
index 20af9b524..2041d15ab 100644
--- a/.github/workflows/test-model-fast.yml
+++ b/.github/workflows/test-model-fast.yml
@@ -30,6 +30,12 @@ jobs:
           python -m pip install -r requirements.txt
           python -m pip install -r test/requirements-test-cpu.txt
 
+      - name: Create llama_env and install llama-cpp-python
+        run: |
+          python -m venv llama_env
+          llama_env/bin/pip install --upgrade pip
+          CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_CUDA=OFF" llama_env/bin/pip install llama-cpp-python
+
       - name: pip freeze
         run: |
           python -m pip freeze
diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index f79934bd2..38bdaa7ad 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -4,6 +4,8 @@
 # --------------------------------------------------------------------------
 import json
 import logging
+import subprocess
+import tempfile
 import time
 from pathlib import Path
 from typing import Optional
@@ -115,6 +117,150 @@ def _format_seconds(value: Optional[float]) -> str:
     return "n/a" if value is None else f"{value:.4f}s"
 
 
+def _build_llama_gguf_tensor_map(num_layers: int) -> dict:
+    """Build a tensor name mapping from HuggingFace LLaMA names to GGUF names.
+
+    Returns a dict where each key is a HuggingFace state-dict tensor name and the
+    corresponding value is the GGUF tensor name expected by llama.cpp.
+    """
+    mapping = {
+        "model.embed_tokens.weight": "token_embd.weight",
+        "model.norm.weight": "output_norm.weight",
+        "lm_head.weight": "output.weight",
+    }
+    for i in range(num_layers):
+        p = f"model.layers.{i}"
+        mapping.update(
+            {
+                f"{p}.input_layernorm.weight": f"blk.{i}.attn_norm.weight",
+                f"{p}.post_attention_layernorm.weight": f"blk.{i}.ffn_norm.weight",
+                f"{p}.self_attn.q_proj.weight": f"blk.{i}.attn_q.weight",
+                f"{p}.self_attn.k_proj.weight": f"blk.{i}.attn_k.weight",
+                f"{p}.self_attn.v_proj.weight": f"blk.{i}.attn_v.weight",
+                f"{p}.self_attn.o_proj.weight": f"blk.{i}.attn_output.weight",
+                f"{p}.mlp.gate_proj.weight": f"blk.{i}.ffn_gate.weight",
+                f"{p}.mlp.up_proj.weight": f"blk.{i}.ffn_up.weight",
+                f"{p}.mlp.down_proj.weight": f"blk.{i}.ffn_down.weight",
+            }
+        )
+    return mapping
+
+
+# ---------------------------------------------------------------------------
+# Helper script executed inside the ``llama_env`` virtual environment.
+# All llama-cpp-python imports are intentionally isolated to this subprocess
+# so the main Olive process does not require llama-cpp-python.
+# ---------------------------------------------------------------------------
+_LLAMA_CPP_HELPER_SCRIPT = '''\
+"""GGUF conversion and llama.cpp inference helper for OnnxDiscrepancyCheck.
+
+This script runs inside the llama_env virtual environment via subprocess.
+It converts exported model weights to GGUF format, then measures first-token
+latency using llama-cpp-python.  Results are written as a JSON object to stdout.
+"""
+import argparse
+import json
+import sys
+import time
+
+import numpy as np
+
+
+def convert_to_gguf(weights_npz: str, config_json: str, gguf_path: str) -> None:
+    """Create a GGUF F32 file from exported NumPy weights and a config dict."""
+    import gguf
+
+    with open(config_json) as fh:
+        cfg = json.load(fh)
+
+    weights = np.load(weights_npz, allow_pickle=False)
+
+    writer = gguf.GGUFWriter(gguf_path, arch="llama")
+
+    # Required architecture metadata
+    writer.add_name("olive-discrepancy-check")
+    writer.add_context_length(cfg["max_position_embeddings"])
+    writer.add_embedding_length(cfg["hidden_size"])
+    writer.add_block_count(cfg["num_hidden_layers"])
+    writer.add_feed_forward_length(cfg["intermediate_size"])
+    writer.add_rope_dimension_count(cfg["hidden_size"] // cfg["num_attention_heads"])
+    writer.add_head_count(cfg["num_attention_heads"])
+    writer.add_head_count_kv(cfg.get("num_key_value_heads", cfg["num_attention_heads"]))
+    writer.add_layer_norm_rms_eps(cfg.get("rms_norm_eps", 1e-5))
+    writer.add_vocab_size(cfg["vocab_size"])
+    writer.add_file_type(0)  # 0 = ALL_F32
+
+    # Optional tokenizer metadata embedded in the config
+    if "tokenizer_tokens" in cfg:
+        try:
+            writer.add_tokenizer_model("llama")
+            writer.add_token_list(cfg["tokenizer_tokens"])
+            writer.add_token_scores(np.array(cfg["tokenizer_scores"], dtype=np.float32))
+            writer.add_token_types(np.array(cfg["tokenizer_types"], dtype=np.int32))
+        except Exception as exc:  # noqa: BLE001
+            print(f"Warning: could not write tokenizer metadata: {exc}", file=sys.stderr)
+
+    for name in weights.files:
+        writer.add_tensor(name, weights[name])
+
+    writer.write_header_to_file()
+    writer.write_kv_data_to_file()
+    writer.write_tensors_to_file()
+    writer.close()
+
+
+def run_inference(gguf_path: str, prompt_tokens: list, max_new_tokens: int, first_n: int) -> dict:
+    """Run greedy generation with llama.cpp and return first-token latency metrics."""
+    from llama_cpp import Llama
+
+    n_ctx = max(512, len(prompt_tokens) + max_new_tokens + 64)
+    llm = Llama(model_path=gguf_path, n_ctx=n_ctx, verbose=False)
+
+    generated = []
+    ttft = None
+    ttfn = None
+    first_token_id = None
+
+    start = time.perf_counter()
+    for token in llm.generate(prompt_tokens, top_k=1, temp=0.0, reset=True):
+        count = len(generated) + 1
+        if count == 1:
+            ttft = time.perf_counter() - start
+            first_token_id = int(token)
+        if count == first_n and ttfn is None:
+            ttfn = time.perf_counter() - start
+        generated.append(int(token))
+        if count >= max_new_tokens:
+            break
+
+    total_time = time.perf_counter() - start
+
+    return {
+        "first_token_id": first_token_id,
+        "generated_tokens": generated,
+        "ttft": ttft,
+        "ttfn": ttfn,
+        "total_time": total_time,
+    }
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="GGUF conversion and llama.cpp inference")
+    parser.add_argument("--weights_npz", required=True)
+    parser.add_argument("--config_json", required=True)
+    parser.add_argument("--gguf_path", required=True)
+    parser.add_argument("--prompt_tokens", required=True, help="JSON-encoded list of token IDs")
+    parser.add_argument("--max_new_tokens", type=int, default=32)
+    parser.add_argument("--first_n", type=int, default=5)
+    args = parser.parse_args()
+
+    prompt_tokens = json.loads(args.prompt_tokens)
+    convert_to_gguf(args.weights_npz, args.config_json, args.gguf_path)
+    result = run_inference(args.gguf_path, prompt_tokens, args.max_new_tokens, args.first_n)
+    print(json.dumps(result))
+'''
+
+
 class OnnxDiscrepancyCheck(Pass):
     """Validates ONNX model outputs against a reference PyTorch model.
 
@@ -229,6 +375,26 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
                     "below this threshold, the pass fails."
                 ),
             ),
+            "llama_cpp": PassConfigParam(
+                type_=bool,
+                default_value=False,
+                description=(
+                    "When True, convert the reference HuggingFace model to GGUF format and compare "
+                    "inference with llama.cpp. Measures first-token difference between llama.cpp and "
+                    "the reference PyTorch model as well as latency and speedup. All llama-cpp-python "
+                    "operations are executed in the ``llama_env`` virtual environment via subprocess."
+                ),
+            ),
+            "llama_cpp_env_path": PassConfigParam(
+                type_=Optional[str],
+                default_value=None,
+                description=(
+                    "Path to the virtual environment where llama-cpp-python is installed. "
+                    "Defaults to 'llama_env' relative to the current working directory when "
+                    "``llama_cpp`` is True. Create this environment with: "
+                    "``python -m venv llama_env && llama_env/bin/pip install llama-cpp-python``."
+                ),
+            ),
         }
 
     def _run_for_config(
@@ -439,6 +605,16 @@ def _run_for_config(
                 results.setdefault("failures", []).append(gen_failure)
                 logger.error("ONNX model discrepancy check FAILED: %s", gen_failure)
 
+        # llama.cpp comparison: convert reference model to GGUF and compare latencies
+        if config.llama_cpp:
+            llama_results = self.compare_llama_cpp(
+                config,
+                ref_model,
+                pytorch_latency_s=results.get("pytorch_latency_s"),
+                onnx_latency_s=results.get("onnx_latency_s"),
+            )
+            results.update(llama_results)
+
         # Save results to disk
         report_path = Path(report_dir) / "discrepancy_check_results.json"
         report_path.parent.mkdir(parents=True, exist_ok=True)
@@ -630,6 +806,184 @@ def __call__(self, generated_ids, scores, **kwargs) -> bool:
 
         return gen_results
 
+    @staticmethod
+    def _get_llama_env_python(env_path: str) -> str:
+        """Return the Python interpreter path inside the given virtual environment.
+
+        Checks both the POSIX (``bin/python``) and Windows (``Scripts/python.exe``)
+        layouts so the method works cross-platform.
+        """
+        env = Path(env_path)
+        for candidate in (env / "bin" / "python", env / "Scripts" / "python.exe"):
+            if candidate.exists():
+                return str(candidate)
+        raise RuntimeError(
+            f"Could not find a Python interpreter in the llama_env at '{env_path}'. "
+            "Create the environment with: "
+            "python -m venv llama_env && llama_env/bin/pip install llama-cpp-python"
+        )
+
+    def compare_llama_cpp(
+        self,
+        config: type[BasePassConfig],
+        ref_model,
+        pytorch_latency_s: Optional[float] = None,
+        onnx_latency_s: Optional[float] = None,
+    ) -> dict:
+        """Convert the reference model to GGUF and compare inference with llama.cpp.
+
+        All llama-cpp-python operations are executed inside the ``llama_env`` virtual
+        environment via subprocess, so the main Olive process does not need
+        llama-cpp-python installed.
+
+        The method:
+
+        1. Exports the reference model weights (using GGUF tensor names) and the
+           model config to a temporary directory.
+        2. Runs ``_LLAMA_CPP_HELPER_SCRIPT`` inside ``llama_env`` which creates the
+           GGUF file and measures first-token latency.
+        3. Returns a metrics dict with the llama.cpp results and speedup ratios
+           relative to PyTorch and ONNX when those latencies are provided.
+        """
+        import numpy as np
+        import torch
+        from transformers import AutoConfig, AutoTokenizer
+
+        # Resolve the llama_env Python interpreter
+        env_path = config.llama_cpp_env_path or "llama_env"
+        python_path = self._get_llama_env_python(env_path)
+
+        # Tokenize the generation prompt using the main-env tokenizer
+        tokenizer = AutoTokenizer.from_pretrained(config.reference_model_path)
+        encoded = tokenizer(config.generate_prompt, return_tensors="pt")
+        prompt_token_ids: list[int] = encoded["input_ids"][0].tolist()
+
+        # Run one-token generation with transformers to get the reference first token
+        input_ids = torch.tensor([prompt_token_ids]).to(ref_model.device)
+        with torch.no_grad():
+            gen_out = ref_model.generate(input_ids, max_new_tokens=1, do_sample=False)
+        pytorch_first_token_id = int(gen_out[0, -1].item())
+
+        # Export model weights with GGUF-compatible tensor names
+        hf_config = AutoConfig.from_pretrained(config.reference_model_path)
+        num_layers = hf_config.num_hidden_layers
+        tensor_map = _build_llama_gguf_tensor_map(num_layers)
+
+        state_dict = ref_model.state_dict()
+        mapped_weights: dict[str, np.ndarray] = {}
+        for hf_name, gguf_name in tensor_map.items():
+            if hf_name in state_dict:
+                mapped_weights[gguf_name] = state_dict[hf_name].float().cpu().numpy()
+
+        # Export tokenizer vocabulary so llama.cpp can load the model without errors
+        vocab_size = hf_config.vocab_size
+        tokenizer_tokens: Optional[list[str]] = None
+        tokenizer_scores: Optional[list[float]] = None
+        tokenizer_types: Optional[list[int]] = None
+        try:
+            vocab = tokenizer.get_vocab()
+            tokens_list: list[str] = [""] * vocab_size
+            for tok, idx in vocab.items():
+                if idx < vocab_size:
+                    tokens_list[idx] = tok
+            tokenizer_tokens = tokens_list
+            tokenizer_scores = [0.0] * vocab_size
+            tokenizer_types = [1] * vocab_size  # 1 = NORMAL token type
+        except Exception:
+            logger.debug("OnnxDiscrepancyCheck: could not export tokenizer vocabulary for GGUF.")
+
+        # Build the config dict for the helper script
+        gguf_cfg: dict = {
+            "max_position_embeddings": hf_config.max_position_embeddings,
+            "hidden_size": hf_config.hidden_size,
+            "num_hidden_layers": num_layers,
+            "intermediate_size": hf_config.intermediate_size,
+            "num_attention_heads": hf_config.num_attention_heads,
+            "num_key_value_heads": getattr(hf_config, "num_key_value_heads", hf_config.num_attention_heads),
+            "rms_norm_eps": getattr(hf_config, "rms_norm_eps", 1e-5),
+            "vocab_size": vocab_size,
+        }
+        if tokenizer_tokens is not None:
+            gguf_cfg["tokenizer_tokens"] = tokenizer_tokens
+            gguf_cfg["tokenizer_scores"] = tokenizer_scores
+            gguf_cfg["tokenizer_types"] = tokenizer_types
+
+        max_new_tokens = config.generate_max_new_tokens
+        first_n = max(1, min(config.time_to_first_n_tokens, max_new_tokens)) if max_new_tokens > 0 else 1
+
+        # Write temp files and invoke the helper script inside llama_env
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tmpdir_path = Path(tmpdir)
+            weights_npz = str(tmpdir_path / "weights.npz")
+            config_json = str(tmpdir_path / "config.json")
+            gguf_path = str(tmpdir_path / "model.gguf")
+            script_path = str(tmpdir_path / "llama_cpp_helper.py")
+
+            np.savez(weights_npz, **mapped_weights)
+            (tmpdir_path / "config.json").write_text(json.dumps(gguf_cfg))
+            (tmpdir_path / "llama_cpp_helper.py").write_text(_LLAMA_CPP_HELPER_SCRIPT)
+
+            proc = subprocess.run(
+                [
+                    python_path,
+                    script_path,
+                    "--weights_npz",
+                    weights_npz,
+                    "--config_json",
+                    config_json,
+                    "--gguf_path",
+                    gguf_path,
+                    "--prompt_tokens",
+                    json.dumps(prompt_token_ids),
+                    "--max_new_tokens",
+                    str(max_new_tokens),
+                    "--first_n",
+                    str(first_n),
+                ],
+                capture_output=True,
+                text=True,
+                check=True,
+            )
+
+        llama_out: dict = json.loads(proc.stdout)
+
+        llama_first_token_id: Optional[int] = llama_out.get("first_token_id")
+        llama_ttft: Optional[float] = llama_out.get("ttft")
+        llama_ttfn: Optional[float] = llama_out.get("ttfn")
+        llama_total: Optional[float] = llama_out.get("total_time")
+
+        # Speedup: compare llama.cpp TTFT with single-pass PyTorch / ONNX latency
+        llama_speedup_vs_pytorch: Optional[float] = (
+            pytorch_latency_s / llama_ttft if (pytorch_latency_s is not None and llama_ttft) else None
+        )
+        llama_speedup_vs_onnx: Optional[float] = (
+            onnx_latency_s / llama_ttft if (onnx_latency_s is not None and llama_ttft) else None
+        )
+
+        results = {
+            "llama_cpp_pytorch_first_token_id": pytorch_first_token_id,
+            "llama_cpp_first_token_id": llama_first_token_id,
+            "llama_cpp_first_token_matches_pytorch": llama_first_token_id == pytorch_first_token_id,
+            "llama_cpp_ttft_s": llama_ttft,
+            "llama_cpp_ttfn_s": llama_ttfn,
+            "llama_cpp_total_time_s": llama_total,
+            "llama_cpp_speedup_vs_pytorch": llama_speedup_vs_pytorch,
+            "llama_cpp_speedup_vs_onnx": llama_speedup_vs_onnx,
+        }
+
+        logger.info(
+            "OnnxDiscrepancyCheck llama.cpp comparison: first_token_matches_pytorch=%s, "
+            "ttft=%s, ttfn=%s, total=%s, speedup_vs_pytorch=%s, speedup_vs_onnx=%s",
+            results["llama_cpp_first_token_matches_pytorch"],
+            _format_seconds(llama_ttft),
+            _format_seconds(llama_ttfn),
+            _format_seconds(llama_total),
+            f"{llama_speedup_vs_pytorch:.2f}x" if llama_speedup_vs_pytorch is not None else "n/a",
+            f"{llama_speedup_vs_onnx:.2f}x" if llama_speedup_vs_onnx is not None else "n/a",
+        )
+
+        return results
+
     def _export_reference_model(self, ref_model, output_model_path: str):
         """Save the reference PyTorch model weights for direct comparison."""
         import torch
diff --git a/test/passes/onnx/test_discrepancy_check.py b/test/passes/onnx/test_discrepancy_check.py
index aef82c76f..19e6e2136 100644
--- a/test/passes/onnx/test_discrepancy_check.py
+++ b/test/passes/onnx/test_discrepancy_check.py
@@ -7,6 +7,8 @@
 import sys
 from unittest.mock import MagicMock, patch
 
+import pytest
+
 from olive.passes.onnx.discrepancy_check import _longest_common_token_sequence
 
 
@@ -411,3 +413,172 @@ def test_measure_speedup_returns_latencies_and_speedup(self):
         assert result == (2.0, 1.0, 2.0)
         assert ref_model.call_count == 3
         assert session.run.call_count == 3
+
+
+class TestCompareLlamaCpp:
+    """Unit tests for OnnxDiscrepancyCheck.compare_llama_cpp."""
+
+    def _make_config(self):
+        config = MagicMock()
+        config.reference_model_path = "mock_model"
+        config.generate_prompt = "Hello world"
+        config.generate_max_new_tokens = 10
+        config.time_to_first_n_tokens = 5
+        config.llama_cpp_env_path = "/mock/llama_env"
+        return config
+
+    def _make_hf_config(self):
+        hf_cfg = MagicMock()
+        hf_cfg.max_position_embeddings = 64
+        hf_cfg.hidden_size = 128
+        hf_cfg.num_hidden_layers = 2
+        hf_cfg.intermediate_size = 256
+        hf_cfg.num_attention_heads = 8
+        hf_cfg.num_key_value_heads = 8
+        hf_cfg.rms_norm_eps = 1e-5
+        hf_cfg.vocab_size = 32
+        return hf_cfg
+
+    def test_get_llama_env_python_posix(self, tmp_path):
+        """Test that the POSIX Python path is returned when it exists."""
+        from olive.passes.onnx.discrepancy_check import OnnxDiscrepancyCheck
+
+        (tmp_path / "bin").mkdir()
+        python = tmp_path / "bin" / "python"
+        python.touch()
+
+        result = OnnxDiscrepancyCheck._get_llama_env_python(str(tmp_path))
+        assert result == str(python)
+
+    def test_get_llama_env_python_missing_raises(self, tmp_path):
+        """Test that a RuntimeError is raised when no interpreter is found."""
+        import pytest
+
+        from olive.passes.onnx.discrepancy_check import OnnxDiscrepancyCheck
+
+        with pytest.raises(RuntimeError, match="llama_env"):
+            OnnxDiscrepancyCheck._get_llama_env_python(str(tmp_path))
+
+    def test_compare_llama_cpp_returns_expected_metrics(self):
+        """Test that compare_llama_cpp returns all expected keys and correct values."""
+        import json
+
+        import torch
+
+        from olive.passes.onnx.discrepancy_check import OnnxDiscrepancyCheck
+
+        config = self._make_config()
+
+        mock_ref_model = MagicMock()
+        mock_ref_model.device = torch.device("cpu")
+        # First token from transformers: 42
+        mock_ref_model.generate.return_value = torch.tensor([[1, 2, 3, 42]])
+        mock_ref_model.state_dict.return_value = {}
+
+        llama_output = {
+            "first_token_id": 42,
+            "generated_tokens": [42, 43, 44, 45, 46],
+            "ttft": 0.05,
+            "ttfn": 0.25,
+            "total_time": 0.50,
+        }
+
+        mock_proc = MagicMock()
+        mock_proc.stdout = json.dumps(llama_output)
+
+        mock_tokenizer = MagicMock()
+        mock_tokenizer.return_value = MagicMock(
+            input_ids=torch.tensor([[1, 2, 3]]),
+            __getitem__=lambda self, key: torch.tensor([[1, 2, 3]]) if key == "input_ids" else None,
+        )
+        mock_tokenizer.return_value.__getitem__ = lambda self, k: (
+            torch.tensor([[1, 2, 3]]) if k == "input_ids" else None
+        )
+        # Calling the tokenizer returns a mock whose ["input_ids"] lookup yields a tensor (not a list)
+        encoded = MagicMock()
+        encoded.__getitem__ = MagicMock(side_effect=lambda k: torch.tensor([[1, 2, 3]]) if k == "input_ids" else None)
+        mock_tokenizer.return_value = encoded
+        mock_tokenizer.get_vocab = MagicMock(return_value={})
+
+        with (
+            patch.object(OnnxDiscrepancyCheck, "_get_llama_env_python", return_value="/mock/llama_env/bin/python"),
+            patch("subprocess.run", return_value=mock_proc),
+            patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer),
+            patch("transformers.AutoConfig.from_pretrained", return_value=self._make_hf_config()),
+            patch("numpy.savez"),
+        ):
+            pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck)
+            result = pass_instance.compare_llama_cpp(
+                config,
+                mock_ref_model,
+                pytorch_latency_s=0.10,
+                onnx_latency_s=0.05,
+            )
+
+        expected_keys = {
+            "llama_cpp_first_token_id",
+            "llama_cpp_pytorch_first_token_id",
+            "llama_cpp_first_token_matches_pytorch",
+            "llama_cpp_ttft_s",
+            "llama_cpp_ttfn_s",
+            "llama_cpp_total_time_s",
+            "llama_cpp_speedup_vs_pytorch",
+            "llama_cpp_speedup_vs_onnx",
+        }
+        assert expected_keys <= set(result.keys())
+
+        assert result["llama_cpp_first_token_id"] == 42
+        assert result["llama_cpp_ttft_s"] == pytest.approx(0.05)
+        assert result["llama_cpp_ttfn_s"] == pytest.approx(0.25)
+        assert result["llama_cpp_total_time_s"] == pytest.approx(0.50)
+        # speedup = pytorch_latency / llama_ttft = 0.10 / 0.05 = 2.0
+        assert result["llama_cpp_speedup_vs_pytorch"] == pytest.approx(2.0)
+        # speedup = onnx_latency / llama_ttft = 0.05 / 0.05 = 1.0
+        assert result["llama_cpp_speedup_vs_onnx"] == pytest.approx(1.0)
+
+    def test_compare_llama_cpp_no_latency_baselines(self):
+        """Speedup fields are None when pytorch/onnx latencies are not provided."""
+        import json
+
+        import torch
+
+        from olive.passes.onnx.discrepancy_check import OnnxDiscrepancyCheck
+
+        config = self._make_config()
+
+        mock_ref_model = MagicMock()
+        mock_ref_model.device = torch.device("cpu")
+        mock_ref_model.generate.return_value = torch.tensor([[1, 2, 3, 7]])
+        mock_ref_model.state_dict.return_value = {}
+
+        llama_output = {
+            "first_token_id": 7,
+            "generated_tokens": [7, 8],
+            "ttft": 0.10,
+            "ttfn": None,
+            "total_time": 0.20,
+        }
+
+        mock_proc = MagicMock()
+        mock_proc.stdout = json.dumps(llama_output)
+
+        encoded = MagicMock()
+        encoded.__getitem__ = MagicMock(side_effect=lambda k: torch.tensor([[1, 2, 3]]) if k == "input_ids" else None)
+        mock_tokenizer = MagicMock()
+        mock_tokenizer.return_value = encoded
+        mock_tokenizer.get_vocab = MagicMock(return_value={})
+
+        with (
+            patch.object(OnnxDiscrepancyCheck, "_get_llama_env_python", return_value="/mock/llama_env/bin/python"),
+            patch("subprocess.run", return_value=mock_proc),
+            patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer),
+            patch("transformers.AutoConfig.from_pretrained", return_value=self._make_hf_config()),
+            patch("numpy.savez"),
+        ):
+            pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck)
+            result = pass_instance.compare_llama_cpp(config, mock_ref_model)
+
+        assert result["llama_cpp_speedup_vs_pytorch"] is None
+        assert result["llama_cpp_speedup_vs_onnx"] is None
+        assert result["llama_cpp_first_token_id"] == 7
+        assert result["llama_cpp_first_token_matches_pytorch"] is True

From 6dffbabc0d3b04d69998dae25a6ec6fa9993394c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 29 Jun 2026 13:51:42 +0000
Subject: [PATCH 17/80] refactor: use save_pretrained for llama-cpp GGUF
 conversion and install from pre-built wheels

---
 .github/workflows/test-model-fast.yml  |   2 +-
 olive/passes/onnx/discrepancy_check.py | 200 ++++++++++++-------------
 2 files changed, 98 insertions(+), 104 deletions(-)

diff --git a/.github/workflows/test-model-fast.yml b/.github/workflows/test-model-fast.yml
index 2041d15ab..8b5d92a6d 100644
--- a/.github/workflows/test-model-fast.yml
+++ b/.github/workflows/test-model-fast.yml
@@ -34,7 +34,7 @@ jobs:
         run: |
           python -m venv llama_env
           llama_env/bin/pip install --upgrade pip
-          CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_CUDA=OFF" llama_env/bin/pip install llama-cpp-python
+          llama_env/bin/pip install gguf safetensors llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
 
       - name: pip freeze
         run: |
diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 38bdaa7ad..28103a765 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -117,12 +117,31 @@ def _format_seconds(value: Optional[float]) -> str:
     return "n/a" if value is None else f"{value:.4f}s"
 
 
-def _build_llama_gguf_tensor_map(num_layers: int) -> dict:
-    """Build a tensor name mapping from HuggingFace LLaMA names to GGUF names.
+# ---------------------------------------------------------------------------
+# Helper script executed inside the ``llama_env`` virtual environment.
+# All llama-cpp-python / gguf imports are intentionally isolated to this
+# subprocess so the main Olive process does not require those packages.
+# ---------------------------------------------------------------------------
+_LLAMA_CPP_HELPER_SCRIPT = '''\
+"""GGUF conversion and llama.cpp inference helper for OnnxDiscrepancyCheck.
 
-    Returns a dict where each key is a HuggingFace state-dict tensor name and the
-    corresponding value is the GGUF tensor name expected by llama.cpp.
-    """
+This script runs inside the llama_env virtual environment via subprocess.
+It converts a HuggingFace model directory to GGUF format using gguf.GGUFWriter,
+then measures first-token latency using llama-cpp-python.  Results are written
+as a JSON object to stdout.
+"""
+import argparse
+import glob
+import json
+import os
+import sys
+import time
+
+import numpy as np
+
+
+def _build_llama_tensor_map(num_layers):
+    """Map HuggingFace LLaMA tensor names to GGUF tensor names."""
     mapping = {
         "model.embed_tokens.weight": "token_embd.weight",
         "model.norm.weight": "output_norm.weight",
@@ -146,62 +165,76 @@ def _build_llama_gguf_tensor_map(num_layers: int) -> dict:
     return mapping
 
 
-# ---------------------------------------------------------------------------
-# Helper script executed inside the ``llama_env`` virtual environment.
-# All llama-cpp-python imports are intentionally isolated to this subprocess
-# so the main Olive process does not require llama-cpp-python.
-# ---------------------------------------------------------------------------
-_LLAMA_CPP_HELPER_SCRIPT = '''\
-"""GGUF conversion and llama.cpp inference helper for OnnxDiscrepancyCheck.
+def _load_hf_weights(model_dir):
+    """Load model weights from a HuggingFace model directory.
 
-This script runs inside the llama_env virtual environment via subprocess.
-It converts exported model weights to GGUF format, then measures first-token
-latency using llama-cpp-python.  Results are written as a JSON object to stdout.
-"""
-import argparse
-import json
-import sys
-import time
+    Prefers safetensors format; falls back to PyTorch bin files if safetensors
+    is unavailable.
+    """
+    safetensors_files = sorted(glob.glob(os.path.join(model_dir, "*.safetensors")))
+    if safetensors_files:
+        from safetensors import safe_open
 
-import numpy as np
+        weights = {}
+        for sf_path in safetensors_files:
+            with safe_open(sf_path, framework="numpy", device="cpu") as f:
+                for key in f.keys():
+                    weights[key] = f.get_tensor(key).astype(np.float32)
+        return weights
+
+    raise FileNotFoundError(
+        f"No safetensors weight files found in {model_dir}. "
+        "Ensure the model was saved with safe_serialization=True."
+    )
 
 
-def convert_to_gguf(weights_npz: str, config_json: str, gguf_path: str) -> None:
-    """Create a GGUF F32 file from exported NumPy weights and a config dict."""
+def convert_to_gguf(model_dir, gguf_path):
+    """Create a GGUF F32 file from a HuggingFace model directory."""
     import gguf
 
-    with open(config_json) as fh:
+    with open(os.path.join(model_dir, "config.json")) as fh:
         cfg = json.load(fh)
 
-    weights = np.load(weights_npz, allow_pickle=False)
+    num_layers = cfg["num_hidden_layers"]
+    weights = _load_hf_weights(model_dir)
+    tensor_map = _build_llama_tensor_map(num_layers)
 
     writer = gguf.GGUFWriter(gguf_path, arch="llama")
-
-    # Required architecture metadata
     writer.add_name("olive-discrepancy-check")
     writer.add_context_length(cfg["max_position_embeddings"])
     writer.add_embedding_length(cfg["hidden_size"])
-    writer.add_block_count(cfg["num_hidden_layers"])
+    writer.add_block_count(num_layers)
     writer.add_feed_forward_length(cfg["intermediate_size"])
     writer.add_rope_dimension_count(cfg["hidden_size"] // cfg["num_attention_heads"])
     writer.add_head_count(cfg["num_attention_heads"])
     writer.add_head_count_kv(cfg.get("num_key_value_heads", cfg["num_attention_heads"]))
     writer.add_layer_norm_rms_eps(cfg.get("rms_norm_eps", 1e-5))
     writer.add_vocab_size(cfg["vocab_size"])
-    writer.add_file_type(0)  # 0 = ALL_F32
+    writer.add_file_type(0)  # ALL_F32
 
-    # Optional tokenizer metadata embedded in the config
-    if "tokenizer_tokens" in cfg:
+    # Add tokenizer metadata from the saved tokenizer.json if present
+    tokenizer_json_path = os.path.join(model_dir, "tokenizer.json")
+    if os.path.exists(tokenizer_json_path):
         try:
-            writer.add_tokenizer_model("llama")
-            writer.add_token_list(cfg["tokenizer_tokens"])
-            writer.add_token_scores(np.array(cfg["tokenizer_scores"], dtype=np.float32))
-            writer.add_token_types(np.array(cfg["tokenizer_types"], dtype=np.int32))
+            with open(tokenizer_json_path) as f:
+                tokenizer_data = json.load(f)
+            vocab = tokenizer_data.get("model", {}).get("vocab", {})
+            if vocab:
+                vocab_size = cfg["vocab_size"]
+                tokens_list = [""] * vocab_size
+                for tok, idx in vocab.items():
+                    if 0 <= idx < vocab_size:
+                        tokens_list[idx] = tok
+                writer.add_tokenizer_model("llama")
+                writer.add_token_list(tokens_list)
+                writer.add_token_scores(np.zeros(vocab_size, dtype=np.float32))
+                writer.add_token_types(np.ones(vocab_size, dtype=np.int32))
         except Exception as exc:  # noqa: BLE001
             print(f"Warning: could not write tokenizer metadata: {exc}", file=sys.stderr)
 
-    for name in weights.files:
-        writer.add_tensor(name, weights[name])
+    for hf_name, gguf_name in tensor_map.items():
+        if hf_name in weights:
+            writer.add_tensor(gguf_name, weights[hf_name])
 
     writer.write_header_to_file()
     writer.write_kv_data_to_file()
@@ -209,7 +242,7 @@ def convert_to_gguf(weights_npz: str, config_json: str, gguf_path: str) -> None:
     writer.close()
 
 
-def run_inference(gguf_path: str, prompt_tokens: list, max_new_tokens: int, first_n: int) -> dict:
+def run_inference(gguf_path, prompt_tokens, max_new_tokens, first_n):
     """Run greedy generation with llama.cpp and return first-token latency metrics."""
     from llama_cpp import Llama
 
@@ -246,8 +279,7 @@ def run_inference(gguf_path: str, prompt_tokens: list, max_new_tokens: int, firs
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="GGUF conversion and llama.cpp inference")
-    parser.add_argument("--weights_npz", required=True)
-    parser.add_argument("--config_json", required=True)
+    parser.add_argument("--model_dir", required=True, help="HuggingFace model directory")
     parser.add_argument("--gguf_path", required=True)
     parser.add_argument("--prompt_tokens", required=True, help="JSON-encoded list of token IDs")
     parser.add_argument("--max_new_tokens", type=int, default=32)
@@ -255,7 +287,7 @@ def run_inference(gguf_path: str, prompt_tokens: list, max_new_tokens: int, firs
     args = parser.parse_args()
 
     prompt_tokens = json.loads(args.prompt_tokens)
-    convert_to_gguf(args.weights_npz, args.config_json, args.gguf_path)
+    convert_to_gguf(args.model_dir, args.gguf_path)
     result = run_inference(args.gguf_path, prompt_tokens, args.max_new_tokens, args.first_n)
     print(json.dumps(result))
 '''
@@ -392,7 +424,9 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
                     "Path to the virtual environment where llama-cpp-python is installed. "
                     "Defaults to 'llama_env' relative to the current working directory when "
                     "``llama_cpp`` is True. Create this environment with: "
-                    "``python -m venv llama_env && llama_env/bin/pip install llama-cpp-python``."
+                    "``python -m venv llama_env && llama_env/bin/pip install gguf safetensors "
+                    "llama-cpp-python --extra-index-url "
+                    "https://abetlen.github.io/llama-cpp-python/whl/cpu``."
                 ),
             ),
         }
@@ -750,8 +784,12 @@ def __call__(self, generated_ids, scores, **kwargs) -> bool:
                 torch.cuda.synchronize()
             transformers_elapsed = time.perf_counter() - start
         if max_new_tokens > 0:
-            transformers_ttft = transformers_latency["ttft"] if transformers_latency["ttft"] is not None else transformers_elapsed
-            transformers_ttfn = transformers_latency["ttfn"] if transformers_latency["ttfn"] is not None else transformers_elapsed
+            transformers_ttft = (
+                transformers_latency["ttft"] if transformers_latency["ttft"] is not None else transformers_elapsed
+            )
+            transformers_ttfn = (
+                transformers_latency["ttfn"] if transformers_latency["ttfn"] is not None else transformers_elapsed
+            )
         else:
             transformers_ttft = None
             transformers_ttfn = None
@@ -820,7 +858,8 @@ def _get_llama_env_python(env_path: str) -> str:
         raise RuntimeError(
             f"Could not find a Python interpreter in the llama_env at '{env_path}'. "
             "Create the environment with: "
-            "python -m venv llama_env && llama_env/bin/pip install llama-cpp-python"
+            "python -m venv llama_env && llama_env/bin/pip install gguf safetensors "
+            "llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu"
         )
 
     def compare_llama_cpp(
@@ -838,16 +877,15 @@ def compare_llama_cpp(
 
         The method:
 
-        1. Exports the reference model weights (using GGUF tensor names) and the
-           model config to a temporary directory.
-        2. Runs ``_LLAMA_CPP_HELPER_SCRIPT`` inside ``llama_env`` which creates the
-           GGUF file and measures first-token latency.
+        1. Saves the reference model and tokenizer to a temporary directory using
+           ``save_pretrained`` (standard HuggingFace format).
+        2. Runs ``_LLAMA_CPP_HELPER_SCRIPT`` inside ``llama_env`` which converts the
+           saved model directory to a GGUF file and measures first-token latency.
         3. Returns a metrics dict with the llama.cpp results and speedup ratios
            relative to PyTorch and ONNX when those latencies are provided.
         """
-        import numpy as np
         import torch
-        from transformers import AutoConfig, AutoTokenizer
+        from transformers import AutoTokenizer
 
         # Resolve the llama_env Python interpreter
         env_path = config.llama_cpp_env_path or "llama_env"
@@ -864,73 +902,29 @@ def compare_llama_cpp(
             gen_out = ref_model.generate(input_ids, max_new_tokens=1, do_sample=False)
         pytorch_first_token_id = int(gen_out[0, -1].item())
 
-        # Export model weights with GGUF-compatible tensor names
-        hf_config = AutoConfig.from_pretrained(config.reference_model_path)
-        num_layers = hf_config.num_hidden_layers
-        tensor_map = _build_llama_gguf_tensor_map(num_layers)
-
-        state_dict = ref_model.state_dict()
-        mapped_weights: dict[str, np.ndarray] = {}
-        for hf_name, gguf_name in tensor_map.items():
-            if hf_name in state_dict:
-                mapped_weights[gguf_name] = state_dict[hf_name].float().cpu().numpy()
-
-        # Export tokenizer vocabulary so llama.cpp can load the model without errors
-        vocab_size = hf_config.vocab_size
-        tokenizer_tokens: Optional[list[str]] = None
-        tokenizer_scores: Optional[list[float]] = None
-        tokenizer_types: Optional[list[int]] = None
-        try:
-            vocab = tokenizer.get_vocab()
-            tokens_list: list[str] = [""] * vocab_size
-            for tok, idx in vocab.items():
-                if idx < vocab_size:
-                    tokens_list[idx] = tok
-            tokenizer_tokens = tokens_list
-            tokenizer_scores = [0.0] * vocab_size
-            tokenizer_types = [1] * vocab_size  # 1 = NORMAL token type
-        except Exception:
-            logger.debug("OnnxDiscrepancyCheck: could not export tokenizer vocabulary for GGUF.")
-
-        # Build the config dict for the helper script
-        gguf_cfg: dict = {
-            "max_position_embeddings": hf_config.max_position_embeddings,
-            "hidden_size": hf_config.hidden_size,
-            "num_hidden_layers": num_layers,
-            "intermediate_size": hf_config.intermediate_size,
-            "num_attention_heads": hf_config.num_attention_heads,
-            "num_key_value_heads": getattr(hf_config, "num_key_value_heads", hf_config.num_attention_heads),
-            "rms_norm_eps": getattr(hf_config, "rms_norm_eps", 1e-5),
-            "vocab_size": vocab_size,
-        }
-        if tokenizer_tokens is not None:
-            gguf_cfg["tokenizer_tokens"] = tokenizer_tokens
-            gguf_cfg["tokenizer_scores"] = tokenizer_scores
-            gguf_cfg["tokenizer_types"] = tokenizer_types
-
         max_new_tokens = config.generate_max_new_tokens
         first_n = max(1, min(config.time_to_first_n_tokens, max_new_tokens)) if max_new_tokens > 0 else 1
 
         # Write temp files and invoke the helper script inside llama_env
         with tempfile.TemporaryDirectory() as tmpdir:
             tmpdir_path = Path(tmpdir)
-            weights_npz = str(tmpdir_path / "weights.npz")
-            config_json = str(tmpdir_path / "config.json")
+            model_dir = str(tmpdir_path / "hf_model")
             gguf_path = str(tmpdir_path / "model.gguf")
             script_path = str(tmpdir_path / "llama_cpp_helper.py")
 
-            np.savez(weights_npz, **mapped_weights)
-            (tmpdir_path / "config.json").write_text(json.dumps(gguf_cfg))
+            # Save model and tokenizer in standard HuggingFace format so that the
+            # helper script can convert them to GGUF without custom weight mapping.
+            ref_model.save_pretrained(model_dir, safe_serialization=True)
+            tokenizer.save_pretrained(model_dir)
+
             (tmpdir_path / "llama_cpp_helper.py").write_text(_LLAMA_CPP_HELPER_SCRIPT)
 
             proc = subprocess.run(
                 [
                     python_path,
                     script_path,
-                    "--weights_npz",
-                    weights_npz,
-                    "--config_json",
-                    config_json,
+                    "--model_dir",
+                    model_dir,
                     "--gguf_path",
                     gguf_path,
                     "--prompt_tokens",

From aeef3dfbe952ed04999ad772b6800e28adb97e30 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 29 Jun 2026 14:24:11 +0000
Subject: [PATCH 18/80] Use convert_hf_to_gguf.py CLI for GGUF conversion
 instead of custom function

---
 .github/workflows/test-model-fast.yml  |   2 +
 olive/passes/onnx/discrepancy_check.py | 187 ++++++++-----------------
 2 files changed, 59 insertions(+), 130 deletions(-)

diff --git a/.github/workflows/test-model-fast.yml b/.github/workflows/test-model-fast.yml
index 8b5d92a6d..c31338d8e 100644
--- a/.github/workflows/test-model-fast.yml
+++ b/.github/workflows/test-model-fast.yml
@@ -35,6 +35,8 @@ jobs:
           python -m venv llama_env
           llama_env/bin/pip install --upgrade pip
           llama_env/bin/pip install gguf safetensors llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+          llama_env/bin/pip install transformers sentencepiece protobuf
+          curl -fsSL "https://raw.githubusercontent.com/ggerganov/llama.cpp/master/convert_hf_to_gguf.py" -o llama_env/convert_hf_to_gguf.py
 
       - name: pip freeze
         run: |
diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 28103a765..e83935236 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -123,124 +123,19 @@ def _format_seconds(value: Optional[float]) -> str:
 # subprocess so the main Olive process does not require those packages.
 # ---------------------------------------------------------------------------
 _LLAMA_CPP_HELPER_SCRIPT = '''\
-"""GGUF conversion and llama.cpp inference helper for OnnxDiscrepancyCheck.
+"""llama.cpp inference helper for OnnxDiscrepancyCheck.
 
 This script runs inside the llama_env virtual environment via subprocess.
-It converts a HuggingFace model directory to GGUF format using gguf.GGUFWriter,
-then measures first-token latency using llama-cpp-python.  Results are written
-as a JSON object to stdout.
+It measures first-token latency using llama-cpp-python on a pre-converted GGUF file.
+Results are written as a JSON object to stdout.
+
+GGUF conversion is done separately via the convert_hf_to_gguf.py CLI from llama.cpp
+before this script is invoked.
 """
 import argparse
-import glob
 import json
-import os
-import sys
 import time
 
-import numpy as np
-
-
-def _build_llama_tensor_map(num_layers):
-    """Map HuggingFace LLaMA tensor names to GGUF tensor names."""
-    mapping = {
-        "model.embed_tokens.weight": "token_embd.weight",
-        "model.norm.weight": "output_norm.weight",
-        "lm_head.weight": "output.weight",
-    }
-    for i in range(num_layers):
-        p = f"model.layers.{i}"
-        mapping.update(
-            {
-                f"{p}.input_layernorm.weight": f"blk.{i}.attn_norm.weight",
-                f"{p}.post_attention_layernorm.weight": f"blk.{i}.ffn_norm.weight",
-                f"{p}.self_attn.q_proj.weight": f"blk.{i}.attn_q.weight",
-                f"{p}.self_attn.k_proj.weight": f"blk.{i}.attn_k.weight",
-                f"{p}.self_attn.v_proj.weight": f"blk.{i}.attn_v.weight",
-                f"{p}.self_attn.o_proj.weight": f"blk.{i}.attn_output.weight",
-                f"{p}.mlp.gate_proj.weight": f"blk.{i}.ffn_gate.weight",
-                f"{p}.mlp.up_proj.weight": f"blk.{i}.ffn_up.weight",
-                f"{p}.mlp.down_proj.weight": f"blk.{i}.ffn_down.weight",
-            }
-        )
-    return mapping
-
-
-def _load_hf_weights(model_dir):
-    """Load model weights from a HuggingFace model directory.
-
-    Prefers safetensors format; falls back to PyTorch bin files if safetensors
-    is unavailable.
-    """
-    safetensors_files = sorted(glob.glob(os.path.join(model_dir, "*.safetensors")))
-    if safetensors_files:
-        from safetensors import safe_open
-
-        weights = {}
-        for sf_path in safetensors_files:
-            with safe_open(sf_path, framework="numpy", device="cpu") as f:
-                for key in f.keys():
-                    weights[key] = f.get_tensor(key).astype(np.float32)
-        return weights
-
-    raise FileNotFoundError(
-        f"No safetensors weight files found in {model_dir}. "
-        "Ensure the model was saved with safe_serialization=True."
-    )
-
-
-def convert_to_gguf(model_dir, gguf_path):
-    """Create a GGUF F32 file from a HuggingFace model directory."""
-    import gguf
-
-    with open(os.path.join(model_dir, "config.json")) as fh:
-        cfg = json.load(fh)
-
-    num_layers = cfg["num_hidden_layers"]
-    weights = _load_hf_weights(model_dir)
-    tensor_map = _build_llama_tensor_map(num_layers)
-
-    writer = gguf.GGUFWriter(gguf_path, arch="llama")
-    writer.add_name("olive-discrepancy-check")
-    writer.add_context_length(cfg["max_position_embeddings"])
-    writer.add_embedding_length(cfg["hidden_size"])
-    writer.add_block_count(num_layers)
-    writer.add_feed_forward_length(cfg["intermediate_size"])
-    writer.add_rope_dimension_count(cfg["hidden_size"] // cfg["num_attention_heads"])
-    writer.add_head_count(cfg["num_attention_heads"])
-    writer.add_head_count_kv(cfg.get("num_key_value_heads", cfg["num_attention_heads"]))
-    writer.add_layer_norm_rms_eps(cfg.get("rms_norm_eps", 1e-5))
-    writer.add_vocab_size(cfg["vocab_size"])
-    writer.add_file_type(0)  # ALL_F32
-
-    # Add tokenizer metadata from the saved tokenizer.json if present
-    tokenizer_json_path = os.path.join(model_dir, "tokenizer.json")
-    if os.path.exists(tokenizer_json_path):
-        try:
-            with open(tokenizer_json_path) as f:
-                tokenizer_data = json.load(f)
-            vocab = tokenizer_data.get("model", {}).get("vocab", {})
-            if vocab:
-                vocab_size = cfg["vocab_size"]
-                tokens_list = [""] * vocab_size
-                for tok, idx in vocab.items():
-                    if 0 <= idx < vocab_size:
-                        tokens_list[idx] = tok
-                writer.add_tokenizer_model("llama")
-                writer.add_token_list(tokens_list)
-                writer.add_token_scores(np.zeros(vocab_size, dtype=np.float32))
-                writer.add_token_types(np.ones(vocab_size, dtype=np.int32))
-        except Exception as exc:  # noqa: BLE001
-            print(f"Warning: could not write tokenizer metadata: {exc}", file=sys.stderr)
-
-    for hf_name, gguf_name in tensor_map.items():
-        if hf_name in weights:
-            writer.add_tensor(gguf_name, weights[hf_name])
-
-    writer.write_header_to_file()
-    writer.write_kv_data_to_file()
-    writer.write_tensors_to_file()
-    writer.close()
-
 
 def run_inference(gguf_path, prompt_tokens, max_new_tokens, first_n):
     """Run greedy generation with llama.cpp and return first-token latency metrics."""
@@ -278,8 +173,7 @@ def run_inference(gguf_path, prompt_tokens, max_new_tokens, first_n):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="GGUF conversion and llama.cpp inference")
-    parser.add_argument("--model_dir", required=True, help="HuggingFace model directory")
+    parser = argparse.ArgumentParser(description="llama.cpp inference helper")
     parser.add_argument("--gguf_path", required=True)
     parser.add_argument("--prompt_tokens", required=True, help="JSON-encoded list of token IDs")
     parser.add_argument("--max_new_tokens", type=int, default=32)
@@ -287,7 +181,6 @@ def run_inference(gguf_path, prompt_tokens, max_new_tokens, first_n):
     args = parser.parse_args()
 
     prompt_tokens = json.loads(args.prompt_tokens)
-    convert_to_gguf(args.model_dir, args.gguf_path)
     result = run_inference(args.gguf_path, prompt_tokens, args.max_new_tokens, args.first_n)
     print(json.dumps(result))
 '''
@@ -411,22 +304,28 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
                 type_=bool,
                 default_value=False,
                 description=(
-                    "When True, convert the reference HuggingFace model to GGUF format and compare "
-                    "inference with llama.cpp. Measures first-token difference between llama.cpp and "
-                    "the reference PyTorch model as well as latency and speedup. All llama-cpp-python "
-                    "operations are executed in the ``llama_env`` virtual environment via subprocess."
+                    "When True, convert the reference HuggingFace model to GGUF format using "
+                    "``convert_hf_to_gguf.py`` from llama.cpp and compare inference with llama.cpp. "
+                    "Measures first-token difference between llama.cpp and the reference PyTorch model "
+                    "as well as latency and speedup. All llama-cpp-python operations are executed in "
+                    "the ``llama_env`` virtual environment via subprocess."
                 ),
             ),
             "llama_cpp_env_path": PassConfigParam(
                 type_=Optional[str],
                 default_value=None,
                 description=(
-                    "Path to the virtual environment where llama-cpp-python is installed. "
+                    "Path to the virtual environment where llama-cpp-python and "
+                    "``convert_hf_to_gguf.py`` are installed. "
                     "Defaults to 'llama_env' relative to the current working directory when "
-                    "``llama_cpp`` is True. Create this environment with: "
+                    "``llama_cpp`` is True. Create this environment and download the conversion "
+                    "script with: "
                     "``python -m venv llama_env && llama_env/bin/pip install gguf safetensors "
+                    "transformers sentencepiece protobuf "
                     "llama-cpp-python --extra-index-url "
-                    "https://abetlen.github.io/llama-cpp-python/whl/cpu``."
+                    "https://abetlen.github.io/llama-cpp-python/whl/cpu && "
+                    "curl -fsSL https://raw.githubusercontent.com/ggerganov/llama.cpp/master/convert_hf_to_gguf.py "
+                    "-o llama_env/convert_hf_to_gguf.py``."
                 ),
             ),
         }
@@ -862,6 +761,26 @@ def _get_llama_env_python(env_path: str) -> str:
             "llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu"
         )
 
+    @staticmethod
+    def _get_convert_script(env_path: str) -> str:
+        r"""Return the path to the ``convert_hf_to_gguf.py`` conversion script.
+
+        The script is expected to be placed at the root of the virtual environment
+        directory (i.e. ``{env_path}/convert_hf_to_gguf.py``).  Download it with::
+
+            curl -fsSL https://raw.githubusercontent.com/ggerganov/llama.cpp/master/convert_hf_to_gguf.py \
+                -o {env_path}/convert_hf_to_gguf.py
+        """
+        script = Path(env_path) / "convert_hf_to_gguf.py"
+        if script.exists():
+            return str(script)
+        raise RuntimeError(
+            f"Could not find convert_hf_to_gguf.py in '{env_path}'. "
+            "Download it from the llama.cpp repository: "
+            "curl -fsSL https://raw.githubusercontent.com/ggerganov/llama.cpp/master/convert_hf_to_gguf.py "
+            f"-o {env_path}/convert_hf_to_gguf.py"
+        )
+
     def compare_llama_cpp(
         self,
         config: type[BasePassConfig],
@@ -879,17 +798,20 @@ def compare_llama_cpp(
 
         1. Saves the reference model and tokenizer to a temporary directory using
            ``save_pretrained`` (standard HuggingFace format).
-        2. Runs ``_LLAMA_CPP_HELPER_SCRIPT`` inside ``llama_env`` which converts the
-           saved model directory to a GGUF file and measures first-token latency.
-        3. Returns a metrics dict with the llama.cpp results and speedup ratios
+        2. Calls ``convert_hf_to_gguf.py`` from llama.cpp via the command line to
+           convert the saved directory to a GGUF F32 file.
+        3. Runs ``_LLAMA_CPP_HELPER_SCRIPT`` inside ``llama_env`` to measure
+           first-token latency with llama-cpp-python on the converted GGUF file.
+        4. Returns a metrics dict with the llama.cpp results and speedup ratios
            relative to PyTorch and ONNX when those latencies are provided.
         """
         import torch
         from transformers import AutoTokenizer
 
-        # Resolve the llama_env Python interpreter
+        # Resolve the llama_env Python interpreter and conversion script
         env_path = config.llama_cpp_env_path or "llama_env"
         python_path = self._get_llama_env_python(env_path)
+        convert_script = self._get_convert_script(env_path)
 
         # Tokenize the generation prompt using the main-env tokenizer
         tokenizer = AutoTokenizer.from_pretrained(config.reference_model_path)
@@ -905,26 +827,31 @@ def compare_llama_cpp(
         max_new_tokens = config.generate_max_new_tokens
         first_n = max(1, min(config.time_to_first_n_tokens, max_new_tokens)) if max_new_tokens > 0 else 1
 
-        # Write temp files and invoke the helper script inside llama_env
         with tempfile.TemporaryDirectory() as tmpdir:
             tmpdir_path = Path(tmpdir)
             model_dir = str(tmpdir_path / "hf_model")
             gguf_path = str(tmpdir_path / "model.gguf")
             script_path = str(tmpdir_path / "llama_cpp_helper.py")
 
-            # Save model and tokenizer in standard HuggingFace format so that the
-            # helper script can convert them to GGUF without custom weight mapping.
+            # Save model and tokenizer in standard HuggingFace format.
             ref_model.save_pretrained(model_dir, safe_serialization=True)
             tokenizer.save_pretrained(model_dir)
 
+            # Step 1: Convert to GGUF using the official convert_hf_to_gguf.py CLI.
+            subprocess.run(
+                [python_path, convert_script, model_dir, "--outfile", gguf_path, "--outtype", "f32"],
+                capture_output=True,
+                text=True,
+                check=True,
+            )
+
+            # Step 2: Run inference inside llama_env using the pre-converted GGUF file.
             (tmpdir_path / "llama_cpp_helper.py").write_text(_LLAMA_CPP_HELPER_SCRIPT)
 
             proc = subprocess.run(
                 [
                     python_path,
                     script_path,
-                    "--model_dir",
-                    model_dir,
                     "--gguf_path",
                     gguf_path,
                     "--prompt_tokens",

From f8192d3d087aa3fdb5000b4ea0395e971eee7daa Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 29 Jun 2026 14:36:01 +0000
Subject: [PATCH 19/80] Remove duplicate pytest import inside test function

---
 test/passes/onnx/test_discrepancy_check.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/passes/onnx/test_discrepancy_check.py b/test/passes/onnx/test_discrepancy_check.py
index 19e6e2136..92b4e9b6d 100644
--- a/test/passes/onnx/test_discrepancy_check.py
+++ b/test/passes/onnx/test_discrepancy_check.py
@@ -452,8 +452,6 @@ def test_get_llama_env_python_posix(self, tmp_path):
 
     def test_get_llama_env_python_missing_raises(self, tmp_path):
         """Test that a RuntimeError is raised when no interpreter is found."""
-        import pytest
-
         from olive.passes.onnx.discrepancy_check import OnnxDiscrepancyCheck
 
         with pytest.raises(RuntimeError, match="llama_env"):

From 78f77ed61fa98f2d697a2dcbb1434851e5598f85 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 29 Jun 2026 15:23:28 +0000
Subject: [PATCH 20/80] Add --test_llama_path CLI option for specifying
 llama_env virtual environment path

---
 olive/cli/base.py     | 40 +++++++++++++++++++++++++++++++++++-----
 test/cli/test_base.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/olive/cli/base.py b/olive/cli/base.py
index ea4496a21..d7f74d6b6 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -70,19 +70,27 @@ def mark_test_output_path(output_path: Optional[str]) -> None:
     _get_test_output_marker_path(output_path).write_text(json.dumps({"type": "olive_hf_test_output"}, indent=2))
 
 
-def warn_unused_test_metrics(test, metrics: Optional[list]) -> None:
-    """Warn when --test_metrics is provided without --test, since it has no effect."""
+def warn_unused_test_metrics(test, metrics: Optional[list], llama_path: Optional[str] = None) -> None:
+    """Warn when --test_metrics or --test_llama_path is provided without --test, since it has no effect."""
     if metrics and test in (None, False):
         logger.warning("--test_metrics is ignored because --test is not enabled.")
+    if llama_path and test in (None, False):
+        logger.warning("--test_llama_path is ignored because --test is not enabled.")
 
 
-def add_discrepancy_check_pass(run_config: dict, metrics: Optional[list] = None) -> dict:
+def add_discrepancy_check_pass(
+    run_config: dict, metrics: Optional[list] = None, llama_env_path: Optional[str] = None
+) -> dict:
     """Inject OnnxDiscrepancyCheck pass when --test is active and not already configured.
 
     ``metrics`` selects which test metrics to evaluate. Supported values are defined in
     ``TEST_METRICS`` (``"mae"`` for the max-absolute-error accuracy check and ``"speedup"`` for the
     ONNX-vs-PyTorch latency measurement). When ``None``, only ``"mae"`` is evaluated; pass
     ``["speedup"]`` or ``["mae", "speedup"]`` explicitly to enable timing.
+
+    ``llama_env_path`` is the path to the llama_env virtual environment used for llama.cpp inference.
+    When provided, the ``llama_cpp`` flag is enabled on the pass and the path is forwarded as
+    ``llama_cpp_env_path``.
     """
     passes = run_config.get("passes", {})
     # Skip if already configured
@@ -114,6 +122,10 @@ def add_discrepancy_check_pass(run_config: dict, metrics: Optional[list] = None)
     # Disable the latency/speedup measurement when the speedup metric is not requested.
     if "speedup" not in selected_metrics:
         pass_config["timing_iterations"] = 0
+    # Enable llama.cpp comparison when a venv path is provided.
+    if llama_env_path:
+        pass_config["llama_cpp"] = True
+        pass_config["llama_cpp_env_path"] = llama_env_path
 
     passes["discrepancy_check"] = pass_config
     run_config["passes"] = passes
@@ -159,13 +171,21 @@ def _run_workflow(self):
         from olive.workflows import run as olive_run
 
         validate_test_output_path(self.args.output_path, getattr(self.args, "test", None))
-        warn_unused_test_metrics(getattr(self.args, "test", None), getattr(self.args, "test_metrics", None))
+        warn_unused_test_metrics(
+            getattr(self.args, "test", None),
+            getattr(self.args, "test_metrics", None),
+            getattr(self.args, "test_llama_path", None),
+        )
         Path(self.args.output_path).mkdir(parents=True, exist_ok=True)
 
         with tempfile.TemporaryDirectory(prefix="olive-cli-tmp-", dir=self.args.output_path) as tempdir:
             run_config = self._get_run_config(tempdir)
             if getattr(self.args, "test", None) not in (None, False):
-                run_config = add_discrepancy_check_pass(run_config, getattr(self.args, "test_metrics", None))
+                run_config = add_discrepancy_check_pass(
+                    run_config,
+                    getattr(self.args, "test_metrics", None),
+                    getattr(self.args, "test_llama_path", None),
+                )
             if self.args.save_config_file or self.args.dry_run:
                 self._save_config_file(run_config)
             if self.args.dry_run:
@@ -541,6 +561,16 @@ def add_input_model_options(
                 "Defaults to 'mae'. Only used together with --test."
             ),
         )
+        model_group.add_argument(
+            "--test_llama_path",
+            type=str,
+            default=None,
+            help=(
+                "Path to the llama_env virtual environment used to run llama.cpp inference during a --test run. "
+                "When provided, the ONNX model is also compared against llama.cpp (GGUF format) and results "
+                "include first-token latency and speedup metrics. Only used together with --test."
+            ),
+        )
 
     if enable_hf_adapter:
         assert enable_hf, "enable_hf must be True when enable_hf_adapter is True."
diff --git a/test/cli/test_base.py b/test/cli/test_base.py
index d4ddffa89..d61f378e3 100644
--- a/test/cli/test_base.py
+++ b/test/cli/test_base.py
@@ -397,3 +397,33 @@ def test_warn_unused_test_metrics_silent_when_test_enabled():
         warn_unused_test_metrics(test=True, metrics=["speedup"])
 
     mock_logger.warning.assert_not_called()
+
+
+def test_warn_unused_test_metrics_logs_llama_path_when_test_disabled():
+    from olive.cli.base import warn_unused_test_metrics
+
+    with patch("olive.cli.base.logger") as mock_logger:
+        warn_unused_test_metrics(test=None, metrics=None, llama_path="/path/to/llama_env")
+
+    mock_logger.warning.assert_called_once()
+    assert "--test_llama_path is ignored" in mock_logger.warning.call_args[0][0]
+
+
+def test_add_discrepancy_check_pass_llama_env_path_sets_config():
+    from olive.cli.base import add_discrepancy_check_pass
+
+    run_config = add_discrepancy_check_pass(_discrepancy_run_config(), llama_env_path="/path/to/llama_env")
+
+    pass_config = run_config["passes"]["discrepancy_check"]
+    assert pass_config["llama_cpp"] is True
+    assert pass_config["llama_cpp_env_path"] == "/path/to/llama_env"
+
+
+def test_add_discrepancy_check_pass_no_llama_env_path_omits_llama_config():
+    from olive.cli.base import add_discrepancy_check_pass
+
+    run_config = add_discrepancy_check_pass(_discrepancy_run_config())
+
+    pass_config = run_config["passes"]["discrepancy_check"]
+    assert "llama_cpp" not in pass_config
+    assert "llama_cpp_env_path" not in pass_config

From d1a40a646bbbe7e4494818a01ee74edaa0a50476 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 29 Jun 2026 15:31:47 +0000
Subject: [PATCH 21/80] Support comma-separated values for --test_metrics (e.g.
 mae,speedup)

---
 olive/cli/base.py     | 42 ++++++++++++++++++++++++++++++++++++++----
 test/cli/test_base.py | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+), 4 deletions(-)

diff --git a/olive/cli/base.py b/olive/cli/base.py
index d7f74d6b6..b46b8f15b 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -26,6 +26,39 @@
 TEST_METRICS = ("mae", "speedup")
 
 
+def _parse_test_metrics(value: str) -> list:
+    """Parse a comma- or space-separated list of test metric names.
+
+    Accepts values like ``'mae'``, ``'mae,speedup'``, or ``'mae speedup'`` and
+    returns a flat list of validated metric names.  Raises ``argparse.ArgumentTypeError``
+    for any unrecognised name.
+    """
+    import argparse
+
+    names = [m.strip() for m in value.replace(",", " ").split() if m.strip()]
+    invalid = [n for n in names if n not in TEST_METRICS]
+    if invalid:
+        raise argparse.ArgumentTypeError(
+            f"invalid choice(s): {invalid!r} (choose from {list(TEST_METRICS)})"
+        )
+    return names
+
+
+def _flatten_test_metrics(raw) -> Optional[list]:
+    """Flatten the nested list produced by argparse when nargs="+" and type returns a list.
+
+    ``argparse`` calls the ``type`` function once per token, so
+    ``--test_metrics mae,speedup`` yields ``[["mae", "speedup"]]`` and
+    ``--test_metrics mae speedup`` yields ``[["mae"], ["speedup"]]``.
+    This function flattens both forms to ``["mae", "speedup"]``.
+    Returns ``None`` when ``raw`` is ``None`` or empty.
+    """
+    if not raw:
+        return None
+    flat = [item for sublist in raw for item in (sublist if isinstance(sublist, list) else [sublist])]
+    return flat or None
+
+
 def _get_test_output_marker_path(output_path: str) -> Path:
     return Path(output_path) / TEST_OUTPUT_MARKER_FILE
 
@@ -171,9 +204,10 @@ def _run_workflow(self):
         from olive.workflows import run as olive_run
 
         validate_test_output_path(self.args.output_path, getattr(self.args, "test", None))
+        test_metrics = _flatten_test_metrics(getattr(self.args, "test_metrics", None))
         warn_unused_test_metrics(
             getattr(self.args, "test", None),
-            getattr(self.args, "test_metrics", None),
+            test_metrics,
             getattr(self.args, "test_llama_path", None),
         )
         Path(self.args.output_path).mkdir(parents=True, exist_ok=True)
@@ -183,7 +217,7 @@ def _run_workflow(self):
             if getattr(self.args, "test", None) not in (None, False):
                 run_config = add_discrepancy_check_pass(
                     run_config,
-                    getattr(self.args, "test_metrics", None),
+                    test_metrics,
                     getattr(self.args, "test_llama_path", None),
                 )
             if self.args.save_config_file or self.args.dry_run:
@@ -552,12 +586,12 @@ def add_input_model_options(
         )
         model_group.add_argument(
             "--test_metrics",
-            type=str,
+            type=_parse_test_metrics,
             nargs="+",
-            choices=list(TEST_METRICS),
             help=(
                 "Metrics to evaluate during a --test run: 'mae' enforces the max absolute error between the "
                 "ONNX and reference model outputs, and 'speedup' measures ONNX-vs-PyTorch inference latency. "
+                "Accepts space- or comma-separated values (e.g. 'mae,speedup' or 'mae speedup'). "
                 "Defaults to 'mae'. Only used together with --test."
             ),
         )
diff --git a/test/cli/test_base.py b/test/cli/test_base.py
index d61f378e3..e7c5ee3ee 100644
--- a/test/cli/test_base.py
+++ b/test/cli/test_base.py
@@ -427,3 +427,44 @@ def test_add_discrepancy_check_pass_no_llama_env_path_omits_llama_config():
     pass_config = run_config["passes"]["discrepancy_check"]
     assert "llama_cpp" not in pass_config
     assert "llama_cpp_env_path" not in pass_config
+
+
+def test_parse_test_metrics_comma_separated():
+    from olive.cli.base import _parse_test_metrics
+
+    assert _parse_test_metrics("mae,speedup") == ["mae", "speedup"]
+
+
+def test_parse_test_metrics_single():
+    from olive.cli.base import _parse_test_metrics
+
+    assert _parse_test_metrics("mae") == ["mae"]
+
+
+def test_parse_test_metrics_invalid_raises():
+    import argparse
+
+    from olive.cli.base import _parse_test_metrics
+
+    with pytest.raises(argparse.ArgumentTypeError, match="invalid choice"):
+        _parse_test_metrics("unknown")
+
+
+def test_flatten_test_metrics_nested_lists():
+    from olive.cli.base import _flatten_test_metrics
+
+    # Simulates: --test_metrics mae,speedup  → [["mae", "speedup"]]
+    assert _flatten_test_metrics([["mae", "speedup"]]) == ["mae", "speedup"]
+
+
+def test_flatten_test_metrics_space_separated_tokens():
+    from olive.cli.base import _flatten_test_metrics
+
+    # Simulates: --test_metrics mae speedup  → [["mae"], ["speedup"]]
+    assert _flatten_test_metrics([["mae"], ["speedup"]]) == ["mae", "speedup"]
+
+
+def test_flatten_test_metrics_none_returns_none():
+    from olive.cli.base import _flatten_test_metrics
+
+    assert _flatten_test_metrics(None) is None

From 0bc6e0a502ca08ac4b6e6b6bc0119f5422150a37 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Mon, 29 Jun 2026 17:54:10 +0200
Subject: [PATCH 22/80] fix path

---
 .github/workflows/test-model-fast.yml | 2 +-
 olive/cli/run.py                      | 8 +++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test-model-fast.yml b/.github/workflows/test-model-fast.yml
index c31338d8e..1ac57d0ea 100644
--- a/.github/workflows/test-model-fast.yml
+++ b/.github/workflows/test-model-fast.yml
@@ -35,7 +35,7 @@ jobs:
           python -m venv llama_env
           llama_env/bin/pip install --upgrade pip
           llama_env/bin/pip install gguf safetensors llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
-          llama_env/bin/pip install transformers sentencepiece protobuf
+          llama_env/bin/pip install transformers sentencepiece protobuf tabulate
           curl -fsSL "https://raw.githubusercontent.com/ggerganov/llama.cpp/master/convert_hf_to_gguf.py" -o llama_env/convert_hf_to_gguf.py
 
       - name: pip freeze
diff --git a/olive/cli/run.py b/olive/cli/run.py
index 7d173f3c0..b488c2b8e 100644
--- a/olive/cli/run.py
+++ b/olive/cli/run.py
@@ -5,6 +5,7 @@
 from argparse import ArgumentParser
 
 from olive.cli.base import (
+    _flatten_test_metrics,
     BaseOliveCLICommand,
     add_discrepancy_check_pass,
     add_hf_test_model_config,
@@ -86,7 +87,12 @@ def run(self):
         validate_test_output_path(output_path, self.args.test)
         warn_unused_test_metrics(self.args.test, getattr(self.args, "test_metrics", None))
         if self.args.test not in (None, False):
-            run_config = add_discrepancy_check_pass(run_config, getattr(self.args, "test_metrics", None))
+            test_metrics = _flatten_test_metrics(getattr(self.args, "test_metrics", None))
+            run_config = add_discrepancy_check_pass(
+                run_config,
+                metrics=test_metrics,
+                llama_env_path=getattr(self.args, "test_llama_path", None),
+            )
         workflow_output = olive_run(
             run_config,
             list_required_packages=self.args.list_required_packages,

From 3ba9f1c825435bdd21c67b8aa85446ebea8edaae Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 29 Jun 2026 16:01:40 +0000
Subject: [PATCH 23/80] Store GGUF and HF model files in output_dir instead of
 temp directory

---
 olive/passes/onnx/discrepancy_check.py     | 81 +++++++++++-----------
 test/passes/onnx/test_discrepancy_check.py | 13 +++-
 2 files changed, 51 insertions(+), 43 deletions(-)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index e83935236..7353b3365 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -5,7 +5,6 @@
 import json
 import logging
 import subprocess
-import tempfile
 import time
 from pathlib import Path
 from typing import Optional
@@ -543,6 +542,7 @@ def _run_for_config(
             llama_results = self.compare_llama_cpp(
                 config,
                 ref_model,
+                output_dir=report_dir,
                 pytorch_latency_s=results.get("pytorch_latency_s"),
                 onnx_latency_s=results.get("onnx_latency_s"),
             )
@@ -785,6 +785,7 @@ def compare_llama_cpp(
         self,
         config: type[BasePassConfig],
         ref_model,
+        output_dir: str,
         pytorch_latency_s: Optional[float] = None,
         onnx_latency_s: Optional[float] = None,
     ) -> dict:
@@ -796,10 +797,10 @@ def compare_llama_cpp(
 
         The method:
 
-        1. Saves the reference model and tokenizer to a temporary directory using
+        1. Saves the reference model and tokenizer to ``output_dir/hf_model`` using
            ``save_pretrained`` (standard HuggingFace format).
         2. Calls ``convert_hf_to_gguf.py`` from llama.cpp via the command line to
-           convert the saved directory to a GGUF F32 file.
+           convert the saved directory to a GGUF F32 file at ``output_dir/model.gguf``.
         3. Runs ``_LLAMA_CPP_HELPER_SCRIPT`` inside ``llama_env`` to measure
            first-token latency with llama-cpp-python on the converted GGUF file.
         4. Returns a metrics dict with the llama.cpp results and speedup ratios
@@ -827,44 +828,44 @@ def compare_llama_cpp(
         max_new_tokens = config.generate_max_new_tokens
         first_n = max(1, min(config.time_to_first_n_tokens, max_new_tokens)) if max_new_tokens > 0 else 1
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            tmpdir_path = Path(tmpdir)
-            model_dir = str(tmpdir_path / "hf_model")
-            gguf_path = str(tmpdir_path / "model.gguf")
-            script_path = str(tmpdir_path / "llama_cpp_helper.py")
-
-            # Save model and tokenizer in standard HuggingFace format.
-            ref_model.save_pretrained(model_dir, safe_serialization=True)
-            tokenizer.save_pretrained(model_dir)
-
-            # Step 1: Convert to GGUF using the official convert_hf_to_gguf.py CLI.
-            subprocess.run(
-                [python_path, convert_script, model_dir, "--outfile", gguf_path, "--outtype", "f32"],
-                capture_output=True,
-                text=True,
-                check=True,
-            )
+        output_dir_path = Path(output_dir)
+        output_dir_path.mkdir(parents=True, exist_ok=True)
+        model_dir = str(output_dir_path / "hf_model")
+        gguf_path = str(output_dir_path / "model.gguf")
+        script_path = str(output_dir_path / "llama_cpp_helper.py")
+
+        # Save model and tokenizer in standard HuggingFace format.
+        ref_model.save_pretrained(model_dir, safe_serialization=True)
+        tokenizer.save_pretrained(model_dir)
+
+        # Step 1: Convert to GGUF using the official convert_hf_to_gguf.py CLI.
+        subprocess.run(
+            [python_path, convert_script, model_dir, "--outfile", gguf_path, "--outtype", "f32"],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
 
-            # Step 2: Run inference inside llama_env using the pre-converted GGUF file.
-            (tmpdir_path / "llama_cpp_helper.py").write_text(_LLAMA_CPP_HELPER_SCRIPT)
-
-            proc = subprocess.run(
-                [
-                    python_path,
-                    script_path,
-                    "--gguf_path",
-                    gguf_path,
-                    "--prompt_tokens",
-                    json.dumps(prompt_token_ids),
-                    "--max_new_tokens",
-                    str(max_new_tokens),
-                    "--first_n",
-                    str(first_n),
-                ],
-                capture_output=True,
-                text=True,
-                check=True,
-            )
+        # Step 2: Run inference inside llama_env using the pre-converted GGUF file.
+        (output_dir_path / "llama_cpp_helper.py").write_text(_LLAMA_CPP_HELPER_SCRIPT)
+
+        proc = subprocess.run(
+            [
+                python_path,
+                script_path,
+                "--gguf_path",
+                gguf_path,
+                "--prompt_tokens",
+                json.dumps(prompt_token_ids),
+                "--max_new_tokens",
+                str(max_new_tokens),
+                "--first_n",
+                str(first_n),
+            ],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
 
         llama_out: dict = json.loads(proc.stdout)
 
diff --git a/test/passes/onnx/test_discrepancy_check.py b/test/passes/onnx/test_discrepancy_check.py
index 92b4e9b6d..152525761 100644
--- a/test/passes/onnx/test_discrepancy_check.py
+++ b/test/passes/onnx/test_discrepancy_check.py
@@ -457,7 +457,7 @@ def test_get_llama_env_python_missing_raises(self, tmp_path):
         with pytest.raises(RuntimeError, match="llama_env"):
             OnnxDiscrepancyCheck._get_llama_env_python(str(tmp_path))
 
-    def test_compare_llama_cpp_returns_expected_metrics(self):
+    def test_compare_llama_cpp_returns_expected_metrics(self, tmp_path):
         """Test that compare_llama_cpp returns all expected keys and correct values."""
         import json
 
@@ -500,6 +500,9 @@ def test_compare_llama_cpp_returns_expected_metrics(self):
 
         with (
             patch.object(OnnxDiscrepancyCheck, "_get_llama_env_python", return_value="/mock/llama_env/bin/python"),
+            patch.object(
+                OnnxDiscrepancyCheck, "_get_convert_script", return_value="/mock/llama_env/convert_hf_to_gguf.py"
+            ),
             patch("subprocess.run", return_value=mock_proc),
             patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer),
             patch("transformers.AutoConfig.from_pretrained", return_value=self._make_hf_config()),
@@ -509,6 +512,7 @@ def test_compare_llama_cpp_returns_expected_metrics(self):
             result = pass_instance.compare_llama_cpp(
                 config,
                 mock_ref_model,
+                output_dir=str(tmp_path),
                 pytorch_latency_s=0.10,
                 onnx_latency_s=0.05,
             )
@@ -534,7 +538,7 @@ def test_compare_llama_cpp_returns_expected_metrics(self):
         # speedup = onnx_latency / llama_ttft = 0.05 / 0.05 = 1.0
         assert result["llama_cpp_speedup_vs_onnx"] == pytest.approx(1.0)
 
-    def test_compare_llama_cpp_no_latency_baselines(self):
+    def test_compare_llama_cpp_no_latency_baselines(self, tmp_path):
         """Speedup fields are None when pytorch/onnx latencies are not provided."""
         import json
 
@@ -568,13 +572,16 @@ def test_compare_llama_cpp_no_latency_baselines(self):
 
         with (
             patch.object(OnnxDiscrepancyCheck, "_get_llama_env_python", return_value="/mock/llama_env/bin/python"),
+            patch.object(
+                OnnxDiscrepancyCheck, "_get_convert_script", return_value="/mock/llama_env/convert_hf_to_gguf.py"
+            ),
             patch("subprocess.run", return_value=mock_proc),
             patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer),
             patch("transformers.AutoConfig.from_pretrained", return_value=self._make_hf_config()),
             patch("numpy.savez"),
         ):
             pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck)
-            result = pass_instance.compare_llama_cpp(config, mock_ref_model)
+            result = pass_instance.compare_llama_cpp(config, mock_ref_model, output_dir=str(tmp_path))
 
         assert result["llama_cpp_speedup_vs_pytorch"] is None
         assert result["llama_cpp_speedup_vs_onnx"] is None

From 568699be7d7a059bfcc707a3018fcf7d7bff51af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Mon, 29 Jun 2026 18:25:01 +0200
Subject: [PATCH 24/80] add missing dependencies

---
 .github/workflows/test-model-fast.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-model-fast.yml b/.github/workflows/test-model-fast.yml
index 1ac57d0ea..1a5fad7e2 100644
--- a/.github/workflows/test-model-fast.yml
+++ b/.github/workflows/test-model-fast.yml
@@ -35,7 +35,7 @@ jobs:
           python -m venv llama_env
           llama_env/bin/pip install --upgrade pip
           llama_env/bin/pip install gguf safetensors llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
-          llama_env/bin/pip install transformers sentencepiece protobuf tabulate
+          llama_env/bin/pip install transformers sentencepiece protobuf tabulate gguf
           curl -fsSL "https://raw.githubusercontent.com/ggerganov/llama.cpp/master/convert_hf_to_gguf.py" -o llama_env/convert_hf_to_gguf.py
 
       - name: pip freeze

From b9fa512cf6d9d3293d3f6935416f0348d0bc8bd0 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 29 Jun 2026 16:31:15 +0000
Subject: [PATCH 25/80] Fix convert_hf_to_gguf.py: clone conversion/ directory
 alongside script

---
 .github/workflows/test-model-fast.yml  |  5 ++-
 olive/passes/onnx/discrepancy_check.py | 52 ++++++++++++++++++--------
 2 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/test-model-fast.yml b/.github/workflows/test-model-fast.yml
index 1a5fad7e2..9f5e6c198 100644
--- a/.github/workflows/test-model-fast.yml
+++ b/.github/workflows/test-model-fast.yml
@@ -36,7 +36,10 @@ jobs:
           llama_env/bin/pip install --upgrade pip
           llama_env/bin/pip install gguf safetensors llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
           llama_env/bin/pip install transformers sentencepiece protobuf tabulate gguf
-          curl -fsSL "https://raw.githubusercontent.com/ggerganov/llama.cpp/master/convert_hf_to_gguf.py" -o llama_env/convert_hf_to_gguf.py
+          git clone --depth=1 --filter=blob:none --sparse https://github.com/ggerganov/llama.cpp.git /tmp/llama_cpp_repo
+          cd /tmp/llama_cpp_repo && git sparse-checkout set convert_hf_to_gguf.py conversion
+          cp /tmp/llama_cpp_repo/convert_hf_to_gguf.py llama_env/
+          cp -r /tmp/llama_cpp_repo/conversion llama_env/
 
       - name: pip freeze
         run: |
diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 7353b3365..fc5d6eaf2 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -317,14 +317,17 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
                     "Path to the virtual environment where llama-cpp-python and "
                     "``convert_hf_to_gguf.py`` are installed. "
                     "Defaults to 'llama_env' relative to the current working directory when "
-                    "``llama_cpp`` is True. Create this environment and download the conversion "
-                    "script with: "
+                    "``llama_cpp`` is True. Create this environment and obtain the conversion "
+                    "script and its dependencies with: "
                     "``python -m venv llama_env && llama_env/bin/pip install gguf safetensors "
                     "transformers sentencepiece protobuf "
                     "llama-cpp-python --extra-index-url "
                     "https://abetlen.github.io/llama-cpp-python/whl/cpu && "
-                    "curl -fsSL https://raw.githubusercontent.com/ggerganov/llama.cpp/master/convert_hf_to_gguf.py "
-                    "-o llama_env/convert_hf_to_gguf.py``."
+                    "git clone --depth=1 --filter=blob:none --sparse "
+                    "https://github.com/ggerganov/llama.cpp.git /tmp/llama_cpp_repo && "
+                    "cd /tmp/llama_cpp_repo && git sparse-checkout set convert_hf_to_gguf.py conversion && "
+                    "cp /tmp/llama_cpp_repo/convert_hf_to_gguf.py llama_env/ && "
+                    "cp -r /tmp/llama_cpp_repo/conversion llama_env/``."
                 ),
             ),
         }
@@ -765,21 +768,38 @@ def _get_llama_env_python(env_path: str) -> str:
     def _get_convert_script(env_path: str) -> str:
         r"""Return the path to the ``convert_hf_to_gguf.py`` conversion script.
 
-        The script is expected to be placed at the root of the virtual environment
-        directory (i.e. ``{env_path}/convert_hf_to_gguf.py``).  Download it with::
+        The script and the accompanying ``conversion/`` package must be placed at the root
+        of the virtual environment directory (i.e. ``{env_path}/convert_hf_to_gguf.py`` and
+        ``{env_path}/conversion/``).  Obtain them via a sparse clone::
 
-            curl -fsSL https://raw.githubusercontent.com/ggerganov/llama.cpp/master/convert_hf_to_gguf.py \
-                -o {env_path}/convert_hf_to_gguf.py
+            git clone --depth=1 --filter=blob:none --sparse \
+                https://github.com/ggerganov/llama.cpp.git /tmp/llama_cpp_repo
+            cd /tmp/llama_cpp_repo && git sparse-checkout set convert_hf_to_gguf.py conversion
+            cp /tmp/llama_cpp_repo/convert_hf_to_gguf.py {env_path}/
+            cp -r /tmp/llama_cpp_repo/conversion {env_path}/
         """
-        script = Path(env_path) / "convert_hf_to_gguf.py"
-        if script.exists():
-            return str(script)
-        raise RuntimeError(
-            f"Could not find convert_hf_to_gguf.py in '{env_path}'. "
-            "Download it from the llama.cpp repository: "
-            "curl -fsSL https://raw.githubusercontent.com/ggerganov/llama.cpp/master/convert_hf_to_gguf.py "
-            f"-o {env_path}/convert_hf_to_gguf.py"
+        env = Path(env_path)
+        script = env / "convert_hf_to_gguf.py"
+        conversion_pkg = env / "conversion"
+        setup_cmd = (
+            f"git clone --depth=1 --filter=blob:none --sparse "
+            f"https://github.com/ggerganov/llama.cpp.git /tmp/llama_cpp_repo && "
+            f"cd /tmp/llama_cpp_repo && git sparse-checkout set convert_hf_to_gguf.py conversion && "
+            f"cp /tmp/llama_cpp_repo/convert_hf_to_gguf.py {env_path}/ && "
+            f"cp -r /tmp/llama_cpp_repo/conversion {env_path}/"
         )
+        if not script.exists():
+            raise RuntimeError(
+                f"Could not find convert_hf_to_gguf.py in '{env_path}'. "
+                f"Clone it from the llama.cpp repository: {setup_cmd}"
+            )
+        if not conversion_pkg.exists():
+            raise RuntimeError(
+                f"Could not find the 'conversion' package in '{env_path}'. "
+                "convert_hf_to_gguf.py requires the 'conversion/' directory alongside it. "
+                f"Clone it from the llama.cpp repository: {setup_cmd}"
+            )
+        return str(script)
 
     def compare_llama_cpp(
         self,

From 55a6515d99b1abcf3a50a82b582a0bdc88c00ab2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Mon, 29 Jun 2026 18:45:25 +0200
Subject: [PATCH 26/80] add missing arguments

---
 .github/workflows/test-model-fast.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-model-fast.yml b/.github/workflows/test-model-fast.yml
index 9f5e6c198..0bc6a6d63 100644
--- a/.github/workflows/test-model-fast.yml
+++ b/.github/workflows/test-model-fast.yml
@@ -37,7 +37,7 @@ jobs:
           llama_env/bin/pip install gguf safetensors llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
           llama_env/bin/pip install transformers sentencepiece protobuf tabulate gguf
           git clone --depth=1 --filter=blob:none --sparse https://github.com/ggerganov/llama.cpp.git /tmp/llama_cpp_repo
-          cd /tmp/llama_cpp_repo && git sparse-checkout set convert_hf_to_gguf.py conversion
+          cd /tmp/llama_cpp_repo && git sparse-checkout set convert_hf_to_gguf.py conversion --skip-checks
           cp /tmp/llama_cpp_repo/convert_hf_to_gguf.py llama_env/
           cp -r /tmp/llama_cpp_repo/conversion llama_env/
 

From b084fc33cda9de8a6153cfff6910b95634ff7d69 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 29 Jun 2026 16:54:18 +0000
Subject: [PATCH 27/80] Fix CI step: use LLAMA_ENV var and git -C to avoid cd
 changing cwd

---
 .github/workflows/test-model-fast.yml  | 15 ++++++++-------
 olive/passes/onnx/discrepancy_check.py |  6 +++---
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/test-model-fast.yml b/.github/workflows/test-model-fast.yml
index 0bc6a6d63..c74af0238 100644
--- a/.github/workflows/test-model-fast.yml
+++ b/.github/workflows/test-model-fast.yml
@@ -32,14 +32,15 @@ jobs:
 
       - name: Create llama_env and install llama-cpp-python
         run: |
-          python -m venv llama_env
-          llama_env/bin/pip install --upgrade pip
-          llama_env/bin/pip install gguf safetensors llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
-          llama_env/bin/pip install transformers sentencepiece protobuf tabulate gguf
+          LLAMA_ENV="$(pwd)/llama_env"
+          python -m venv "$LLAMA_ENV"
+          "$LLAMA_ENV/bin/pip" install --upgrade pip
+          "$LLAMA_ENV/bin/pip" install gguf safetensors llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+          "$LLAMA_ENV/bin/pip" install transformers sentencepiece protobuf tabulate gguf
           git clone --depth=1 --filter=blob:none --sparse https://github.com/ggerganov/llama.cpp.git /tmp/llama_cpp_repo
-          cd /tmp/llama_cpp_repo && git sparse-checkout set convert_hf_to_gguf.py conversion --skip-checks
-          cp /tmp/llama_cpp_repo/convert_hf_to_gguf.py llama_env/
-          cp -r /tmp/llama_cpp_repo/conversion llama_env/
+          git -C /tmp/llama_cpp_repo sparse-checkout set convert_hf_to_gguf.py conversion
+          cp /tmp/llama_cpp_repo/convert_hf_to_gguf.py "$LLAMA_ENV/"
+          cp -r /tmp/llama_cpp_repo/conversion "$LLAMA_ENV/"
 
       - name: pip freeze
         run: |
diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index fc5d6eaf2..a7597dd0b 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -325,7 +325,7 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
                     "https://abetlen.github.io/llama-cpp-python/whl/cpu && "
                     "git clone --depth=1 --filter=blob:none --sparse "
                     "https://github.com/ggerganov/llama.cpp.git /tmp/llama_cpp_repo && "
-                    "cd /tmp/llama_cpp_repo && git sparse-checkout set convert_hf_to_gguf.py conversion && "
+                    "git -C /tmp/llama_cpp_repo sparse-checkout set convert_hf_to_gguf.py conversion && "
                     "cp /tmp/llama_cpp_repo/convert_hf_to_gguf.py llama_env/ && "
                     "cp -r /tmp/llama_cpp_repo/conversion llama_env/``."
                 ),
@@ -774,7 +774,7 @@ def _get_convert_script(env_path: str) -> str:
 
             git clone --depth=1 --filter=blob:none --sparse \
                 https://github.com/ggerganov/llama.cpp.git /tmp/llama_cpp_repo
-            cd /tmp/llama_cpp_repo && git sparse-checkout set convert_hf_to_gguf.py conversion
+            git -C /tmp/llama_cpp_repo sparse-checkout set convert_hf_to_gguf.py conversion
             cp /tmp/llama_cpp_repo/convert_hf_to_gguf.py {env_path}/
             cp -r /tmp/llama_cpp_repo/conversion {env_path}/
         """
@@ -784,7 +784,7 @@ def _get_convert_script(env_path: str) -> str:
         setup_cmd = (
             f"git clone --depth=1 --filter=blob:none --sparse "
             f"https://github.com/ggerganov/llama.cpp.git /tmp/llama_cpp_repo && "
-            f"cd /tmp/llama_cpp_repo && git sparse-checkout set convert_hf_to_gguf.py conversion && "
+            f"git -C /tmp/llama_cpp_repo sparse-checkout set convert_hf_to_gguf.py conversion && "
             f"cp /tmp/llama_cpp_repo/convert_hf_to_gguf.py {env_path}/ && "
             f"cp -r /tmp/llama_cpp_repo/conversion {env_path}/"
         )

From c08375f3a6e3599bcee59579fff470cc781c4b52 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Mon, 29 Jun 2026 18:59:46 +0200
Subject: [PATCH 28/80] add missing argument

---
 .github/workflows/test-model-fast.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-model-fast.yml b/.github/workflows/test-model-fast.yml
index c74af0238..fb03db2e1 100644
--- a/.github/workflows/test-model-fast.yml
+++ b/.github/workflows/test-model-fast.yml
@@ -38,7 +38,7 @@ jobs:
           "$LLAMA_ENV/bin/pip" install gguf safetensors llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
           "$LLAMA_ENV/bin/pip" install transformers sentencepiece protobuf tabulate gguf
           git clone --depth=1 --filter=blob:none --sparse https://github.com/ggerganov/llama.cpp.git /tmp/llama_cpp_repo
-          git -C /tmp/llama_cpp_repo sparse-checkout set convert_hf_to_gguf.py conversion
+          git -C /tmp/llama_cpp_repo sparse-checkout set convert_hf_to_gguf.py conversion --skip-checks
           cp /tmp/llama_cpp_repo/convert_hf_to_gguf.py "$LLAMA_ENV/"
           cp -r /tmp/llama_cpp_repo/conversion "$LLAMA_ENV/"
 

From 35c191d5c12a51484f3cd06179bee57a51857ae2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Mon, 29 Jun 2026 19:21:06 +0200
Subject: [PATCH 29/80] lint

---
 olive/cli/base.py | 4 +---
 olive/cli/run.py  | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/olive/cli/base.py b/olive/cli/base.py
index b46b8f15b..a701cac55 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -38,9 +38,7 @@ def _parse_test_metrics(value: str) -> list:
     names = [m.strip() for m in value.replace(",", " ").split() if m.strip()]
     invalid = [n for n in names if n not in TEST_METRICS]
     if invalid:
-        raise argparse.ArgumentTypeError(
-            f"invalid choice(s): {invalid!r} (choose from {list(TEST_METRICS)})"
-        )
+        raise argparse.ArgumentTypeError(f"invalid choice(s): {invalid!r} (choose from {list(TEST_METRICS)})")
     return names
 
 
diff --git a/olive/cli/run.py b/olive/cli/run.py
index b488c2b8e..55c7ffb9b 100644
--- a/olive/cli/run.py
+++ b/olive/cli/run.py
@@ -5,8 +5,8 @@
 from argparse import ArgumentParser
 
 from olive.cli.base import (
-    _flatten_test_metrics,
     BaseOliveCLICommand,
+    _flatten_test_metrics,
     add_discrepancy_check_pass,
     add_hf_test_model_config,
     add_input_model_options,

From 8a44f3a56e28363550339c800427331c495b74cb Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 29 Jun 2026 17:40:15 +0000
Subject: [PATCH 30/80] Fix 404 error: resolve relative test_model_path to
 absolute before HF/ORT GenAI lookups

---
 olive/cli/base.py                  | 4 ++++
 olive/passes/onnx/model_builder.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/olive/cli/base.py b/olive/cli/base.py
index a701cac55..dd171615f 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -134,6 +134,10 @@ def add_discrepancy_check_pass(
     reference_model_path = input_model.get("test_model_path")
     if not reference_model_path:
         return run_config
+    # Resolve to absolute path so ORT GenAI and transformers always find a local
+    # directory rather than treating a relative path like "out/my-test" as a
+    # HuggingFace "org/repo" model identifier.
+    reference_model_path = str(Path(reference_model_path).resolve())
 
     # Determine output directory for discrepancy results
     report_dir = run_config.get("output_dir") or run_config.get("engine", {}).get("output_dir")
diff --git a/olive/passes/onnx/model_builder.py b/olive/passes/onnx/model_builder.py
index e4aa0fdbc..5321c13d1 100644
--- a/olive/passes/onnx/model_builder.py
+++ b/olive/passes/onnx/model_builder.py
@@ -251,7 +251,7 @@ def _run_for_config(
                     )
                 if not is_test_model_dir(model.test_model_path):
                     model.load_model(cache_model=False)
-                model_path = model.test_model_path
+                model_path = str(Path(model.test_model_path).resolve())
             # provide the model path as input path, model builder uses input_path for quantized models
             input_path = model_path
             if model.adapter_path:

From d65d753c89199ceafdae8bf0db2e5d0ab63a55d5 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 30 Jun 2026 07:08:27 +0000
Subject: [PATCH 31/80] Add num_hidden_layers parameter to OnnxDiscrepancyCheck
 (default 2)

---
 olive/passes/onnx/discrepancy_check.py | 27 +++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index a7597dd0b..1750b08ad 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -330,6 +330,15 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
                     "cp -r /tmp/llama_cpp_repo/conversion llama_env/``."
                 ),
             ),
+            "num_hidden_layers": PassConfigParam(
+                type_=int,
+                default_value=2,
+                description=(
+                    "Override the number of hidden layers in the reference model before loading it. "
+                    "Reduces the model size for faster testing. The modified configuration is saved "
+                    "alongside the discrepancy check results."
+                ),
+            ),
         }
 
     def _run_for_config(
@@ -377,7 +386,17 @@ def _run_for_config(
                 f"Got architectures={architectures}"
             )
 
-        ref_model = AutoModelForCausalLM.from_pretrained(config.reference_model_path)
+        # Override the number of hidden layers when requested (speeds up testing with smaller models).
+        if config.num_hidden_layers is not None:
+            for attr_name in ("num_hidden_layers", "num_layers", "n_layer", "n_layers"):
+                if hasattr(ref_cfg, attr_name):
+                    setattr(ref_cfg, attr_name, config.num_hidden_layers)
+            logger.info(
+                "OnnxDiscrepancyCheck: overriding num_hidden_layers to %d on reference model config.",
+                config.num_hidden_layers,
+            )
+
+        ref_model = AutoModelForCausalLM.from_pretrained(config.reference_model_path, config=ref_cfg)
         ref_model.eval()
 
         # Determine the floating-point dtype used by the ONNX model weights and
@@ -426,6 +445,12 @@ def _run_for_config(
         if config.save_reference_model_state_dict:
             self._export_reference_model(ref_model, report_dir)
 
+        # Save the (potentially modified) model config alongside the results so the
+        # exact configuration used for this test run is always reproducible.
+        config_save_path = Path(report_dir) / "reference_model_config.json"
+        config_save_path.parent.mkdir(parents=True, exist_ok=True)
+        config_save_path.write_text(ref_cfg.to_json_string())
+
         session = model.prepare_session(
             device=device,
             execution_providers=[execution_provider] if execution_provider else None,

From a6dcb8e53b4733bf01ade717c0047186f0faafea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Tue, 30 Jun 2026 10:28:25 +0200
Subject: [PATCH 32/80] update documentation

---
 docs/source/how-to/cli/cli-fast-test.md |  2 ++
 olive/passes/onnx/discrepancy_check.py  | 19 -------------------
 2 files changed, 2 insertions(+), 19 deletions(-)

diff --git a/docs/source/how-to/cli/cli-fast-test.md b/docs/source/how-to/cli/cli-fast-test.md
index 6e50e2387..24899b074 100644
--- a/docs/source/how-to/cli/cli-fast-test.md
+++ b/docs/source/how-to/cli/cli-fast-test.md
@@ -18,9 +18,11 @@ olive optimize \
     --precision int4 \
     --output_path out/qwen \
     --dry_run
+    --test out/qwen-test-model
 ```
 
 This creates `out/qwen/config.json` without launching the full conversion yet.
+It also adds a pass OnnxDiscrepancyCheck if not present.
 
 ## Step 2: run a fast smoke test with `olive run --test`
 
diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 1750b08ad..7ea87c0ff 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -330,15 +330,6 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
                     "cp -r /tmp/llama_cpp_repo/conversion llama_env/``."
                 ),
             ),
-            "num_hidden_layers": PassConfigParam(
-                type_=int,
-                default_value=2,
-                description=(
-                    "Override the number of hidden layers in the reference model before loading it. "
-                    "Reduces the model size for faster testing. The modified configuration is saved "
-                    "alongside the discrepancy check results."
-                ),
-            ),
         }
 
     def _run_for_config(
@@ -386,16 +377,6 @@ def _run_for_config(
                 f"Got architectures={architectures}"
             )
 
-        # Override the number of hidden layers when requested (speeds up testing with smaller models).
-        if config.num_hidden_layers is not None:
-            for attr_name in ("num_hidden_layers", "num_layers", "n_layer", "n_layers"):
-                if hasattr(ref_cfg, attr_name):
-                    setattr(ref_cfg, attr_name, config.num_hidden_layers)
-            logger.info(
-                "OnnxDiscrepancyCheck: overriding num_hidden_layers to %d on reference model config.",
-                config.num_hidden_layers,
-            )
-
         ref_model = AutoModelForCausalLM.from_pretrained(config.reference_model_path, config=ref_cfg)
         ref_model.eval()
 

From 8f4eff2c451c3b26be00a359f3b924bcea610fe0 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 30 Jun 2026 08:48:33 +0000
Subject: [PATCH 33/80] Fix add_discrepancy_check_pass to update existing pass;
 add llama_cpp/num_hidden_layers docs

---
 docs/source/how-to/cli/cli-fast-test.md | 58 +++++++++++++++++++++----
 olive/cli/base.py                       | 45 ++++++++++++++++---
 test/cli/test_base.py                   | 53 +++++++++++++++++++++-
 3 files changed, 139 insertions(+), 17 deletions(-)

diff --git a/docs/source/how-to/cli/cli-fast-test.md b/docs/source/how-to/cli/cli-fast-test.md
index 24899b074..164d57c40 100644
--- a/docs/source/how-to/cli/cli-fast-test.md
+++ b/docs/source/how-to/cli/cli-fast-test.md
@@ -2,7 +2,9 @@
 
 If you are converting a large language model, it is often useful to validate the Olive command, environment, and conversion recipe on a much smaller model before spending time on the full checkpoint.
 
-The `--test` option does that for Hugging Face models. Olive keeps the same model architecture, reduces it to a random 2-layer test model, saves it to the folder you provide, and reuses that folder on later runs.
+The `--test` option does that for Hugging Face models. Olive keeps the same model architecture, reduces it to a random **2-layer** test model, saves it to the folder you provide, and reuses that folder on later runs.
+
+> **Why 2 layers?**  Olive automatically overrides the `num_hidden_layers` field of the model configuration to `2` (regardless of the original depth) before creating the test model.  This keeps the random test checkpoint small and fast to convert while preserving the model's architecture family (tokeniser, attention pattern, etc.).  The same 2-layer limit is applied when the `OnnxDiscrepancyCheck` pass loads the reference model for numerical comparison.
 
 This example uses [`Qwen/Qwen3-0.6B`](https://huggingface.co/Qwen/Qwen3-0.6B), but the same pattern works for other supported Hugging Face LLMs.
 
@@ -17,16 +19,16 @@ olive optimize \
     --provider CPUExecutionProvider \
     --precision int4 \
     --output_path out/qwen \
-    --dry_run
+    --dry_run \
     --test out/qwen-test-model
 ```
 
 This creates `out/qwen/config.json` without launching the full conversion yet.
-It also adds a pass OnnxDiscrepancyCheck if not present.
+It also inserts an `OnnxDiscrepancyCheck` pass (if one is not already present) that will compare the generated ONNX model against the 2-layer reference model.
 
 ## Step 2: run a fast smoke test with `olive run --test`
 
-Use the generated config with `olive run` and pass `--test` so Olive swaps in a reduced random Qwen model.
+Use the generated config with `olive run` and pass `--test` so Olive swaps in the reduced random model.
 
 ```bash
 olive run \
@@ -37,10 +39,10 @@ olive run \
 
 What this does:
 
-- `--test out/qwen-test-model` creates a reduced random Qwen model and saves it in `out/qwen-test-model`
-- later runs reuse the same saved test model instead of recreating it
+- `--test out/qwen-test-model` creates a reduced random Qwen model (2 hidden layers) and saves it in `out/qwen-test-model` on the first run; later runs reuse the same saved test model instead of recreating it
 - `--output_path out/qwen-test-run` gives the smoke test its own output folder, so the generated ONNX artifacts are easy to find
 - Olive marks that output folder as a test-only run and refuses to reuse a non-test conversion folder for `--test`
+- The saved model configuration (`reference_model_config.json`) is written alongside the discrepancy results so you can inspect exactly which config was used
 
 After the smoke test finishes, look under `out/qwen-test-run` for the exported ONNX model and related files.
 
@@ -59,16 +61,56 @@ By default, `--test` evaluates:
 - `mae`: maximum absolute error between the ONNX and reference model outputs
 
 Add `speedup` via `--test_metrics speedup` (or `--test_metrics mae speedup`) to also run latency measurement.
-You can select a subset with `--test_metrics`. For example, to run only speedup checks:
+
+> **Note:** `--test_metrics` is always respected even when the config was generated by `olive optimize --dry_run --test`, because Olive updates the existing `OnnxDiscrepancyCheck` settings each time `olive run --test` is invoked.
+
+For example, to run both accuracy and latency checks:
+
+```bash
+olive run \
+    --config out/qwen/config.json \
+    --test out/qwen-test-model \
+    --test_metrics mae,speedup \
+    --output_path out/qwen-test-run
+```
+
+Comma-separated (`mae,speedup`) and space-separated (`mae speedup`) forms are both accepted.
+
+### Optional: compare against llama.cpp
+
+If you have a `llama_env` virtual environment with `llama-cpp-python` installed, you can also compare the generated ONNX model against a llama.cpp GGUF conversion of the same reference model.
 
 ```bash
 olive run \
     --config out/qwen/config.json \
     --test out/qwen-test-model \
-    --test_metrics speedup \
+    --test_metrics mae,speedup \
+    --test_llama_path ./llama_env \
     --output_path out/qwen-test-run
 ```
 
+`--test_llama_path` points to the virtual environment that contains `llama-cpp-python` and `convert_hf_to_gguf.py` (from the llama.cpp repository). When provided, Olive:
+
+1. Saves the reference HuggingFace model to `<output_path>/hf_model` using `save_pretrained`.
+2. Calls `convert_hf_to_gguf.py` inside the virtual environment to produce a GGUF F32 file at `<output_path>/model.gguf`.
+3. Runs inference with `llama_cpp.Llama` inside the virtual environment and reports first-token latency and speedup metrics alongside the regular MAE and ONNX speedup results.
+
+All `llama-cpp-python` imports are strictly isolated to the subprocess — the main Olive process never imports them.
+
+To set up the `llama_env` virtual environment:
+
+```bash
+python -m venv llama_env
+llama_env/bin/pip install gguf safetensors transformers sentencepiece protobuf \
+    llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+git clone --depth=1 --filter=blob:none --sparse \
+    https://github.com/ggerganov/llama.cpp.git /tmp/llama_cpp_repo
+git -C /tmp/llama_cpp_repo sparse-checkout set convert_hf_to_gguf.py conversion
+LLAMA_ENV="$(pwd)/llama_env"
+cp /tmp/llama_cpp_repo/convert_hf_to_gguf.py "$LLAMA_ENV/"
+cp -r /tmp/llama_cpp_repo/conversion "$LLAMA_ENV/"
+```
+
 ## Step 3: run the full conversion
 
 Once the smoke test succeeds, rerun the conversion on the full Qwen checkpoint by removing `--test`.
diff --git a/olive/cli/base.py b/olive/cli/base.py
index dd171615f..2cc4cb5f6 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -112,7 +112,7 @@ def warn_unused_test_metrics(test, metrics: Optional[list], llama_path: Optional
 def add_discrepancy_check_pass(
     run_config: dict, metrics: Optional[list] = None, llama_env_path: Optional[str] = None
 ) -> dict:
-    """Inject OnnxDiscrepancyCheck pass when --test is active and not already configured.
+    """Inject or update an OnnxDiscrepancyCheck pass in the run config when --test is active.
 
     ``metrics`` selects which test metrics to evaluate. Supported values are defined in
     ``TEST_METRICS`` (``"mae"`` for the max-absolute-error accuracy check and ``"speedup"`` for the
@@ -122,12 +122,13 @@ def add_discrepancy_check_pass(
     ``llama_env_path`` is the path to the llama_env virtual environment used for llama.cpp inference.
     When provided, the ``llama_cpp`` flag is enabled on the pass and the path is forwarded as
     ``llama_cpp_env_path``.
+
+    If an OnnxDiscrepancyCheck pass is already present in the config (e.g. because the config was
+    generated by ``olive optimize --dry_run --test``), the dynamic runtime fields
+    (``reference_model_path``, ``report_output_dir``, metric and llama.cpp settings) are updated so
+    that the current ``--test_metrics`` and ``--output_path`` values take effect.
     """
     passes = run_config.get("passes", {})
-    # Skip if already configured
-    for pass_config in passes.values():
-        if isinstance(pass_config, dict) and pass_config.get("type", "").lower() == "onnxdiscrepancycheck":
-            return run_config
 
     # Get the reference model path from the input_model test_model_path
     input_model = run_config.get("input_model", {})
@@ -143,10 +144,40 @@ def add_discrepancy_check_pass(
     report_dir = run_config.get("output_dir") or run_config.get("engine", {}).get("output_dir")
     if report_dir and Path(report_dir).suffix and not Path(report_dir).is_dir():
         report_dir = str(Path(report_dir).parent)
-    logger.debug("Adding OnnxDiscrepancyCheck pass with reference_model_path=%s", reference_model_path)
 
     selected_metrics = set(metrics) if metrics else {"mae"}
-    pass_config = {
+
+    # If the pass already exists, update the dynamic runtime fields rather than re-creating it from
+    # scratch.  This handles the common pattern of running ``olive optimize --dry_run --test …``
+    # (which saves a config with a pre-populated OnnxDiscrepancyCheck) and then ``olive run --config
+    # … --test … --test_metrics …``.  Without this update the ``--test_metrics`` selection and the
+    # new ``--output_path`` would be silently ignored.
+    for pass_cfg in passes.values():
+        if isinstance(pass_cfg, dict) and pass_cfg.get("type", "").lower() == "onnxdiscrepancycheck":
+            pass_cfg["reference_model_path"] = reference_model_path
+            if report_dir is not None:
+                pass_cfg["report_output_dir"] = report_dir
+            # Respect --test_metrics: enable/disable mae threshold and speedup measurement.
+            if "mae" in selected_metrics:
+                pass_cfg.setdefault("max_mae", 0.1)
+            else:
+                pass_cfg.pop("max_mae", None)
+            if "speedup" in selected_metrics:
+                pass_cfg.pop("timing_iterations", None)  # use the default (5 iterations)
+            else:
+                pass_cfg["timing_iterations"] = 0
+            # Enable llama.cpp when a venv path is provided.
+            if llama_env_path:
+                pass_cfg["llama_cpp"] = True
+                pass_cfg["llama_cpp_env_path"] = llama_env_path
+            logger.debug(
+                "Updated existing OnnxDiscrepancyCheck pass with reference_model_path=%s", reference_model_path
+            )
+            return run_config
+
+    logger.debug("Adding OnnxDiscrepancyCheck pass with reference_model_path=%s", reference_model_path)
+
+    pass_config: dict = {
         "type": "OnnxDiscrepancyCheck",
         "reference_model_path": reference_model_path,
         "report_output_dir": report_dir,
diff --git a/test/cli/test_base.py b/test/cli/test_base.py
index e7c5ee3ee..4e8be12f6 100644
--- a/test/cli/test_base.py
+++ b/test/cli/test_base.py
@@ -354,7 +354,7 @@ def test_add_discrepancy_check_pass_default_enables_mae_only():
 
     pass_config = run_config["passes"]["discrepancy_check"]
     assert pass_config["type"] == "OnnxDiscrepancyCheck"
-    assert pass_config["reference_model_path"] == "ref_model"
+    assert pass_config["reference_model_path"] == str(Path("ref_model").resolve())
     # default: mae only -> threshold enforced, timing disabled
     assert pass_config["max_mae"] == 0.1
     assert pass_config["timing_iterations"] == 0
@@ -429,7 +429,56 @@ def test_add_discrepancy_check_pass_no_llama_env_path_omits_llama_config():
     assert "llama_cpp_env_path" not in pass_config
 
 
-def test_parse_test_metrics_comma_separated():
+def test_add_discrepancy_check_pass_updates_existing_pass():
+    """When OnnxDiscrepancyCheck already exists in the config, its runtime fields are updated."""
+    from olive.cli.base import add_discrepancy_check_pass
+
+    # Simulate a config generated by `olive optimize --dry_run --test` - the pass already exists
+    # with stale settings (old output dir, timing disabled because no --test_metrics was given).
+    config = _discrepancy_run_config()
+    config["passes"] = {
+        "discrepancy_check": {
+            "type": "OnnxDiscrepancyCheck",
+            "reference_model_path": "/old/abs/path",
+            "report_output_dir": "/old/out_dir",
+            "timing_iterations": 0,  # speedup was not requested at generate-time
+        }
+    }
+    config["input_model"]["test_model_path"] = "new_ref_model"
+    config["output_dir"] = "new_out_dir"
+
+    result = add_discrepancy_check_pass(config, metrics=["mae", "speedup"])
+
+    pass_config = result["passes"]["discrepancy_check"]
+    # Reference model path and output dir must be updated to the current values.
+    assert pass_config["reference_model_path"] == str(Path("new_ref_model").resolve())
+    assert pass_config["report_output_dir"] == "new_out_dir"
+    # With "speedup" among the requested metrics, timing_iterations should NOT be forced to 0.
+    assert "timing_iterations" not in pass_config
+    # With metrics including "mae", max_mae threshold should be set.
+    assert pass_config["max_mae"] == 0.1
+
+
+def test_add_discrepancy_check_pass_updates_existing_pass_speedup_only():
+    """Updating an existing pass with speedup-only metrics removes the mae threshold."""
+    from olive.cli.base import add_discrepancy_check_pass
+
+    config = _discrepancy_run_config()
+    config["passes"] = {
+        "dc": {
+            "type": "onnxdiscrepancycheck",  # case-insensitive type match
+            "reference_model_path": "/old/path",
+            "max_mae": 0.1,
+            "timing_iterations": 0,
+        }
+    }
+
+    pass_config = add_discrepancy_check_pass(config, metrics=["speedup"])["passes"]["dc"]
+    assert "max_mae" not in pass_config
+    assert "timing_iterations" not in pass_config
+
+
+def test_parse_test_metrics_comma_separated():
     from olive.cli.base import _parse_test_metrics
 
     assert _parse_test_metrics("mae,speedup") == ["mae", "speedup"]

From ca2c09458e1b786e006a7804eb6c87e2333d57a0 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 30 Jun 2026 09:52:49 +0000
Subject: [PATCH 34/80] Fix test model cache persistence + add
 num_hidden_layers to OnnxDiscrepancyCheck

- model_builder: copy test model to 'reference_hf_model/' alongside ONNX output
  so it survives engine cache hits; set model_attributes['hf_reference_model_dir']
  and exclude the dir from additional_files
- discrepancy_check: resolve reference_model_path with fallback to
  '<onnx_dir>/reference_hf_model/' when the original path no longer exists;
  add num_hidden_layers parameter; pass resolved ref_path to compare_generation
  and compare_llama_cpp instead of config.reference_model_path
- base.py: add num_hidden_layers=2 to OnnxDiscrepancyCheck pass config
- docs: clarify WHERE (_apply_test_model_config) and WHEN (first olive run)
  layers are reduced; document cache fallback behaviour
---
 docs/source/how-to/cli/cli-fast-test.md | 16 ++++++-
 olive/cli/base.py                       |  2 +
 olive/passes/onnx/discrepancy_check.py  | 55 ++++++++++++++++++++++---
 olive/passes/onnx/model_builder.py      | 18 +++++++-
 4 files changed, 81 insertions(+), 10 deletions(-)

diff --git a/docs/source/how-to/cli/cli-fast-test.md b/docs/source/how-to/cli/cli-fast-test.md
index 164d57c40..588ada470 100644
--- a/docs/source/how-to/cli/cli-fast-test.md
+++ b/docs/source/how-to/cli/cli-fast-test.md
@@ -4,7 +4,17 @@ If you are converting a large language model, it is often useful to validate the
 
 The `--test` option does that for Hugging Face models. Olive keeps the same model architecture, reduces it to a random **2-layer** test model, saves it to the folder you provide, and reuses that folder on later runs.
 
-> **Why 2 layers?**  Olive automatically overrides the `num_hidden_layers` field of the model configuration to `2` (regardless of the original depth) before creating the test model.  This keeps the random test checkpoint small and fast to convert while preserving the model's architecture family (tokeniser, attention pattern, etc.).  The same 2-layer limit is applied when the `OnnxDiscrepancyCheck` pass loads the reference model for numerical comparison.
+> **Why 2 layers?**  When `olive run` executes the model-builder pass for the first time, it calls
+> `_apply_test_model_config` (in `olive/common/hf/utils.py`) to override every hidden-layer count
+> field (`num_hidden_layers`, `num_layers`, `n_layer`, `n_layers`) to `2` before the random test
+> checkpoint is created.  This keeps the checkpoint small and fast to convert while preserving the
+> model's architecture family (tokeniser, attention pattern, etc.).
+>
+> **Note:** `olive optimize --dry_run` only generates the workflow config — it does **not** run any
+> passes or create the test model directory.  The test model is created the first time you run
+> `olive run --test`.  On subsequent `olive run` calls that hit the model-builder cache, Olive
+> automatically falls back to a copy of the test model saved alongside the cached ONNX artifacts,
+> so the test model directory is not required to exist on disk.
 
 This example uses [`Qwen/Qwen3-0.6B`](https://huggingface.co/Qwen/Qwen3-0.6B), but the same pattern works for other supported Hugging Face LLMs.
 
@@ -39,7 +49,7 @@ olive run \
 
 What this does:
 
-- `--test out/qwen-test-model` creates a reduced random Qwen model (2 hidden layers) and saves it in `out/qwen-test-model` on the first run; later runs reuse the same saved test model instead of recreating it
+- `--test out/qwen-test-model` creates a reduced random Qwen model (2 hidden layers) and saves it in `out/qwen-test-model` on the first run; on later runs Olive reuses the saved test model — or, if the model-builder output is cached and `out/qwen-test-model` no longer exists, it automatically falls back to the copy saved inside the ONNX cache directory
 - `--output_path out/qwen-test-run` gives the smoke test its own output folder, so the generated ONNX artifacts are easy to find
 - Olive marks that output folder as a test-only run and refuses to reuse a non-test conversion folder for `--test`
 - The saved model configuration (`reference_model_config.json`) is written alongside the discrepancy results so you can inspect exactly which config was used
@@ -131,6 +141,8 @@ The saved test model is useful beyond the first smoke test:
 - you can reuse the same HF test model later when comparing the Hugging Face model against the exported ONNX model
 - you avoid recreating a new random test checkpoint every time
 
+Even if you delete the test model folder, `OnnxDiscrepancyCheck` will automatically use the copy saved inside the model-builder cache directory (`reference_hf_model/` alongside the ONNX artifacts), so the comparison step continues to work.
+
 ## Related docs
 
 - [How to use the `olive optimize` command to optimize a Pytorch model](cli-optimize)
diff --git a/olive/cli/base.py b/olive/cli/base.py
index 2cc4cb5f6..2ef862e5f 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -155,6 +155,7 @@ def add_discrepancy_check_pass(
     for pass_cfg in passes.values():
         if isinstance(pass_cfg, dict) and pass_cfg.get("type", "").lower() == "onnxdiscrepancycheck":
             pass_cfg["reference_model_path"] = reference_model_path
+            pass_cfg["num_hidden_layers"] = 2
             if report_dir is not None:
                 pass_cfg["report_output_dir"] = report_dir
             # Respect --test_metrics: enable/disable mae threshold and speedup measurement.
@@ -181,6 +182,7 @@ def add_discrepancy_check_pass(
         "type": "OnnxDiscrepancyCheck",
         "reference_model_path": reference_model_path,
         "report_output_dir": report_dir,
+        "num_hidden_layers": 2,
     }
     # Enforce the max-absolute-error threshold only when the accuracy metric is requested.
     if "mae" in selected_metrics:
diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 7ea87c0ff..ee6062582 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -330,6 +330,19 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
                     "cp -r /tmp/llama_cpp_repo/conversion llama_env/``."
                 ),
             ),
+            "num_hidden_layers": PassConfigParam(
+                type_=Optional[int],
+                default_value=None,
+                description=(
+                    "When set, overrides the number of hidden layers in the reference HuggingFace "
+                    "model config before loading it.  Useful when ``reference_model_path`` points "
+                    "to a pre-saved small test model that already has a reduced layer count — the "
+                    "override is then a no-op but makes the intended layer count explicit in the "
+                    "saved ``reference_model_config.json`` report.  Supports ``num_hidden_layers``, "
+                    "``num_layers``, ``n_layer``, and ``n_layers`` to cover both BERT-style and "
+                    "GPT-style model families."
+                ),
+            ),
         }
 
     def _run_for_config(
@@ -369,7 +382,34 @@ def _run_for_config(
         # Load reference PyTorch model
         from transformers import AutoConfig, AutoModelForCausalLM
 
-        ref_cfg = AutoConfig.from_pretrained(config.reference_model_path)
+        # Resolve the reference model path.  Use the configured path if it exists as a local
+        # directory; otherwise fall back to the ``reference_hf_model`` copy that ModelBuilder
+        # saves alongside the ONNX output.  That copy is written on the first successful build
+        # and is preserved across engine cache hits, so OnnxDiscrepancyCheck keeps working even
+        # when the original ``test_model_path`` (e.g. ``out/tiny-test``) has been deleted.
+        ref_path = config.reference_model_path
+        if not Path(ref_path).is_dir():
+            hf_ref_dir = (model.model_attributes or {}).get("hf_reference_model_dir", "reference_hf_model")
+            fallback = Path(model.model_path).parent / hf_ref_dir
+            if fallback.is_dir():
+                logger.info(
+                    "Reference model not found at %r; using cached copy at %r.",
+                    ref_path,
+                    str(fallback),
+                )
+                ref_path = str(fallback)
+            else:
+                raise RuntimeError(
+                    f"Reference model directory {ref_path!r} does not exist and no cached copy was "
+                    f"found at {str(fallback)!r}. Re-run the optimization workflow (olive run) to "
+                    "recreate the test model."
+                )
+
+        ref_cfg = AutoConfig.from_pretrained(ref_path)
+        if config.num_hidden_layers is not None:
+            from olive.common.hf.utils import _apply_test_model_config
+
+            ref_cfg = _apply_test_model_config(ref_cfg, {"num_hidden_layers": config.num_hidden_layers})
         architectures = getattr(ref_cfg, "architectures", None) or []
         if not any("ForCausalLM" in arch for arch in architectures):
             raise ValueError(
@@ -377,7 +417,7 @@ def _run_for_config(
                 f"Got architectures={architectures}"
             )
 
-        ref_model = AutoModelForCausalLM.from_pretrained(config.reference_model_path, config=ref_cfg)
+        ref_model = AutoModelForCausalLM.from_pretrained(ref_path, config=ref_cfg)
         ref_model.eval()
 
         # Determine the floating-point dtype used by the ONNX model weights and
@@ -533,7 +573,7 @@ def _run_for_config(
 
         # Generation token sequence comparison (transformers vs ONNX Runtime GenAI)
         if config.genai_model_path:
-            gen_results = self.compare_generation(config, ref_model)
+            gen_results = self.compare_generation(config, ref_model, ref_model_path=ref_path)
             longest_common = gen_results["longest_common_token_sequence"]
             results.update(gen_results)
             results["genai_model_path"] = config.genai_model_path
@@ -554,6 +594,7 @@ def _run_for_config(
                 output_dir=report_dir,
                 pytorch_latency_s=results.get("pytorch_latency_s"),
                 onnx_latency_s=results.get("onnx_latency_s"),
+                ref_model_path=ref_path,
             )
             results.update(llama_results)
 
@@ -640,7 +681,7 @@ def _measure_speedup(
 
         return pytorch_time, onnx_time, speedup
 
-    def compare_generation(self, config: type[BasePassConfig], ref_model) -> dict:
+    def compare_generation(self, config: type[BasePassConfig], ref_model, *, ref_model_path: str) -> dict:
         """Run generation on both transformers and GenAI and compare them.
 
         Returns a dict with the longest common token sequence length and the time-to-first-token
@@ -653,7 +694,7 @@ def compare_generation(self, config: type[BasePassConfig], ref_model) -> dict:
             raise ImportError("Please install `onnxruntime-genai` to enable generation comparison.") from exc
         from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList
 
-        tokenizer = AutoTokenizer.from_pretrained(config.reference_model_path)
+        tokenizer = AutoTokenizer.from_pretrained(ref_model_path)
 
         max_new_tokens = config.generate_max_new_tokens
         first_n = max(1, min(config.time_to_first_n_tokens, max_new_tokens)) if max_new_tokens > 0 else 0
@@ -814,6 +855,8 @@ def compare_llama_cpp(
         output_dir: str,
         pytorch_latency_s: Optional[float] = None,
         onnx_latency_s: Optional[float] = None,
+        *,
+        ref_model_path: str,
     ) -> dict:
         """Convert the reference model to GGUF and compare inference with llama.cpp.
 
@@ -841,7 +884,7 @@ def compare_llama_cpp(
         convert_script = self._get_convert_script(env_path)
 
         # Tokenize the generation prompt using the main-env tokenizer
-        tokenizer = AutoTokenizer.from_pretrained(config.reference_model_path)
+        tokenizer = AutoTokenizer.from_pretrained(ref_model_path)
         encoded = tokenizer(config.generate_prompt, return_tensors="pt")
         prompt_token_ids: list[int] = encoded["input_ids"][0].tolist()
 
diff --git a/olive/passes/onnx/model_builder.py b/olive/passes/onnx/model_builder.py
index 5321c13d1..e52b8c97e 100644
--- a/olive/passes/onnx/model_builder.py
+++ b/olive/passes/onnx/model_builder.py
@@ -9,6 +9,7 @@
 import json
 import logging
 import os
+import shutil
 from enum import IntEnum
 from pathlib import Path
 from typing import Any, ClassVar, Union
@@ -379,6 +380,18 @@ def _run_for_config(
                     output_model_filepath.parent,
                 )
 
+        # When a test model was used as the build input, save a copy alongside the generated
+        # ONNX files so it survives across engine cache hits.  OnnxDiscrepancyCheck will fall
+        # back to this copy when ``reference_model_path`` (e.g. ``out/tiny-test``) no longer
+        # exists on disk — for example on subsequent ``olive run`` invocations that hit the
+        # engine output cache and never re-execute this pass.
+        if not metadata_only and model.test_model_config:
+            ref_copy_path = output_model_filepath.parent / "reference_hf_model"
+            if not ref_copy_path.exists():
+                shutil.copytree(model_path, str(ref_copy_path))
+                logger.debug("Saved reference HF model copy to %s", ref_copy_path)
+            model_attributes["hf_reference_model_dir"] = "reference_hf_model"
+
         # add additional files generated by model builder to model_attributes
         additional_files = model_attributes.get("additional_files") or []
         if metadata_only:
@@ -392,8 +405,9 @@ def _run_for_config(
             primary_model_paths = {str(fp) for fp in primary_onnx_files}
             model_attributes["additional_files"] = sorted(
                 set(additional_files)
-                # all files in the output directory except the model and model.data files
-                | {str(fp) for fp in output_model_filepath.parent.iterdir()}
+                # all files in the output directory except the model, model.data files,
+                # and the reference HF model copy (handled separately via model_attributes)
+                | {str(fp) for fp in output_model_filepath.parent.iterdir() if fp.name != "reference_hf_model"}
                 - primary_model_paths
                 - {f"{path}.data" for path in primary_model_paths}
             )

From e76ef03c43c47de871761073dbe6f728c87f44a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Tue, 30 Jun 2026 13:05:12 +0200
Subject: [PATCH 35/80] remove num_hidden_layers

---
 olive/cli/base.py                      |  2 --
 olive/passes/onnx/discrepancy_check.py | 17 -----------------
 2 files changed, 19 deletions(-)

diff --git a/olive/cli/base.py b/olive/cli/base.py
index 2ef862e5f..2cc4cb5f6 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -155,7 +155,6 @@ def add_discrepancy_check_pass(
     for pass_cfg in passes.values():
         if isinstance(pass_cfg, dict) and pass_cfg.get("type", "").lower() == "onnxdiscrepancycheck":
             pass_cfg["reference_model_path"] = reference_model_path
-            pass_cfg["num_hidden_layers"] = 2
             if report_dir is not None:
                 pass_cfg["report_output_dir"] = report_dir
             # Respect --test_metrics: enable/disable mae threshold and speedup measurement.
@@ -182,7 +181,6 @@ def add_discrepancy_check_pass(
         "type": "OnnxDiscrepancyCheck",
         "reference_model_path": reference_model_path,
         "report_output_dir": report_dir,
-        "num_hidden_layers": 2,
     }
     # Enforce the max-absolute-error threshold only when the accuracy metric is requested.
     if "mae" in selected_metrics:
diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index ee6062582..c2dfe71da 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -330,19 +330,6 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
                     "cp -r /tmp/llama_cpp_repo/conversion llama_env/``."
                 ),
             ),
-            "num_hidden_layers": PassConfigParam(
-                type_=Optional[int],
-                default_value=None,
-                description=(
-                    "When set, overrides the number of hidden layers in the reference HuggingFace "
-                    "model config before loading it.  Useful when ``reference_model_path`` points "
-                    "to a pre-saved small test model that already has a reduced layer count — the "
-                    "override is then a no-op but makes the intended layer count explicit in the "
-                    "saved ``reference_model_config.json`` report.  Supports ``num_hidden_layers``, "
-                    "``num_layers``, ``n_layer``, and ``n_layers`` to cover both BERT-style and "
-                    "GPT-style model families."
-                ),
-            ),
         }
 
     def _run_for_config(
@@ -406,10 +393,6 @@ def _run_for_config(
                 )
 
         ref_cfg = AutoConfig.from_pretrained(ref_path)
-        if config.num_hidden_layers is not None:
-            from olive.common.hf.utils import _apply_test_model_config
-
-            ref_cfg = _apply_test_model_config(ref_cfg, {"num_hidden_layers": config.num_hidden_layers})
         architectures = getattr(ref_cfg, "architectures", None) or []
         if not any("ForCausalLM" in arch for arch in architectures):
             raise ValueError(

From d195d9f3af8777c9d276b58c350e81b23470e3b7 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 30 Jun 2026 11:19:08 +0000
Subject: [PATCH 36/80] Pre-create test model config dir during --dry_run
 --test

---
 olive/cli/base.py        | 20 ++++++++++++++++++++
 olive/common/hf/utils.py | 34 +++++++++++++++++++++++++++++++++-
 test/common/test_hf.py   | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/olive/cli/base.py b/olive/cli/base.py
index 2cc4cb5f6..e507836b4 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -198,6 +198,25 @@ def add_discrepancy_check_pass(
     return run_config
 
 
+def _save_test_model_config_for_dry_run(run_config: dict) -> None:
+    """Pre-create the test model directory with a modified config during --dry_run --test.
+
+    Saves the HuggingFace config (with the reduced hidden-layer count) and the
+    test-model marker file to the ``test_model_path`` directory so that subsequent
+    ``olive run`` calls can find the directory.  Model weights are created the first
+    time ``load_model_from_task`` is invoked during an actual run.
+    """
+    from olive.common.hf.utils import save_test_model_config
+
+    input_model = run_config.get("input_model", {})
+    test_model_path = input_model.get("test_model_path")
+    model_path = input_model.get("model_path")
+    test_model_config = input_model.get("test_model_config")
+    if not (test_model_path and model_path and test_model_config):
+        return
+    save_test_model_config(model_path, test_model_config, test_model_path)
+
+
 def save_discrepancy_check_results(workflow_output, output_path: str) -> None:
     """Save discrepancy check results from model attributes to the output directory."""
     if not workflow_output or not workflow_output.has_output_model():
@@ -258,6 +277,7 @@ def _run_workflow(self):
             if self.args.dry_run:
                 if getattr(self.args, "test", None) not in (None, False):
                     mark_test_output_path(self.args.output_path)
+                    _save_test_model_config_for_dry_run(run_config)
                 print("Dry run mode enabled. Configuration file is generated but no optimization is performed.")
                 return None
             workflow_output = olive_run(run_config)
diff --git a/olive/common/hf/utils.py b/olive/common/hf/utils.py
index 75649d2e3..74326e83f 100644
--- a/olive/common/hf/utils.py
+++ b/olive/common/hf/utils.py
@@ -110,6 +110,26 @@ def _save_test_model(model: "PreTrainedModel", output_dir: str, test_model_confi
     _write_test_model_marker(output_path, test_model_config)
 
 
+def save_test_model_config(
+    model_name_or_path: str, test_model_config: Optional[dict[str, Any]], test_model_path: str
+) -> None:
+    """Save a modified config.json (without model weights) to *test_model_path*.
+
+    Used during ``--dry_run --test`` to pre-create the test model directory with the
+    reduced-layer config so that subsequent ``olive run`` calls can find the directory
+    and complete it with random weights the first time ModelBuilder runs.
+    """
+    output_path = Path(test_model_path)
+    if is_test_model_dir(output_path):
+        logger.debug("Test model config directory already exists at %s.", output_path)
+        return
+    output_path.mkdir(parents=True, exist_ok=True)
+    model_config = get_model_config(model_name_or_path, test_model_config=test_model_config)
+    model_config.save_pretrained(str(output_path))
+    _write_test_model_marker(output_path, test_model_config)
+    logger.info("Saved test model config to %s.", output_path)
+
+
 def _validate_path(test_model_dir: Path, test_model_path: str):
     if not test_model_dir or not test_model_dir.exists():
         return
@@ -174,7 +194,19 @@ def load_model_from_task(
             if test_model_config:
                 test_model_dir = Path(test_model_path) if test_model_path else None
                 if test_model_dir and is_test_model_dir(test_model_dir):
-                    model = from_pretrained(model_class, test_model_path, "model", **kwargs)
+                    # Check if model weights are present.  A config-only directory (created by
+                    # ``save_test_model_config`` during ``--dry_run --test``) has a config.json
+                    # and a marker file but no weight shards yet.  In that case, create a random
+                    # model from the saved config and persist the weights so subsequent loads
+                    # can use the saved directory directly.
+                    _has_weights = any(test_model_dir.glob("*.safetensors")) or any(
+                        test_model_dir.glob("pytorch_model*.bin")
+                    )
+                    if _has_weights:
+                        model = from_pretrained(model_class, test_model_path, "model", **kwargs)
+                    else:
+                        model = _load_test_model(model_class, model_config, kwargs.get("trust_remote_code"))
+                        _save_test_model(model, test_model_path, test_model_config)
                 else:
                     _validate_path(test_model_dir, test_model_path)
                     model = _load_test_model(model_class, model_config, kwargs.get("trust_remote_code"))
diff --git a/test/common/test_hf.py b/test/common/test_hf.py
index 9f1ad736e..c3095382f 100644
--- a/test/common/test_hf.py
+++ b/test/common/test_hf.py
@@ -106,6 +106,8 @@ def test_load_model_from_task_test_model_config_reuses_saved_model(tmp_path):
     test_model_path.mkdir()
     (test_model_path / "config.json").write_text("{}")
     (test_model_path / TEST_MODEL_MARKER_FILE).write_text(json.dumps({"type": "olive_hf_test_model"}))
+    # Add a dummy weight shard so the weights-present branch is exercised.
+    (test_model_path / "model.safetensors").write_bytes(b"dummy")
     loaded_model = MagicMock(spec=torch.nn.Module)
 
     with (
@@ -129,6 +131,37 @@ def test_load_model_from_task_test_model_config_reuses_saved_model(tmp_path):
     assert mock_from_pretrained.call_args_list[1].args[1] == str(test_model_path)
 
 
+def test_load_model_from_task_test_model_config_completes_config_only_dir(tmp_path):
+    """A config-only test model dir (created during --dry_run) should be completed with weights."""
+    model_config = BertConfig(num_hidden_layers=12)  # pylint: disable=unexpected-keyword-arg
+    test_model_path = tmp_path / "config_only_test_model"
+    test_model_path.mkdir()
+    # Simulate a config-only dir created by save_test_model_config during --dry_run:
+    # has config.json + marker but no weight shards.
+    (test_model_path / "config.json").write_text("{}")
+    (test_model_path / TEST_MODEL_MARKER_FILE).write_text(json.dumps({"type": "olive_hf_test_model"}))
+    created_model = MagicMock()
+
+    with (
+        patch("transformers.pipelines.check_task") as mock_check_task,
+        patch("olive.common.hf.utils.from_pretrained", return_value=model_config),
+    ):
+        mock_model_class = MagicMock()
+        mock_model_class.from_config.return_value = created_model
+        mock_check_task.return_value = ("text-classification", {"pt": (mock_model_class,)}, None)
+
+        model = load_model_from_task(
+            "text-classification",
+            "dummy-model",
+            test_model_config={"num_hidden_layers": 2},
+            test_model_path=str(test_model_path),
+        )
+
+    assert model is created_model
+    mock_model_class.from_config.assert_called_once()
+    created_model.save_pretrained.assert_called_once_with(str(test_model_path))
+
+
 def test_load_model_from_task_test_model_config_rejects_non_test_model_dir(tmp_path):
     model_config = BertConfig(num_hidden_layers=12)  # pylint: disable=unexpected-keyword-arg
     test_model_path = tmp_path / "saved_test_model"

From bce0baf11f5e808425bb3fa1ec31bfd5f63186a9 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 30 Jun 2026 11:39:28 +0000
Subject: [PATCH 37/80] Add SaveTestModelConfig pass to create test model
 config directory

---
 olive/cli/base.py                             | 53 ++++++++++---------
 olive/olive_config.json                       |  8 +++
 .../passes/pytorch/save_test_model_config.py  | 53 +++++++++++++++++++
 test/cli/test_base.py                         | 45 +++++++++++++---
 4 files changed, 127 insertions(+), 32 deletions(-)
 create mode 100644 olive/passes/pytorch/save_test_model_config.py

diff --git a/olive/cli/base.py b/olive/cli/base.py
index e507836b4..fee464760 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -112,7 +112,7 @@ def warn_unused_test_metrics(test, metrics: Optional[list], llama_path: Optional
 def add_discrepancy_check_pass(
     run_config: dict, metrics: Optional[list] = None, llama_env_path: Optional[str] = None
 ) -> dict:
-    """Inject or update an OnnxDiscrepancyCheck pass in the run config when --test is active.
+    """Inject or update a SaveTestModelConfig and an OnnxDiscrepancyCheck pass when --test is active.
 
     ``metrics`` selects which test metrics to evaluate. Supported values are defined in
     ``TEST_METRICS`` (``"mae"`` for the max-absolute-error accuracy check and ``"speedup"`` for the
@@ -123,10 +123,19 @@ def add_discrepancy_check_pass(
     When provided, the ``llama_cpp`` flag is enabled on the pass and the path is forwarded as
     ``llama_cpp_env_path``.
 
-    If an OnnxDiscrepancyCheck pass is already present in the config (e.g. because the config was
-    generated by ``olive optimize --dry_run --test``), the dynamic runtime fields
-    (``reference_model_path``, ``report_output_dir``, metric and llama.cpp settings) are updated so
-    that the current ``--test_metrics`` and ``--output_path`` values take effect.
+    Two passes are managed:
+
+    * ``SaveTestModelConfig`` — inserted at the *beginning* of the passes dict so that the
+      test-model directory (containing only ``config.json`` and the marker file) is created
+      before any other pass runs.  This ensures subsequent passes can find the directory even
+      on the first ``olive run`` after ``olive optimize --dry_run --test``.
+
+    * ``OnnxDiscrepancyCheck`` — appended at the end to compare the ONNX model against the
+      reference HuggingFace model.  If an instance is already present in the config (e.g.
+      from a previous ``--dry_run --test`` invocation), its dynamic runtime fields
+      (``reference_model_path``, ``report_output_dir``, metric and llama.cpp settings) are
+      updated in-place so that the current ``--test_metrics`` and ``--output_path`` values
+      always take effect.
     """
     passes = run_config.get("passes", {})
 
@@ -140,6 +149,19 @@ def add_discrepancy_check_pass(
     # HuggingFace "org/repo" model identifier.
     reference_model_path = str(Path(reference_model_path).resolve())
 
+    # --- SaveTestModelConfig pass (injected at the beginning) ---
+    # Ensure the pass is present and positioned before any other pass so that
+    # the test-model directory is created on the first real run.
+    has_save_pass = any(
+        isinstance(cfg, dict) and cfg.get("type", "").lower() == "savetestmodelconfig" for cfg in passes.values()
+    )
+    if not has_save_pass:
+        logger.debug("Adding SaveTestModelConfig pass at the beginning of the passes dict")
+        new_passes = {"save_test_model_config": {"type": "SaveTestModelConfig"}}
+        new_passes.update(passes)
+        passes = new_passes
+        run_config["passes"] = passes
+
     # Determine output directory for discrepancy results
     report_dir = run_config.get("output_dir") or run_config.get("engine", {}).get("output_dir")
     if report_dir and Path(report_dir).suffix and not Path(report_dir).is_dir():
@@ -147,6 +169,7 @@ def add_discrepancy_check_pass(
 
     selected_metrics = set(metrics) if metrics else {"mae"}
 
+    # --- OnnxDiscrepancyCheck pass ---
     # If the pass already exists, update the dynamic runtime fields rather than re-creating it from
     # scratch.  This handles the common pattern of running ``olive optimize --dry_run --test …``
     # (which saves a config with a pre-populated OnnxDiscrepancyCheck) and then ``olive run --config
@@ -198,25 +221,6 @@ def add_discrepancy_check_pass(
     return run_config
 
 
-def _save_test_model_config_for_dry_run(run_config: dict) -> None:
-    """Pre-create the test model directory with a modified config during --dry_run --test.
-
-    Saves the HuggingFace config (with the reduced hidden-layer count) and the
-    test-model marker file to the ``test_model_path`` directory so that subsequent
-    ``olive run`` calls can find the directory.  Model weights are created the first
-    time ``load_model_from_task`` is invoked during an actual run.
-    """
-    from olive.common.hf.utils import save_test_model_config
-
-    input_model = run_config.get("input_model", {})
-    test_model_path = input_model.get("test_model_path")
-    model_path = input_model.get("model_path")
-    test_model_config = input_model.get("test_model_config")
-    if not (test_model_path and model_path and test_model_config):
-        return
-    save_test_model_config(model_path, test_model_config, test_model_path)
-
-
 def save_discrepancy_check_results(workflow_output, output_path: str) -> None:
     """Save discrepancy check results from model attributes to the output directory."""
     if not workflow_output or not workflow_output.has_output_model():
@@ -277,7 +281,6 @@ def _run_workflow(self):
             if self.args.dry_run:
                 if getattr(self.args, "test", None) not in (None, False):
                     mark_test_output_path(self.args.output_path)
-                    _save_test_model_config_for_dry_run(run_config)
                 print("Dry run mode enabled. Configuration file is generated but no optimization is performed.")
                 return None
             workflow_output = olive_run(run_config)
diff --git a/olive/olive_config.json b/olive/olive_config.json
index 88a096c2f..d3b21ae89 100644
--- a/olive/olive_config.json
+++ b/olive/olive_config.json
@@ -642,6 +642,14 @@
             "supported_algorithms": [ "rtn" ],
             "supported_quantization_encodings": [  ]
         },
+        "SaveTestModelConfig": {
+            "module_path": "olive.passes.pytorch.save_test_model_config.SaveTestModelConfig",
+            "supported_providers": [ "*" ],
+            "supported_accelerators": [ "*" ],
+            "supported_precisions": [ "*" ],
+            "supported_algorithms": [  ],
+            "supported_quantization_encodings": [  ]
+        },
         "SelectiveMixedPrecision": {
             "module_path": "olive.passes.pytorch.selective_mixed_precision.SelectiveMixedPrecision",
             "supported_providers": [ "*" ],
diff --git a/olive/passes/pytorch/save_test_model_config.py b/olive/passes/pytorch/save_test_model_config.py
new file mode 100644
index 000000000..74569be24
--- /dev/null
+++ b/olive/passes/pytorch/save_test_model_config.py
@@ -0,0 +1,53 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+import logging
+
+from olive.hardware.accelerator import AcceleratorSpec
+from olive.model import HfModelHandler
+from olive.passes import Pass
+from olive.passes.pass_config import BasePassConfig, PassConfigParam
+
+logger = logging.getLogger(__name__)
+
+
+class SaveTestModelConfig(Pass):
+    """Saves the HuggingFace model config with a reduced layer count to the test_model_path directory.
+
+    When ``test_model_path`` and ``test_model_config`` are set on the input
+    ``HfModelHandler``, this pass creates the target directory and writes
+    ``config.json`` (with the modified number of hidden layers) plus the
+    Olive test-model marker file.  The model weights are *not* written here;
+    a subsequent ``ModelBuilder`` (or any other pass that calls
+    ``HfModelHandler.load_model``) will generate and persist them on first
+    use.
+
+    The pass is a no-op when neither ``test_model_path`` nor
+    ``test_model_config`` is set on the model, and it is idempotent — running
+    it a second time on a directory that already has the marker file is safe.
+
+    The input model is returned unchanged.
+    """
+
+    @classmethod
+    def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassConfigParam]:
+        return {}
+
+    def _run_for_config(
+        self, model: HfModelHandler, config: type[BasePassConfig], output_model_path: str
+    ) -> HfModelHandler:
+        from olive.common.hf.utils import save_test_model_config
+
+        test_model_path = model.test_model_path
+        test_model_config = model.test_model_config
+        if test_model_path and test_model_config:
+            logger.info("Saving test model config to %s", test_model_path)
+            save_test_model_config(model.model_name_or_path, test_model_config, test_model_path)
+        else:
+            logger.debug(
+                "SaveTestModelConfig: test_model_path=%r, test_model_config=%r — nothing to save.",
+                test_model_path,
+                test_model_config,
+            )
+        return model
diff --git a/test/cli/test_base.py b/test/cli/test_base.py
index 4e8be12f6..5100c0b0c 100644
--- a/test/cli/test_base.py
+++ b/test/cli/test_base.py
@@ -352,7 +352,12 @@ def test_add_discrepancy_check_pass_default_enables_mae_only():
 
     run_config = add_discrepancy_check_pass(_discrepancy_run_config())
 
-    pass_config = run_config["passes"]["discrepancy_check"]
+    passes = run_config["passes"]
+    # SaveTestModelConfig must be the first pass
+    first_key = next(iter(passes))
+    assert passes[first_key]["type"] == "SaveTestModelConfig"
+
+    pass_config = passes["discrepancy_check"]
     assert pass_config["type"] == "OnnxDiscrepancyCheck"
     assert pass_config["reference_model_path"] == str(Path("ref_model").resolve())
     # default: mae only -> threshold enforced, timing disabled
@@ -365,7 +370,11 @@ def test_add_discrepancy_check_pass_speedup_only_disables_mae():
 
     run_config = add_discrepancy_check_pass(_discrepancy_run_config(), metrics=["speedup"])
 
-    pass_config = run_config["passes"]["discrepancy_check"]
+    passes = run_config["passes"]
+    first_key = next(iter(passes))
+    assert passes[first_key]["type"] == "SaveTestModelConfig"
+
+    pass_config = passes["discrepancy_check"]
     assert "max_mae" not in pass_config
     assert "timing_iterations" not in pass_config
 
@@ -375,7 +384,11 @@ def test_add_discrepancy_check_pass_mae_only_disables_speedup():
 
     run_config = add_discrepancy_check_pass(_discrepancy_run_config(), metrics=["mae"])
 
-    pass_config = run_config["passes"]["discrepancy_check"]
+    passes = run_config["passes"]
+    first_key = next(iter(passes))
+    assert passes[first_key]["type"] == "SaveTestModelConfig"
+
+    pass_config = passes["discrepancy_check"]
     assert pass_config["max_mae"] == 0.1
     assert pass_config["timing_iterations"] == 0
 
@@ -414,7 +427,11 @@ def test_add_discrepancy_check_pass_llama_env_path_sets_config():
 
     run_config = add_discrepancy_check_pass(_discrepancy_run_config(), llama_env_path="/path/to/llama_env")
 
-    pass_config = run_config["passes"]["discrepancy_check"]
+    passes = run_config["passes"]
+    first_key = next(iter(passes))
+    assert passes[first_key]["type"] == "SaveTestModelConfig"
+
+    pass_config = passes["discrepancy_check"]
     assert pass_config["llama_cpp"] is True
     assert pass_config["llama_cpp_env_path"] == "/path/to/llama_env"
 
@@ -424,7 +441,11 @@ def test_add_discrepancy_check_pass_no_llama_env_path_omits_llama_config():
 
     run_config = add_discrepancy_check_pass(_discrepancy_run_config())
 
-    pass_config = run_config["passes"]["discrepancy_check"]
+    passes = run_config["passes"]
+    first_key = next(iter(passes))
+    assert passes[first_key]["type"] == "SaveTestModelConfig"
+
+    pass_config = passes["discrepancy_check"]
     assert "llama_cpp" not in pass_config
     assert "llama_cpp_env_path" not in pass_config
 
@@ -449,7 +470,12 @@ def test_add_discrepancy_check_pass_updates_existing_pass():
 
     result = add_discrepancy_check_pass(config, metrics=["mae", "speedup"])
 
-    pass_config = result["passes"]["discrepancy_check"]
+    passes = result["passes"]
+    # SaveTestModelConfig must be injected at the beginning
+    first_key = next(iter(passes))
+    assert passes[first_key]["type"] == "SaveTestModelConfig"
+
+    pass_config = passes["discrepancy_check"]
     # Reference model path and output dir must be updated to the current values.
     assert pass_config["reference_model_path"] == str(Path("new_ref_model").resolve())
     assert pass_config["report_output_dir"] == "new_out_dir"
@@ -475,7 +501,12 @@ def test_add_discrepancy_check_pass_updates_existing_pass_speedup_only():
 
     result = add_discrepancy_check_pass(config, metrics=["speedup"])
 
-    pass_config = result["passes"]["dc"]
+    passes = result["passes"]
+    # SaveTestModelConfig must be injected at the beginning
+    first_key = next(iter(passes))
+    assert passes[first_key]["type"] == "SaveTestModelConfig"
+
+    pass_config = passes["dc"]
     assert "max_mae" not in pass_config
     assert "timing_iterations" not in pass_config
 

From 0483c52451f4b2fbed252ac671e5f8444bac1e99 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 30 Jun 2026 19:41:08 +0000
Subject: [PATCH 38/80] SaveTestModelConfig pass now saves random model weights
 in addition to config

---
 .../passes/pytorch/save_test_model_config.py  | 38 ++++++++++++-------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/olive/passes/pytorch/save_test_model_config.py b/olive/passes/pytorch/save_test_model_config.py
index 74569be24..850b51fc1 100644
--- a/olive/passes/pytorch/save_test_model_config.py
+++ b/olive/passes/pytorch/save_test_model_config.py
@@ -13,19 +13,17 @@
 
 
 class SaveTestModelConfig(Pass):
-    """Saves the HuggingFace model config with a reduced layer count to the test_model_path directory.
+    """Saves a random-initialised HuggingFace model to the test_model_path directory.
 
     When ``test_model_path`` and ``test_model_config`` are set on the input
-    ``HfModelHandler``, this pass creates the target directory and writes
-    ``config.json`` (with the modified number of hidden layers) plus the
-    Olive test-model marker file.  The model weights are *not* written here;
-    a subsequent ``ModelBuilder`` (or any other pass that calls
-    ``HfModelHandler.load_model``) will generate and persist them on first
-    use.
+    ``HfModelHandler``, this pass creates the target directory, writes
+    ``config.json`` (with the modified number of hidden layers), the Olive
+    test-model marker file, *and* the random model weights (safetensors).
 
-    The pass is a no-op when neither ``test_model_path`` nor
-    ``test_model_config`` is set on the model, and it is idempotent — running
-    it a second time on a directory that already has the marker file is safe.
+    The pass is a no-op unless both ``test_model_path`` and
+    ``test_model_config`` are set on the model, and it is idempotent — running
+    it a second time on a directory that already contains both the marker file
+    and model weights is safe.
 
     The input model is returned unchanged.
     """
@@ -37,17 +35,31 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
     def _run_for_config(
         self, model: HfModelHandler, config: type[BasePassConfig], output_model_path: str
     ) -> HfModelHandler:
-        from olive.common.hf.utils import save_test_model_config
+        from pathlib import Path
+
+        from olive.common.hf.utils import is_test_model_dir
 
         test_model_path = model.test_model_path
         test_model_config = model.test_model_config
-        if test_model_path and test_model_config:
-            logger.info("Saving test model config to %s", test_model_path)
-            save_test_model_config(model.model_name_or_path, test_model_config, test_model_path)
-        else:
+        if not (test_model_path and test_model_config):
             logger.debug(
                 "SaveTestModelConfig: test_model_path=%r, test_model_config=%r — nothing to save.",
                 test_model_path,
                 test_model_config,
             )
+            return model
+
+        test_model_dir = Path(test_model_path)
+        _has_weights = is_test_model_dir(test_model_dir) and (
+            any(test_model_dir.glob("*.safetensors")) or any(test_model_dir.glob("pytorch_model*.bin"))
+        )
+        if _has_weights:
+            logger.debug("Test model already saved at %s — skipping.", test_model_path)
+            return model
+
+        logger.info("Saving test random model to %s", test_model_path)
+        # load_model calls load_model_from_task which creates a random-initialised model
+        # from the reduced config and persists it (weights + config.json + marker) to
+        # test_model_path on the first call.
+        model.load_model(cache_model=False)
         return model

From df9602e6cf3eba4a29e042a8d6e26f3e4764bc18 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 1 Jul 2026 07:09:16 +0000
Subject: [PATCH 39/80] Fix test_metrics not saved in dry_run: always write
 timing_iterations explicitly

---
 olive/cli/base.py | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/olive/cli/base.py b/olive/cli/base.py
index fee464760..170550a9c 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -167,7 +167,11 @@ def add_discrepancy_check_pass(
     if report_dir and Path(report_dir).suffix and not Path(report_dir).is_dir():
         report_dir = str(Path(report_dir).parent)
 
-    selected_metrics = set(metrics) if metrics else {"mae"}
+    # Only apply metric-related changes when the caller explicitly provided --test_metrics.
+    # When metrics is None (not supplied by the user), metric settings already present in
+    # the config (e.g. from a previous --dry_run --test_metrics run) are left untouched.
+    metrics_explicit = metrics is not None
+    selected_metrics = set(metrics) if metrics_explicit else {"mae"}
 
     # --- OnnxDiscrepancyCheck pass ---
     # If the pass already exists, update the dynamic runtime fields rather than re-creating it from
@@ -180,15 +184,19 @@ def add_discrepancy_check_pass(
             pass_cfg["reference_model_path"] = reference_model_path
             if report_dir is not None:
                 pass_cfg["report_output_dir"] = report_dir
-            # Respect --test_metrics: enable/disable mae threshold and speedup measurement.
-            if "mae" in selected_metrics:
-                pass_cfg.setdefault("max_mae", 0.1)
-            else:
-                pass_cfg.pop("max_mae", None)
-            if "speedup" in selected_metrics:
-                pass_cfg.pop("timing_iterations", None)  # use the default (5 iterations)
-            else:
-                pass_cfg["timing_iterations"] = 0
+            # Only modify metric settings when --test_metrics was explicitly provided.
+            # Without this guard a bare ``olive run --test`` (no --test_metrics) would
+            # reset timing_iterations to 0, discarding any speedup setting that was
+            # written by a prior ``olive optimize --dry_run --test_metrics mae,speedup``.
+            if metrics_explicit:
+                if "mae" in selected_metrics:
+                    pass_cfg.setdefault("max_mae", 0.1)
+                else:
+                    pass_cfg.pop("max_mae", None)
+                if "speedup" in selected_metrics:
+                    pass_cfg.pop("timing_iterations", None)  # use the default (5 iterations)
+                else:
+                    pass_cfg["timing_iterations"] = 0
             # Enable llama.cpp when a venv path is provided.
             if llama_env_path:
                 pass_cfg["llama_cpp"] = True
@@ -208,9 +216,10 @@ def add_discrepancy_check_pass(
     # Enforce the max-absolute-error threshold only when the accuracy metric is requested.
     if "mae" in selected_metrics:
         pass_config["max_mae"] = 0.1
-    # Disable the latency/speedup measurement when the speedup metric is not requested.
-    if "speedup" not in selected_metrics:
-        pass_config["timing_iterations"] = 0
+    # Always write timing_iterations explicitly so the config is self-contained and not
+    # affected by future changes to the pass default.  Use 0 to disable speedup or the
+    # default of 5 iterations when speedup is requested.
+    pass_config["timing_iterations"] = 5 if "speedup" in selected_metrics else 0
     # Enable llama.cpp comparison when a venv path is provided.
     if llama_env_path:
         pass_config["llama_cpp"] = True

From 34a08e5057433a94111811f598d590a25cf866db Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 1 Jul 2026 07:28:09 +0000
Subject: [PATCH 40/80] Fix missing ref_model_path in test calls and write
 timing_iterations explicitly on update

---
 olive/cli/base.py                          |  6 ++----
 test/passes/onnx/test_discrepancy_check.py | 11 +++++++----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/olive/cli/base.py b/olive/cli/base.py
index 170550a9c..feede83cd 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -193,10 +193,8 @@ def add_discrepancy_check_pass(
                     pass_cfg.setdefault("max_mae", 0.1)
                 else:
                     pass_cfg.pop("max_mae", None)
-                if "speedup" in selected_metrics:
-                    pass_cfg.pop("timing_iterations", None)  # use the default (5 iterations)
-                else:
-                    pass_cfg["timing_iterations"] = 0
+                # Always write timing_iterations explicitly so the saved config is self-contained.
+                pass_cfg["timing_iterations"] = 5 if "speedup" in selected_metrics else 0
             # Enable llama.cpp when a venv path is provided.
             if llama_env_path:
                 pass_cfg["llama_cpp"] = True
diff --git a/test/passes/onnx/test_discrepancy_check.py b/test/passes/onnx/test_discrepancy_check.py
index 152525761..721fc3f2d 100644
--- a/test/passes/onnx/test_discrepancy_check.py
+++ b/test/passes/onnx/test_discrepancy_check.py
@@ -106,7 +106,7 @@ def get_next_tokens_side_effect():
             patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer),
         ):
             pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck)
-            result = pass_instance.compare_generation(config, mock_ref_model)
+            result = pass_instance.compare_generation(config, mock_ref_model, ref_model_path=config.reference_model_path)
 
         mock_generator.append_tokens.assert_called_once_with([[1, 2, 3]])
         # Common prefix: [1, 2, 3, 10, 11] = 5 tokens before divergence
@@ -172,7 +172,7 @@ def get_next_tokens_side_effect():
             patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer),
         ):
             pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck)
-            result = pass_instance.compare_generation(config, mock_ref_model)
+            result = pass_instance.compare_generation(config, mock_ref_model, ref_model_path=config.reference_model_path)
 
         mock_generator.append_tokens.assert_called_once_with([[10, 20]])
         # All 5 tokens match
@@ -227,7 +227,7 @@ def test_compare_generation_with_zero_max_new_tokens(self):
             patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer),
         ):
             pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck)
-            result = pass_instance.compare_generation(config, mock_ref_model)
+            result = pass_instance.compare_generation(config, mock_ref_model, ref_model_path=config.reference_model_path)
 
         assert mock_ref_model.generate.call_count == 1
         assert mock_ref_model.generate.call_args.kwargs["max_new_tokens"] == 0
@@ -515,6 +515,7 @@ def test_compare_llama_cpp_returns_expected_metrics(self, tmp_path):
                 output_dir=str(tmp_path),
                 pytorch_latency_s=0.10,
                 onnx_latency_s=0.05,
+                ref_model_path=config.reference_model_path,
             )
 
         expected_keys = {
@@ -581,7 +582,9 @@ def test_compare_llama_cpp_no_latency_baselines(self, tmp_path):
             patch("numpy.savez"),
         ):
             pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck)
-            result = pass_instance.compare_llama_cpp(config, mock_ref_model, output_dir=str(tmp_path))
+            result = pass_instance.compare_llama_cpp(
+                config, mock_ref_model, output_dir=str(tmp_path), ref_model_path=config.reference_model_path
+            )
 
         assert result["llama_cpp_speedup_vs_pytorch"] is None
         assert result["llama_cpp_speedup_vs_onnx"] is None

From 5bf40870fedd921856fef2136c5c70dd62ff7d77 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 1 Jul 2026 07:48:19 +0000
Subject: [PATCH 41/80] Add test_metrics parameter to OnnxDiscrepancyCheck and
 store it in generated config

---
 olive/cli/base.py                      | 20 ++++++-----------
 olive/passes/onnx/discrepancy_check.py | 30 ++++++++++++++++++++++----
 test/cli/test_base.py                  | 29 ++++++++++---------------
 3 files changed, 43 insertions(+), 36 deletions(-)

diff --git a/olive/cli/base.py b/olive/cli/base.py
index feede83cd..873a6c738 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -186,15 +186,12 @@ def add_discrepancy_check_pass(
                 pass_cfg["report_output_dir"] = report_dir
             # Only modify metric settings when --test_metrics was explicitly provided.
             # Without this guard a bare ``olive run --test`` (no --test_metrics) would
-            # reset timing_iterations to 0, discarding any speedup setting that was
+            # reset test_metrics to the default, discarding any speedup setting that was
             # written by a prior ``olive optimize --dry_run --test_metrics mae,speedup``.
             if metrics_explicit:
-                if "mae" in selected_metrics:
-                    pass_cfg.setdefault("max_mae", 0.1)
-                else:
-                    pass_cfg.pop("max_mae", None)
-                # Always write timing_iterations explicitly so the saved config is self-contained.
-                pass_cfg["timing_iterations"] = 5 if "speedup" in selected_metrics else 0
+                # Store the human-readable test_metrics list so users can see what is
+                # being evaluated by inspecting config.json (e.g. "test_metrics": ["speedup"]).
+                pass_cfg["test_metrics"] = sorted(selected_metrics)
             # Enable llama.cpp when a venv path is provided.
             if llama_env_path:
                 pass_cfg["llama_cpp"] = True
@@ -210,14 +207,9 @@ def add_discrepancy_check_pass(
         "type": "OnnxDiscrepancyCheck",
         "reference_model_path": reference_model_path,
         "report_output_dir": report_dir,
+        # Store the human-readable metric list so users can inspect what will be evaluated.
+        "test_metrics": sorted(selected_metrics),
     }
-    # Enforce the max-absolute-error threshold only when the accuracy metric is requested.
-    if "mae" in selected_metrics:
-        pass_config["max_mae"] = 0.1
-    # Always write timing_iterations explicitly so the config is self-contained and not
-    # affected by future changes to the pass default.  Use 0 to disable speedup or the
-    # default of 5 iterations when speedup is requested.
-    pass_config["timing_iterations"] = 5 if "speedup" in selected_metrics else 0
     # Enable llama.cpp comparison when a venv path is provided.
     if llama_env_path:
         pass_config["llama_cpp"] = True
diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index c2dfe71da..946d18528 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -227,6 +227,17 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
                     "This allows direct comparison between the reference and optimized models."
                 ),
             ),
+            "test_metrics": PassConfigParam(
+                type_=Optional[list[str]],
+                default_value=None,
+                description=(
+                    "List of test metrics to evaluate. Accepted values are ``'mae'`` (max absolute error "
+                    "between ONNX and reference PyTorch outputs) and ``'speedup'`` (ONNX-vs-PyTorch "
+                    "inference latency). When set, this field takes precedence over ``timing_iterations`` "
+                    "and ``max_mae``: ``'speedup'`` enables timing, ``'mae'`` enforces the MAE threshold. "
+                    "Example: ``['mae', 'speedup']``. Set by the CLI ``--test_metrics`` option."
+                ),
+            ),
             "max_mae": PassConfigParam(
                 type_=Optional[float],
                 default_value=None,
@@ -511,8 +522,19 @@ def _run_for_config(
         )
         logger.info(summary)
 
+        # Resolve effective metric settings: test_metrics takes precedence when set.
+        # This lets the CLI store a human-readable ["mae", "speedup"] list in the config
+        # while still supporting the lower-level timing_iterations / max_mae controls for
+        # advanced users and backward compatibility with older configs.
+        if config.test_metrics is not None:
+            effective_timing_iterations = 5 if "speedup" in config.test_metrics else 0
+            effective_max_mae = 0.1 if "mae" in config.test_metrics else None
+        else:
+            effective_timing_iterations = config.timing_iterations
+            effective_max_mae = config.max_mae
+
         # Measure inference speedup (ONNX vs PyTorch) on the target device
-        if config.timing_iterations > 0:
+        if effective_timing_iterations > 0:
             timing = self._measure_speedup(
                 ref_model,
                 session,
@@ -520,7 +542,7 @@ def _run_for_config(
                 io_config,
                 torch_device,
                 config.warmup_iterations,
-                config.timing_iterations,
+                effective_timing_iterations,
             )
             if timing is not None:
                 pytorch_time, onnx_time, speedup = timing
@@ -530,12 +552,12 @@ def _run_for_config(
         else:
             logger.info(
                 "OnnxDiscrepancyCheck speedup measurement skipped because timing_iterations=%d.",
-                config.timing_iterations,
+                effective_timing_iterations,
             )
 
         # Check thresholds
         failures = []
-        if config.max_mae is not None and max_abs_error > config.max_mae:
-            failures.append(f"Max absolute error {max_abs_error:.6f} exceeds threshold {config.max_mae:.6f}")
+        if effective_max_mae is not None and max_abs_error > effective_max_mae:
+            failures.append(f"Max absolute error {max_abs_error:.6f} exceeds threshold {effective_max_mae:.6f}")
         if config.max_elements_above_0_1 is not None and count_above_0_1 > config.max_elements_above_0_1:
             failures.append(
diff --git a/test/cli/test_base.py b/test/cli/test_base.py
index 5100c0b0c..8d6c25391 100644
--- a/test/cli/test_base.py
+++ b/test/cli/test_base.py
@@ -360,9 +360,8 @@ def test_add_discrepancy_check_pass_default_enables_mae_only():
     pass_config = passes["discrepancy_check"]
     assert pass_config["type"] == "OnnxDiscrepancyCheck"
     assert pass_config["reference_model_path"] == str(Path("ref_model").resolve())
-    # default: mae only -> threshold enforced, timing disabled
-    assert pass_config["max_mae"] == 0.1
-    assert pass_config["timing_iterations"] == 0
+    # default: mae only -> test_metrics stores the human-readable selection
+    assert pass_config["test_metrics"] == ["mae"]
 
 
 def test_add_discrepancy_check_pass_speedup_only_disables_mae():
@@ -375,8 +374,7 @@ def test_add_discrepancy_check_pass_speedup_only_disables_mae():
     assert passes[first_key]["type"] == "SaveTestModelConfig"
 
     pass_config = passes["discrepancy_check"]
-    assert "max_mae" not in pass_config
-    assert "timing_iterations" not in pass_config
+    assert pass_config["test_metrics"] == ["speedup"]
 
 
 def test_add_discrepancy_check_pass_mae_only_disables_speedup():
@@ -389,8 +387,7 @@ def test_add_discrepancy_check_pass_mae_only_disables_speedup():
     assert passes[first_key]["type"] == "SaveTestModelConfig"
 
     pass_config = passes["discrepancy_check"]
-    assert pass_config["max_mae"] == 0.1
-    assert pass_config["timing_iterations"] == 0
+    assert pass_config["test_metrics"] == ["mae"]
 
 
 def test_warn_unused_test_metrics_logs_when_test_disabled():
@@ -455,14 +452,14 @@ def test_add_discrepancy_check_pass_updates_existing_pass():
     from olive.cli.base import add_discrepancy_check_pass
 
     # Simulate a config generated by `olive optimize --dry_run --test` - the pass already exists
-    # with stale settings (old output dir, timing disabled because no --test_metrics was given).
+    # with stale settings (old output dir, only mae was requested at generate-time).
     config = _discrepancy_run_config()
     config["passes"] = {
         "discrepancy_check": {
             "type": "OnnxDiscrepancyCheck",
             "reference_model_path": "/old/abs/path",
             "report_output_dir": "/old/out_dir",
-            "timing_iterations": 0,  # speedup was not requested at generate-time
+            "test_metrics": ["mae"],
         }
     }
     config["input_model"]["test_model_path"] = "new_ref_model"
@@ -479,14 +476,12 @@ def test_add_discrepancy_check_pass_updates_existing_pass():
     # Reference model path and output dir must be updated to the current values.
     assert pass_config["reference_model_path"] == str(Path("new_ref_model").resolve())
     assert pass_config["report_output_dir"] == "new_out_dir"
-    # With metrics=["speedup"], timing_iterations should NOT be forced to 0.
-    assert "timing_iterations" not in pass_config
-    # With metrics including "mae", max_mae threshold should be set.
-    assert pass_config["max_mae"] == 0.1
+    # test_metrics must reflect the newly requested metrics.
+    assert pass_config["test_metrics"] == ["mae", "speedup"]
 
 
 def test_add_discrepancy_check_pass_updates_existing_pass_speedup_only():
-    """Updating an existing pass with speedup-only metrics removes mae threshold."""
+    """Updating an existing pass with speedup-only metrics updates test_metrics."""
     from olive.cli.base import add_discrepancy_check_pass
 
     config = _discrepancy_run_config()
@@ -494,8 +489,7 @@ def test_add_discrepancy_check_pass_updates_existing_pass_speedup_only():
         "dc": {
             "type": "onnxdiscrepancycheck",  # case-insensitive type match
             "reference_model_path": "/old/path",
-            "max_mae": 0.1,
-            "timing_iterations": 0,
+            "test_metrics": ["mae"],
         }
     }
 
@@ -507,8 +501,7 @@ def test_add_discrepancy_check_pass_updates_existing_pass_speedup_only():
     assert passes[first_key]["type"] == "SaveTestModelConfig"
 
     pass_config = passes["dc"]
-    assert "max_mae" not in pass_config
-    assert "timing_iterations" not in pass_config
+    assert pass_config["test_metrics"] == ["speedup"]
 
     from olive.cli.base import _parse_test_metrics
 

From b97c17e2b10bac70caa6fa25e32f787c0292e834 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 1 Jul 2026 07:53:59 +0000
Subject: [PATCH 42/80] Fix crash when formatting MAE threshold with
 test_metrics

---
 olive/passes/onnx/discrepancy_check.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 946d18528..7b65d333c 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -558,7 +558,7 @@ def _run_for_config(
         # Check thresholds
         failures = []
         if effective_max_mae is not None and max_abs_error > effective_max_mae:
-            failures.append(f"Max absolute error {max_abs_error:.6f} exceeds threshold {config.max_mae:.6f}")
+            failures.append(f"Max absolute error {max_abs_error:.6f} exceeds threshold {effective_max_mae:.6f}")
         if config.max_elements_above_0_1 is not None and count_above_0_1 > config.max_elements_above_0_1:
             failures.append(
                 f"Elements with diff > 0.1: {count_above_0_1} exceeds threshold {config.max_elements_above_0_1}"

From da0d77d050eb32f65d58345479a6ed5ce4cbef76 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 1 Jul 2026 08:05:27 +0000
Subject: [PATCH 43/80] Add attn_impl parameter to OnnxDiscrepancyCheck for
 configurable attention implementation

---
 olive/passes/onnx/discrepancy_check.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 7b65d333c..8d9542e2f 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -310,6 +310,16 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
                     "below this threshold, the pass fails."
                 ),
             ),
+            "attn_impl": PassConfigParam(
+                type_=Optional[str],
+                default_value=None,
+                description=(
+                    "Attention implementation to use when loading the reference HuggingFace model via "
+                    "``AutoModelForCausalLM.from_pretrained``. Passed as ``attn_implementation`` to "
+                    "the model loader. Common values are ``'eager'``, ``'sdpa'``, and ``'flash_attention_2'``. "
+                    "When ``None`` (the default), the model's own default is used."
+                ),
+            ),
             "llama_cpp": PassConfigParam(
                 type_=bool,
                 default_value=False,
@@ -411,7 +421,9 @@ def _run_for_config(
                 f"Got architectures={architectures}"
             )
 
-        ref_model = AutoModelForCausalLM.from_pretrained(ref_path, config=ref_cfg)
+        ref_model = AutoModelForCausalLM.from_pretrained(
+            ref_path, config=ref_cfg, **({} if config.attn_impl is None else {"attn_implementation": config.attn_impl})
+        )
         ref_model.eval()
 
         # Determine the floating-point dtype used by the ONNX model weights and

From cc15cbfbb6498232715cd0c18336a2f8a5f6b803 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 1 Jul 2026 08:53:00 +0000
Subject: [PATCH 44/80] Rename Olive CLI config from config.json to
 olive_config.json to prevent collision with HF model config

---
 docs/source/how-to/cli/cli-fast-test.md | 10 +++++-----
 docs/source/how-to/cli/cli-optimize.md  |  2 +-
 olive/cli/base.py                       |  2 +-
 olive/cli/init/wizard.py                |  2 +-
 test/cli/test_cli.py                    |  2 +-
 test/cli/test_cli_test_model_smoke.py   |  6 +++---
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/docs/source/how-to/cli/cli-fast-test.md b/docs/source/how-to/cli/cli-fast-test.md
index 588ada470..efefedac3 100644
--- a/docs/source/how-to/cli/cli-fast-test.md
+++ b/docs/source/how-to/cli/cli-fast-test.md
@@ -33,7 +33,7 @@ olive optimize \
     --test out/qwen-test-model
 ```
 
-This creates `out/qwen/config.json` without launching the full conversion yet.
+This creates `out/qwen/olive_config.json` without launching the full conversion yet.
 It also inserts an `OnnxDiscrepancyCheck` pass (if one is not already present) that will compare the generated ONNX model against the 2-layer reference model.
 
 ## Step 2: run a fast smoke test with `olive run --test`
@@ -42,7 +42,7 @@ Use the generated config with `olive run` and pass `--test` so Olive swaps in th
 
 ```bash
 olive run \
-    --config out/qwen/config.json \
+    --config out/qwen/olive_config.json \
     --test out/qwen-test-model \
     --output_path out/qwen-test-run
 ```
@@ -78,7 +78,7 @@ For example, to run both accuracy and latency checks:
 
 ```bash
 olive run \
-    --config out/qwen/config.json \
+    --config out/qwen/olive_config.json \
     --test out/qwen-test-model \
     --test_metrics mae,speedup \
     --output_path out/qwen-test-run
@@ -92,7 +92,7 @@ If you have a `llama_env` virtual environment with `llama-cpp-python` installed,
 
 ```bash
 olive run \
-    --config out/qwen/config.json \
+    --config out/qwen/olive_config.json \
     --test out/qwen-test-model \
     --test_metrics mae,speedup \
     --test_llama_path ./llama_env \
@@ -127,7 +127,7 @@ Once the smoke test succeeds, rerun the conversion on the full Qwen checkpoint b
 
 ```bash
 olive run \
-    --config out/qwen/config.json \
+    --config out/qwen/olive_config.json \
     --output_path out/qwen-full
 ```
 
diff --git a/docs/source/how-to/cli/cli-optimize.md b/docs/source/how-to/cli/cli-optimize.md
index 12a3a2e54..bff2268c9 100644
--- a/docs/source/how-to/cli/cli-optimize.md
+++ b/docs/source/how-to/cli/cli-optimize.md
@@ -30,7 +30,7 @@ This command will quantize weights into int4 precision before converting the mod
 
 ## Customizing model optimization process
 
-`olive optimize` primarily requests desired model precision and intended ExecutionProvider that will be used to run the optimized model. Based on these information, `olive optimize` command will generate model optimiation recipe as per user request and execute the recipe to produce to output model. Advanced users can use `--dry_run` option to save the `config.json` file on the disk. See comprehensive list of [options](../../reference/options.html) you can use to customize the model optimization process further by modifying the `config.json` file produced by the `olive optimize --dry_run ...` command.
+`olive optimize` primarily requests desired model precision and intended ExecutionProvider that will be used to run the optimized model. Based on these information, `olive optimize` command will generate model optimiation recipe as per user request and execute the recipe to produce to output model. Advanced users can use `--dry_run` option to save the `olive_config.json` file on the disk. See comprehensive list of [options](../../reference/options.html) you can use to customize the model optimization process further by modifying the `olive_config.json` file produced by the `olive optimize --dry_run ...` command.
 
 ## Additional details
 
diff --git a/olive/cli/base.py b/olive/cli/base.py
index 873a6c738..5cd5a10d9 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -310,7 +310,7 @@ def _parse_extra_options(kv_items):
     @staticmethod
     def _save_config_file(config: dict):
         """Save the config file."""
-        config_file_path = Path(config["output_dir"]) / "config.json"
+        config_file_path = Path(config["output_dir"]) / "olive_config.json"
         with open(config_file_path, "w") as f:
             json.dump(config, f, indent=4)
         print(f"Config file saved at {config_file_path}")
diff --git a/olive/cli/init/wizard.py b/olive/cli/init/wizard.py
index 74e31efb4..63db15dec 100644
--- a/olive/cli/init/wizard.py
+++ b/olive/cli/init/wizard.py
@@ -222,7 +222,7 @@ def _prompt_output(self, result):
             config_cmd = command_str + " --save_config_file --dry_run"
             print("\nGenerating configuration file...\n")
             subprocess.run(config_cmd, shell=True, check=False)
-            config_path = Path(output_dir) / "config.json"
+            config_path = Path(output_dir) / "olive_config.json"
             if config_path.exists():
                 print(f"\nYou can run it later with:\n  olive run --config {config_path}\n")
 
diff --git a/test/cli/test_cli.py b/test/cli/test_cli.py
index 5e666b94f..8cb08f214 100644
--- a/test/cli/test_cli.py
+++ b/test/cli/test_cli.py
@@ -316,7 +316,7 @@ def test_optimize_command_test_model_config(_, tmp_path):
 
     cli_main(command_args)
 
-    config = json.loads((output_dir / "config.json").read_text())
+    config = json.loads((output_dir / "olive_config.json").read_text())
     assert config["input_model"]["test_model_config"] == {"hidden_layers": 2}
     assert config["input_model"]["test_model_path"] == str(test_model_dir)
     assert json.loads((output_dir / TEST_OUTPUT_MARKER_FILE).read_text())["type"] == "olive_hf_test_output"
diff --git a/test/cli/test_cli_test_model_smoke.py b/test/cli/test_cli_test_model_smoke.py
index 90c10ddff..fb63ef853 100644
--- a/test/cli/test_cli_test_model_smoke.py
+++ b/test/cli/test_cli_test_model_smoke.py
@@ -122,10 +122,10 @@ def _run_documented_test_model_smoke_flow(tmp_path: Path, model_id: str):
         ]
     )
 
-    config_path = config_output_dir / "config.json"
+    config_path = config_output_dir / "olive_config.json"
     assert config_path.exists()
     _set_offline_gptq_data_config(config_path)
-    # run --config dump/config.json --test dump/test --output_path dump/run
+    # run --config dump/olive_config.json --test dump/test --output_path dump/run
     _run_cli_main(
         [
             "run",
@@ -249,7 +249,7 @@ def _assert_discrepancy_model_builder(self, tmp_path: Path, model_id: str):
             ]
         )
 
-        config_path = config_output_dir / "config.json"
+        config_path = config_output_dir / "olive_config.json"
         assert config_path.exists()
         _set_offline_gptq_data_config(config_path)
 

From 5e1512f44789e43abfaac6d53e36a578c02a9541 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 1 Jul 2026 08:59:53 +0000
Subject: [PATCH 45/80] Add logging to OnnxDiscrepancyCheck and switch
 attn_implementation default to sdpa

---
 olive/cli/base.py                          |  2 +-
 olive/passes/onnx/discrepancy_check.py     | 18 +++++++++++++-----
 test/passes/onnx/test_discrepancy_check.py | 12 +++++++++---
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/olive/cli/base.py b/olive/cli/base.py
index 5cd5a10d9..5865a7a45 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -351,7 +351,7 @@ def _get_hf_input_model(args: Namespace, model_path: OLIVE_RESOURCE_ANNOTATIONS)
         "type": "HfModel",
         "model_path": model_path,
         "load_kwargs": {
-            "attn_implementation": "eager",
+            "attn_implementation": "sdpa",
         },
     }
     # use getattr to avoid AttributeError in case hf model or adapter_path is not supported
diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 8d9542e2f..4441f7161 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -312,12 +312,12 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
             ),
             "attn_impl": PassConfigParam(
                 type_=Optional[str],
-                default_value=None,
+                default_value="sdpa",
                 description=(
                     "Attention implementation to use when loading the reference HuggingFace model via "
                     "``AutoModelForCausalLM.from_pretrained``. Passed as ``attn_implementation`` to "
                     "the model loader. Common values are ``'eager'``, ``'sdpa'``, and ``'flash_attention_2'``. "
-                    "When ``None`` (the default), the model's own default is used."
+                    "Defaults to ``'sdpa'``."
                 ),
             ),
             "llama_cpp": PassConfigParam(
@@ -421,9 +421,7 @@ def _run_for_config(
                 f"Got architectures={architectures}"
             )
 
-        ref_model = AutoModelForCausalLM.from_pretrained(
-            ref_path, config=ref_cfg, **({} if config.attn_impl is None else {"attn_implementation": config.attn_impl})
-        )
+        ref_model = AutoModelForCausalLM.from_pretrained(ref_path, config=ref_cfg, attn_implementation=config.attn_impl)
         ref_model.eval()
 
         # Determine the floating-point dtype used by the ONNX model weights and
@@ -477,6 +475,7 @@ def _run_for_config(
         config_save_path = Path(report_dir) / "reference_model_config.json"
         config_save_path.parent.mkdir(parents=True, exist_ok=True)
         config_save_path.write_text(ref_cfg.to_json_string())
+        logger.info("Saved reference model config to %s", config_save_path)
 
         session = model.prepare_session(
             device=device,
@@ -561,6 +560,12 @@ def _run_for_config(
                 results["pytorch_latency_s"] = pytorch_time
                 results["onnx_latency_s"] = onnx_time
                 results["speedup"] = speedup
+                logger.info(
+                    "OnnxDiscrepancyCheck speedup: pytorch_latency_s=%.4f, onnx_latency_s=%.4f, speedup=%.2f",
+                    pytorch_time,
+                    onnx_time,
+                    speedup,
+                )
         else:
             logger.info(
                 "OnnxDiscrepancyCheck speedup measurement skipped because timing_iterations=%d.",
@@ -619,6 +624,7 @@ def _run_for_config(
         report_path = Path(report_dir) / "discrepancy_check_results.json"
         report_path.parent.mkdir(parents=True, exist_ok=True)
         report_path.write_text(json.dumps(results, indent=2))
+        logger.info("Saved discrepancy check results to %s", report_path)
 
         # Store results in model attributes so the CLI can persist them in the output directory
         model_attributes = dict(model.model_attributes) if model.model_attributes else {}
@@ -923,6 +929,7 @@ def compare_llama_cpp(
         # Save model and tokenizer in standard HuggingFace format.
         ref_model.save_pretrained(model_dir, safe_serialization=True)
         tokenizer.save_pretrained(model_dir)
+        logger.info("Saved reference HuggingFace model and tokenizer to %s", model_dir)
 
         # Step 1: Convert to GGUF using the official convert_hf_to_gguf.py CLI.
         subprocess.run(
@@ -931,6 +938,7 @@ def compare_llama_cpp(
             text=True,
             check=True,
         )
+        logger.info("Converted HuggingFace model to GGUF at %s", gguf_path)
 
         # Step 2: Run inference inside llama_env using the pre-converted GGUF file.
         (output_dir_path / "llama_cpp_helper.py").write_text(_LLAMA_CPP_HELPER_SCRIPT)
diff --git a/test/passes/onnx/test_discrepancy_check.py b/test/passes/onnx/test_discrepancy_check.py
index 721fc3f2d..e128682ea 100644
--- a/test/passes/onnx/test_discrepancy_check.py
+++ b/test/passes/onnx/test_discrepancy_check.py
@@ -106,7 +106,9 @@ def get_next_tokens_side_effect():
             patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer),
         ):
             pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck)
-            result = pass_instance.compare_generation(config, mock_ref_model, ref_model_path=config.reference_model_path)
+            result = pass_instance.compare_generation(
+                config, mock_ref_model, ref_model_path=config.reference_model_path
+            )
 
         mock_generator.append_tokens.assert_called_once_with([[1, 2, 3]])
         # Common prefix: [1, 2, 3, 10, 11] = 5 tokens before divergence
@@ -172,7 +174,9 @@ def get_next_tokens_side_effect():
             patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer),
         ):
             pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck)
-            result = pass_instance.compare_generation(config, mock_ref_model, ref_model_path=config.reference_model_path)
+            result = pass_instance.compare_generation(
+                config, mock_ref_model, ref_model_path=config.reference_model_path
+            )
 
         mock_generator.append_tokens.assert_called_once_with([[10, 20]])
         # All 5 tokens match
@@ -227,7 +231,9 @@ def test_compare_generation_with_zero_max_new_tokens(self):
             patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer),
         ):
             pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck)
-            result = pass_instance.compare_generation(config, mock_ref_model, ref_model_path=config.reference_model_path)
+            result = pass_instance.compare_generation(
+                config, mock_ref_model, ref_model_path=config.reference_model_path
+            )
 
         assert mock_ref_model.generate.call_count == 1
         assert mock_ref_model.generate.call_args.kwargs["max_new_tokens"] == 0

From 6c512b961418158f67f86d4400de079978d35dcd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Wed, 1 Jul 2026 11:04:15 +0200
Subject: [PATCH 46/80] Add TODO: tested reference model is not the one converted to ONNX

---
 olive/passes/onnx/discrepancy_check.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 8d9542e2f..ce0336db9 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -421,6 +421,7 @@ def _run_for_config(
                 f"Got architectures={architectures}"
             )
 
+        # TODO: the model tested is not the one converted into onnx
         ref_model = AutoModelForCausalLM.from_pretrained(
             ref_path, config=ref_cfg, **({} if config.attn_impl is None else {"attn_implementation": config.attn_impl})
         )

From cf849f65e080a6a0638e5ac17a9d8d2e66a9c5eb Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 1 Jul 2026 09:18:51 +0000
Subject: [PATCH 47/80] Reduce test model dimensions to fix CI artifact size
 check

---
 test/cli/test_cli_test_model_smoke.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/cli/test_cli_test_model_smoke.py b/test/cli/test_cli_test_model_smoke.py
index fb63ef853..5daede4b5 100644
--- a/test/cli/test_cli_test_model_smoke.py
+++ b/test/cli/test_cli_test_model_smoke.py
@@ -44,8 +44,8 @@ def _save_local_tiny_llama(model_path: Path):
         LlamaConfig.from_dict(
             {
                 "vocab_size": 32,
-                "hidden_size": 128,
-                "intermediate_size": 256,
+                "hidden_size": 64,
+                "intermediate_size": 128,
                 "num_hidden_layers": 2,
                 "num_attention_heads": 8,
                 "num_key_value_heads": 8,

From c27d103325b70adb782ec08f1612b810a606a0b4 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 1 Jul 2026 10:05:21 +0000
Subject: [PATCH 48/80] Save CLI dry-run config as config.json instead of
 olive_config.json

---
 docs/source/how-to/cli/cli-fast-test.md | 10 +++++-----
 docs/source/how-to/cli/cli-optimize.md  |  2 +-
 olive/cli/base.py                       |  2 +-
 olive/cli/init/wizard.py                |  2 +-
 test/cli/test_cli.py                    |  2 +-
 test/cli/test_cli_test_model_smoke.py   |  6 +++---
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/docs/source/how-to/cli/cli-fast-test.md b/docs/source/how-to/cli/cli-fast-test.md
index efefedac3..588ada470 100644
--- a/docs/source/how-to/cli/cli-fast-test.md
+++ b/docs/source/how-to/cli/cli-fast-test.md
@@ -33,7 +33,7 @@ olive optimize \
     --test out/qwen-test-model
 ```
 
-This creates `out/qwen/olive_config.json` without launching the full conversion yet.
+This creates `out/qwen/config.json` without launching the full conversion yet.
 It also inserts an `OnnxDiscrepancyCheck` pass (if one is not already present) that will compare the generated ONNX model against the 2-layer reference model.
 
 ## Step 2: run a fast smoke test with `olive run --test`
@@ -42,7 +42,7 @@ Use the generated config with `olive run` and pass `--test` so Olive swaps in th
 
 ```bash
 olive run \
-    --config out/qwen/olive_config.json \
+    --config out/qwen/config.json \
     --test out/qwen-test-model \
     --output_path out/qwen-test-run
 ```
@@ -78,7 +78,7 @@ For example, to run both accuracy and latency checks:
 
 ```bash
 olive run \
-    --config out/qwen/olive_config.json \
+    --config out/qwen/config.json \
     --test out/qwen-test-model \
     --test_metrics mae,speedup \
     --output_path out/qwen-test-run
@@ -92,7 +92,7 @@ If you have a `llama_env` virtual environment with `llama-cpp-python` installed,
 
 ```bash
 olive run \
-    --config out/qwen/olive_config.json \
+    --config out/qwen/config.json \
     --test out/qwen-test-model \
     --test_metrics mae,speedup \
     --test_llama_path ./llama_env \
@@ -127,7 +127,7 @@ Once the smoke test succeeds, rerun the conversion on the full Qwen checkpoint b
 
 ```bash
 olive run \
-    --config out/qwen/olive_config.json \
+    --config out/qwen/config.json \
     --output_path out/qwen-full
 ```
 
diff --git a/docs/source/how-to/cli/cli-optimize.md b/docs/source/how-to/cli/cli-optimize.md
index bff2268c9..12a3a2e54 100644
--- a/docs/source/how-to/cli/cli-optimize.md
+++ b/docs/source/how-to/cli/cli-optimize.md
@@ -30,7 +30,7 @@ This command will quantize weights into int4 precision before converting the mod
 
 ## Customizing model optimization process
 
-`olive optimize` primarily requests desired model precision and intended ExecutionProvider that will be used to run the optimized model. Based on these information, `olive optimize` command will generate model optimiation recipe as per user request and execute the recipe to produce to output model. Advanced users can use `--dry_run` option to save the `olive_config.json` file on the disk. See comprehensive list of [options](../../reference/options.html) you can use to customize the model optimization process further by modifying the `olive_config.json` file produced by the `olive optimize --dry_run ...` command.
+`olive optimize` primarily requests desired model precision and intended ExecutionProvider that will be used to run the optimized model. Based on these information, `olive optimize` command will generate model optimiation recipe as per user request and execute the recipe to produce to output model. Advanced users can use `--dry_run` option to save the `config.json` file on the disk. See comprehensive list of [options](../../reference/options.html) you can use to customize the model optimization process further by modifying the `config.json` file produced by the `olive optimize --dry_run ...` command.
 
 ## Additional details
 
diff --git a/olive/cli/base.py b/olive/cli/base.py
index 5865a7a45..0642e186d 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -310,7 +310,7 @@ def _parse_extra_options(kv_items):
     @staticmethod
     def _save_config_file(config: dict):
         """Save the config file."""
-        config_file_path = Path(config["output_dir"]) / "olive_config.json"
+        config_file_path = Path(config["output_dir"]) / "config.json"
         with open(config_file_path, "w") as f:
             json.dump(config, f, indent=4)
         print(f"Config file saved at {config_file_path}")
diff --git a/olive/cli/init/wizard.py b/olive/cli/init/wizard.py
index 63db15dec..74e31efb4 100644
--- a/olive/cli/init/wizard.py
+++ b/olive/cli/init/wizard.py
@@ -222,7 +222,7 @@ def _prompt_output(self, result):
             config_cmd = command_str + " --save_config_file --dry_run"
             print("\nGenerating configuration file...\n")
             subprocess.run(config_cmd, shell=True, check=False)
-            config_path = Path(output_dir) / "olive_config.json"
+            config_path = Path(output_dir) / "config.json"
             if config_path.exists():
                 print(f"\nYou can run it later with:\n  olive run --config {config_path}\n")
 
diff --git a/test/cli/test_cli.py b/test/cli/test_cli.py
index 8cb08f214..5e666b94f 100644
--- a/test/cli/test_cli.py
+++ b/test/cli/test_cli.py
@@ -316,7 +316,7 @@ def test_optimize_command_test_model_config(_, tmp_path):
 
     cli_main(command_args)
 
-    config = json.loads((output_dir / "olive_config.json").read_text())
+    config = json.loads((output_dir / "config.json").read_text())
     assert config["input_model"]["test_model_config"] == {"hidden_layers": 2}
     assert config["input_model"]["test_model_path"] == str(test_model_dir)
     assert json.loads((output_dir / TEST_OUTPUT_MARKER_FILE).read_text())["type"] == "olive_hf_test_output"
diff --git a/test/cli/test_cli_test_model_smoke.py b/test/cli/test_cli_test_model_smoke.py
index 5daede4b5..dc9a2e305 100644
--- a/test/cli/test_cli_test_model_smoke.py
+++ b/test/cli/test_cli_test_model_smoke.py
@@ -122,10 +122,10 @@ def _run_documented_test_model_smoke_flow(tmp_path: Path, model_id: str):
         ]
     )
 
-    config_path = config_output_dir / "olive_config.json"
+    config_path = config_output_dir / "config.json"
     assert config_path.exists()
     _set_offline_gptq_data_config(config_path)
-    # run --config dump/olive_config.json --test dump/test --output_path dump/run
+    # run --config dump/config.json --test dump/test --output_path dump/run
     _run_cli_main(
         [
             "run",
@@ -249,7 +249,7 @@ def _assert_discrepancy_model_builder(self, tmp_path: Path, model_id: str):
             ]
         )
 
-        config_path = config_output_dir / "olive_config.json"
+        config_path = config_output_dir / "config.json"
         assert config_path.exists()
         _set_offline_gptq_data_config(config_path)
 

From 1e8a0d17a70182bad41c6a9e324ca3cfef94d910 Mon Sep 17 00:00:00 2001
From: Copilot <198982749+Copilot@users.noreply.github.com>
Date: Wed, 1 Jul 2026 12:40:13 +0200
Subject: [PATCH 49/80] Merge 3 existing PRs for OnnxDiscrepancyCheck +
 llama.cpp integration (with dedicated GGUF conversion pass) (#2548)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Describe your changes

Merges #2536, #2535, #2534.

Additionally adds llama.cpp integration and other improvements to
`OnnxDiscrepancyCheck` and test-mode workflow handling:

- **New `llama_cpp` flag** (`bool`, default `False`) on
`OnnxDiscrepancyCheck` — when enabled, compares inference with
llama.cpp.
- **New `llama_cpp_env_path` parameter** (`Optional[str]`) — path to the
`llama_env` virtual environment where `llama-cpp-python` and
`convert_hf_to_gguf.py` are installed (defaults to `"llama_env"`
relative to cwd).
- **New `--test_llama_path` CLI option** — specifies the path to the
`llama_env` virtual environment when running with `--test`. Using
`--test_llama_path` without `--test` emits a warning.
- **New `ConvertHfToGGUF` pass**
(`olive/passes/pytorch/convert_hf_to_gguf.py`) — injected when
`--test_llama_path` is provided. This pass converts the test HF model to
GGUF ahead of discrepancy checking and stores the GGUF path in model
attributes for downstream reuse.
- **`compare_llama_cpp()` updates** — now reuses a preconverted GGUF
when available; otherwise it falls back to in-method HF→GGUF conversion.
llama.cpp comparison failures are captured in discrepancy results
(status/failures) instead of aborting the whole run, so ONNX generation
can still complete.
- **Improved `--test_metrics` parsing** — now accepts both
space-separated (`--test_metrics mae speedup`) and comma-separated
(`--test_metrics mae,speedup`) forms.
- **Fixed `add_discrepancy_check_pass` update-in-place** — existing
discrepancy-pass config generated by dry-run is updated in-place so
current `--test_metrics`, `--output_path`, and llama settings are
applied.
- **Fixed test model persistence across engine cache hits** —
`ModelBuilder` stores a reference HF copy (`reference_hf_model/`)
alongside cached ONNX outputs; discrepancy check falls back to this copy
if the original test model path is missing.
- **New `SaveTestModelConfig` pass**
(`olive/passes/pytorch/save_test_model_config.py`) — injected at the
start of passes for `--test`; ensures the test model config and marker
(and the persistence path used for the random test model) are set up
before downstream passes.
- **CI workflow** (`test-model-fast.yml`) — includes setup of a llama
environment and llama.cpp conversion script dependencies.
- **Updated documentation** (`cli-fast-test.md`) — clarifies where layer
reduction happens, when test-model directories are created, cache
fallback behavior, and llama.cpp test flow including the dedicated GGUF
conversion pass.

## Checklist before requesting a review
- [ ] Add unit tests for this change.
- [ ] Make sure all tests can pass.
- [ ] Update documents if necessary.
- [ ] Lint and apply fixes to your code by running `lintrunner -a`
- [ ] Is this a user-facing change? If yes, give a description of this
change to be included in the release notes.

## (Optional) Issue link

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
---
 docs/source/how-to/cli/cli-fast-test.md       |  7 +-
 olive/cli/base.py                             | 44 +++++++++-
 olive/olive_config.json                       |  8 ++
 olive/passes/onnx/discrepancy_check.py        | 60 ++++++++-----
 olive/passes/pytorch/convert_hf_to_gguf.py    | 86 +++++++++++++++++++
 test/cli/test_base.py                         |  8 +-
 test/passes/onnx/test_discrepancy_check.py    | 51 +++++++++++
 .../passes/pytorch/test_convert_hf_to_gguf.py | 66 ++++++++++++++
 8 files changed, 299 insertions(+), 31 deletions(-)
 create mode 100644 olive/passes/pytorch/convert_hf_to_gguf.py
 create mode 100644 test/passes/pytorch/test_convert_hf_to_gguf.py

diff --git a/docs/source/how-to/cli/cli-fast-test.md b/docs/source/how-to/cli/cli-fast-test.md
index 588ada470..0779e065f 100644
--- a/docs/source/how-to/cli/cli-fast-test.md
+++ b/docs/source/how-to/cli/cli-fast-test.md
@@ -99,11 +99,10 @@ olive run \
     --output_path out/qwen-test-run
 ```
 
-`--test_llama_path` points to the virtual environment that contains `llama-cpp-python` and `convert_hf_to_gguf.py` (from the llama.cpp repository). When provided, Olive:
+`--test_llama_path` points to the virtual environment that contains `llama-cpp-python` and `convert_hf_to_gguf.py` (from the llama.cpp repository). When provided, Olive injects a `ConvertHfToGGUF` pass before model conversion and then:
 
-1. Saves the reference HuggingFace model to `<output_path>/hf_model` using `save_pretrained`.
-2. Calls `convert_hf_to_gguf.py` inside the virtual environment to produce a GGUF F32 file at `<output_path>/model.gguf`.
-3. Runs inference with `llama_cpp.Llama` inside the virtual environment and reports first-token latency and speedup metrics alongside the regular MAE and ONNX speedup results.
+1. Converts the reduced test HuggingFace model to GGUF (`<test_model_path>/model.gguf`) via `convert_hf_to_gguf.py`.
+2. Reuses that GGUF file in `OnnxDiscrepancyCheck` for llama.cpp inference and reports first-token latency and speedup metrics alongside the regular MAE and ONNX speedup results.
 
 All `llama-cpp-python` imports are strictly isolated to the subprocess — the main Olive process never imports them.
 
diff --git a/olive/cli/base.py b/olive/cli/base.py
index 0642e186d..7a3f2db77 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -112,7 +112,7 @@ def warn_unused_test_metrics(test, metrics: Optional[list], llama_path: Optional
 def add_discrepancy_check_pass(
     run_config: dict, metrics: Optional[list] = None, llama_env_path: Optional[str] = None
 ) -> dict:
-    """Inject or update a SaveTestModelConfig and an OnnxDiscrepancyCheck pass when --test is active.
+    """Inject or update test-related passes when --test is active.
 
     ``metrics`` selects which test metrics to evaluate. Supported values are defined in
     ``TEST_METRICS`` (``"mae"`` for the max-absolute-error accuracy check and ``"speedup"`` for the
@@ -123,13 +123,16 @@ def add_discrepancy_check_pass(
     When provided, the ``llama_cpp`` flag is enabled on the pass and the path is forwarded as
     ``llama_cpp_env_path``.
 
-    Two passes are managed:
+    Managed passes:
 
     * ``SaveTestModelConfig`` — inserted at the *beginning* of the passes dict so that the
       test-model directory (containing only ``config.json`` and the marker file) is created
       before any other pass runs.  This ensures subsequent passes can find the directory even
       on the first ``olive run`` after ``olive optimize --dry_run --test``.
 
+    * ``ConvertHfToGGUF`` — inserted after ``SaveTestModelConfig`` when ``llama_env_path`` is
+      provided, and converts the test HuggingFace directory to GGUF in advance.
+
     * ``OnnxDiscrepancyCheck`` — appended at the end to compare the ONNX model against the
       reference HuggingFace model.  If an instance is already present in the config (e.g.
       from a previous ``--dry_run --test`` invocation), its dynamic runtime fields
@@ -162,6 +165,41 @@ def add_discrepancy_check_pass(
         passes = new_passes
         run_config["passes"] = passes
 
+    # --- ConvertHfToGGUF pass (optional, only with --test_llama_path) ---
+    if llama_env_path:
+        has_gguf_pass = any(
+            isinstance(cfg, dict) and cfg.get("type", "").lower() == "converthftogguf" for cfg in passes.values()
+        )
+        if not has_gguf_pass:
+            new_passes = {}
+            inserted = False
+            for name, cfg in passes.items():
+                new_passes[name] = cfg
+                if not inserted and isinstance(cfg, dict) and cfg.get("type", "").lower() == "savetestmodelconfig":
+                    new_passes["convert_hf_to_gguf"] = {
+                        "type": "ConvertHfToGGUF",
+                        "llama_cpp_env_path": llama_env_path,
+                        "reference_model_path": reference_model_path,
+                    }
+                    inserted = True
+            if not inserted:
+                new_passes = {
+                    "convert_hf_to_gguf": {
+                        "type": "ConvertHfToGGUF",
+                        "llama_cpp_env_path": llama_env_path,
+                        "reference_model_path": reference_model_path,
+                    },
+                    **new_passes,
+                }
+            passes = new_passes
+            run_config["passes"] = passes
+        else:
+            for pass_cfg in passes.values():
+                if isinstance(pass_cfg, dict) and pass_cfg.get("type", "").lower() == "converthftogguf":
+                    pass_cfg["llama_cpp_env_path"] = llama_env_path
+                    pass_cfg["reference_model_path"] = reference_model_path
+                    break
+
     # Determine output directory for discrepancy results
     report_dir = run_config.get("output_dir") or run_config.get("engine", {}).get("output_dir")
     if report_dir and Path(report_dir).suffix and not Path(report_dir).is_dir():
@@ -305,7 +343,7 @@ def _parse_extra_options(kv_items):
 
         from onnxruntime_genai.models.builder import parse_extra_options
 
-        return parse_extra_options(kv_items)
+        return parse_extra_options(kv_items)  # pylint: disable=no-value-for-parameter
 
     @staticmethod
     def _save_config_file(config: dict):
diff --git a/olive/olive_config.json b/olive/olive_config.json
index d3b21ae89..ecd8c59c8 100644
--- a/olive/olive_config.json
+++ b/olive/olive_config.json
@@ -650,6 +650,14 @@
             "supported_algorithms": [  ],
             "supported_quantization_encodings": [  ]
         },
+        "ConvertHfToGGUF": {
+            "module_path": "olive.passes.pytorch.convert_hf_to_gguf.ConvertHfToGGUF",
+            "supported_providers": [ "*" ],
+            "supported_accelerators": [ "*" ],
+            "supported_precisions": [ "*" ],
+            "supported_algorithms": [  ],
+            "supported_quantization_encodings": [  ]
+        },
         "SelectiveMixedPrecision": {
             "module_path": "olive.passes.pytorch.selective_mixed_precision.SelectiveMixedPrecision",
             "supported_providers": [ "*" ],
diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 4441f7161..29e69a2c4 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -610,15 +610,24 @@ def _run_for_config(
 
         # llama.cpp comparison: convert reference model to GGUF and compare latencies
         if config.llama_cpp:
-            llama_results = self.compare_llama_cpp(
-                config,
-                ref_model,
-                output_dir=report_dir,
-                pytorch_latency_s=results.get("pytorch_latency_s"),
-                onnx_latency_s=results.get("onnx_latency_s"),
-                ref_model_path=ref_path,
-            )
-            results.update(llama_results)
+            preconverted_gguf_path = None
+            if model.model_attributes:
+                preconverted_gguf_path = model.model_attributes.get("reference_gguf_model_path")
+            try:
+                llama_results = self.compare_llama_cpp(
+                    config,
+                    ref_model,
+                    output_dir=report_dir,
+                    pytorch_latency_s=results.get("pytorch_latency_s"),
+                    onnx_latency_s=results.get("onnx_latency_s"),
+                    ref_model_path=ref_path,
+                    preconverted_gguf_path=preconverted_gguf_path,
+                )
+                results.update(llama_results)
+            except Exception as exc:
+                logger.exception("OnnxDiscrepancyCheck llama.cpp comparison failed.")
+                results["status"] = "failed"
+                results.setdefault("failures", []).append(f"llama.cpp comparison failed: {exc}")
 
         # Save results to disk
         report_path = Path(report_dir) / "discrepancy_check_results.json"
@@ -880,6 +889,7 @@ def compare_llama_cpp(
         onnx_latency_s: Optional[float] = None,
         *,
         ref_model_path: str,
+        preconverted_gguf_path: Optional[str] = None,
     ) -> dict:
         """Convert the reference model to GGUF and compare inference with llama.cpp.
 
@@ -904,7 +914,6 @@ def compare_llama_cpp(
         # Resolve the llama_env Python interpreter and conversion script
         env_path = config.llama_cpp_env_path or "llama_env"
         python_path = self._get_llama_env_python(env_path)
-        convert_script = self._get_convert_script(env_path)
 
         # Tokenize the generation prompt using the main-env tokenizer
         tokenizer = AutoTokenizer.from_pretrained(ref_model_path)
@@ -926,19 +935,24 @@ def compare_llama_cpp(
         gguf_path = str(output_dir_path / "model.gguf")
         script_path = str(output_dir_path / "llama_cpp_helper.py")
 
-        # Save model and tokenizer in standard HuggingFace format.
-        ref_model.save_pretrained(model_dir, safe_serialization=True)
-        tokenizer.save_pretrained(model_dir)
-        logger.info("Saved reference HuggingFace model and tokenizer to %s", model_dir)
-
-        # Step 1: Convert to GGUF using the official convert_hf_to_gguf.py CLI.
-        subprocess.run(
-            [python_path, convert_script, model_dir, "--outfile", gguf_path, "--outtype", "f32"],
-            capture_output=True,
-            text=True,
-            check=True,
-        )
-        logger.info("Converted HuggingFace model to GGUF at %s", gguf_path)
+        if preconverted_gguf_path and Path(preconverted_gguf_path).exists():
+            gguf_path = preconverted_gguf_path
+            logger.info("Using pre-converted GGUF from %s", gguf_path)
+        else:
+            convert_script = self._get_convert_script(env_path)
+            # Save model and tokenizer in standard HuggingFace format.
+            ref_model.save_pretrained(model_dir, safe_serialization=True)
+            tokenizer.save_pretrained(model_dir)
+            logger.info("Saved reference HuggingFace model and tokenizer to %s", model_dir)
+
+            # Step 1: Convert to GGUF using the official convert_hf_to_gguf.py CLI.
+            subprocess.run(
+                [python_path, convert_script, model_dir, "--outfile", gguf_path, "--outtype", "f32"],
+                capture_output=True,
+                text=True,
+                check=True,
+            )
+            logger.info("Converted HuggingFace model to GGUF at %s", gguf_path)
 
         # Step 2: Run inference inside llama_env using the pre-converted GGUF file.
         (output_dir_path / "llama_cpp_helper.py").write_text(_LLAMA_CPP_HELPER_SCRIPT)
diff --git a/olive/passes/pytorch/convert_hf_to_gguf.py b/olive/passes/pytorch/convert_hf_to_gguf.py
new file mode 100644
index 000000000..cc8765382
--- /dev/null
+++ b/olive/passes/pytorch/convert_hf_to_gguf.py
@@ -0,0 +1,86 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+import logging
+import subprocess
+import sys
+from pathlib import Path
+
+from olive.hardware.accelerator import AcceleratorSpec
+from olive.model import HfModelHandler
+from olive.passes import Pass
+from olive.passes.pass_config import BasePassConfig, PassConfigParam
+
+logger = logging.getLogger(__name__)
+
+
+class ConvertHfToGGUF(Pass):
+    """Convert the test HuggingFace model directory to a GGUF file."""
+
+    @classmethod
+    def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassConfigParam]:
+        return {
+            "llama_cpp_env_path": PassConfigParam(
+                type_=str,
+                default_value="llama_env",
+                description="Path to the llama.cpp virtual environment containing convert_hf_to_gguf.py.",
+            ),
+            "reference_model_path": PassConfigParam(
+                type_=str,
+                default_value=None,
+                description="Fallback model path to convert when test_model_path is not set.",
+            ),
+            "gguf_file_name": PassConfigParam(
+                type_=str,
+                default_value="model.gguf",
+                description="GGUF output filename.",
+            ),
+        }
+
+    @staticmethod
+    def _get_python_executable(env_path: Path) -> str:
+        if sys.platform.startswith("win"):
+            return str(env_path / "Scripts" / "python.exe")
+        return str(env_path / "bin" / "python")
+
+    def _run_for_config(
+        self, model: HfModelHandler, config: type[BasePassConfig], output_model_path: str
+    ) -> HfModelHandler:
+        source_path = Path(model.test_model_path or config.reference_model_path or "")
+        if not source_path.is_dir():
+            logger.info("ConvertHfToGGUF skipped: source model directory does not exist: %s", source_path)
+            return model
+
+        gguf_path = source_path / config.gguf_file_name
+        if gguf_path.exists():
+            logger.info("ConvertHfToGGUF skipped: GGUF already exists at %s", gguf_path)
+            model_attributes = dict(model.model_attributes) if model.model_attributes else {}
+            model_attributes["reference_gguf_model_path"] = str(gguf_path)
+            model.model_attributes = model_attributes
+            return model
+
+        env_path = Path(config.llama_cpp_env_path).resolve()
+        convert_script = env_path / "convert_hf_to_gguf.py"
+        conversion_pkg = env_path / "conversion"
+        python_path = self._get_python_executable(env_path)
+
+        if not Path(python_path).exists():
+            raise RuntimeError(f"Could not find llama_env python executable: {python_path}")
+        if not convert_script.exists():
+            raise RuntimeError(f"Could not find convert_hf_to_gguf.py at: {convert_script}")
+        if not conversion_pkg.exists():
+            raise RuntimeError(f"Could not find conversion package at: {conversion_pkg}")
+
+        subprocess.run(
+            [python_path, str(convert_script), str(source_path), "--outfile", str(gguf_path), "--outtype", "f32"],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        logger.info("Converted test model to GGUF at %s", gguf_path)
+
+        model_attributes = dict(model.model_attributes) if model.model_attributes else {}
+        model_attributes["reference_gguf_model_path"] = str(gguf_path)
+        model.model_attributes = model_attributes
+        return model
diff --git a/test/cli/test_base.py b/test/cli/test_base.py
index 8d6c25391..47a7619b3 100644
--- a/test/cli/test_base.py
+++ b/test/cli/test_base.py
@@ -427,6 +427,8 @@ def test_add_discrepancy_check_pass_llama_env_path_sets_config():
     passes = run_config["passes"]
     first_key = next(iter(passes))
     assert passes[first_key]["type"] == "SaveTestModelConfig"
+    assert passes["convert_hf_to_gguf"]["type"] == "ConvertHfToGGUF"
+    assert passes["convert_hf_to_gguf"]["llama_cpp_env_path"] == "/path/to/llama_env"
 
     pass_config = passes["discrepancy_check"]
     assert pass_config["llama_cpp"] is True
@@ -443,6 +445,7 @@ def test_add_discrepancy_check_pass_no_llama_env_path_omits_llama_config():
     assert passes[first_key]["type"] == "SaveTestModelConfig"
 
     pass_config = passes["discrepancy_check"]
+    assert "convert_hf_to_gguf" not in passes
     assert "llama_cpp" not in pass_config
     assert "llama_cpp_env_path" not in pass_config
 
@@ -465,12 +468,15 @@ def test_add_discrepancy_check_pass_updates_existing_pass():
     config["input_model"]["test_model_path"] = "new_ref_model"
     config["output_dir"] = "new_out_dir"
 
-    result = add_discrepancy_check_pass(config, metrics=["mae", "speedup"])
+    result = add_discrepancy_check_pass(config, metrics=["mae", "speedup"], llama_env_path="/path/to/llama_env")
 
     passes = result["passes"]
     # SaveTestModelConfig must be injected at the beginning
     first_key = next(iter(passes))
     assert passes[first_key]["type"] == "SaveTestModelConfig"
+    assert passes["convert_hf_to_gguf"]["type"] == "ConvertHfToGGUF"
+    assert passes["convert_hf_to_gguf"]["llama_cpp_env_path"] == "/path/to/llama_env"
+    assert passes["convert_hf_to_gguf"]["reference_model_path"] == str(Path("new_ref_model").resolve())
 
     pass_config = passes["discrepancy_check"]
     # Reference model path and output dir must be updated to the current values.
diff --git a/test/passes/onnx/test_discrepancy_check.py b/test/passes/onnx/test_discrepancy_check.py
index e128682ea..7286e3ab0 100644
--- a/test/passes/onnx/test_discrepancy_check.py
+++ b/test/passes/onnx/test_discrepancy_check.py
@@ -596,3 +596,54 @@ def test_compare_llama_cpp_no_latency_baselines(self, tmp_path):
         assert result["llama_cpp_speedup_vs_onnx"] is None
         assert result["llama_cpp_first_token_id"] == 7
         assert result["llama_cpp_first_token_matches_pytorch"] is True
+
+    def test_compare_llama_cpp_uses_preconverted_gguf(self, tmp_path):
+        import json
+
+        import torch
+
+        from olive.passes.onnx.discrepancy_check import OnnxDiscrepancyCheck
+
+        config = self._make_config()
+        gguf_path = tmp_path / "prebuilt.gguf"
+        gguf_path.write_text("ok")
+
+        mock_ref_model = MagicMock()
+        mock_ref_model.device = torch.device("cpu")
+        mock_ref_model.generate.return_value = torch.tensor([[1, 2, 3, 7]])
+
+        llama_output = {
+            "first_token_id": 7,
+            "generated_tokens": [7, 8],
+            "ttft": 0.10,
+            "ttfn": None,
+            "total_time": 0.20,
+        }
+
+        mock_proc = MagicMock()
+        mock_proc.stdout = json.dumps(llama_output)
+
+        encoded = MagicMock()
+        encoded.__getitem__ = MagicMock(side_effect=lambda k: torch.tensor([[1, 2, 3]]) if k == "input_ids" else None)
+        mock_tokenizer = MagicMock()
+        mock_tokenizer.return_value = encoded
+        mock_tokenizer.get_vocab = MagicMock(return_value={})
+
+        with (
+            patch.object(OnnxDiscrepancyCheck, "_get_llama_env_python", return_value="/mock/llama_env/bin/python"),
+            patch.object(OnnxDiscrepancyCheck, "_get_convert_script") as mock_convert_script,
+            patch("subprocess.run", return_value=mock_proc) as mock_subprocess_run,
+            patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer),
+        ):
+            pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck)
+            result = pass_instance.compare_llama_cpp(
+                config,
+                mock_ref_model,
+                output_dir=str(tmp_path),
+                ref_model_path=config.reference_model_path,
+                preconverted_gguf_path=str(gguf_path),
+            )
+
+        assert result["llama_cpp_first_token_id"] == 7
+        mock_convert_script.assert_not_called()
+        assert mock_subprocess_run.call_count == 1
diff --git a/test/passes/pytorch/test_convert_hf_to_gguf.py b/test/passes/pytorch/test_convert_hf_to_gguf.py
new file mode 100644
index 000000000..e8ed5d8eb
--- /dev/null
+++ b/test/passes/pytorch/test_convert_hf_to_gguf.py
@@ -0,0 +1,66 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+# pylint: disable=protected-access
+
+from pathlib import Path
+from types import SimpleNamespace
+from unittest.mock import patch
+
+from olive.passes.pytorch.convert_hf_to_gguf import ConvertHfToGGUF
+
+
+def test_convert_hf_to_gguf_skips_when_missing_source(tmp_path):
+    pass_instance = ConvertHfToGGUF.__new__(ConvertHfToGGUF)
+    model = SimpleNamespace(test_model_path=str(tmp_path / "missing"), model_attributes=None)
+    config = SimpleNamespace(
+        llama_cpp_env_path=str(tmp_path / "llama_env"),
+        reference_model_path=str(tmp_path / "missing"),
+        gguf_file_name="model.gguf",
+    )
+
+    result = pass_instance._run_for_config(model, config, str(tmp_path / "out"))
+    assert result is model
+
+
+def test_convert_hf_to_gguf_uses_existing_gguf(tmp_path):
+    source = tmp_path / "test_model"
+    source.mkdir(parents=True, exist_ok=True)
+    gguf_path = source / "model.gguf"
+    gguf_path.write_text("ok")
+
+    pass_instance = ConvertHfToGGUF.__new__(ConvertHfToGGUF)
+    model = SimpleNamespace(test_model_path=str(source), model_attributes={})
+    config = SimpleNamespace(
+        llama_cpp_env_path=str(tmp_path / "llama_env"),
+        reference_model_path=str(source),
+        gguf_file_name="model.gguf",
+    )
+
+    result = pass_instance._run_for_config(model, config, str(tmp_path / "out"))
+    assert result.model_attributes["reference_gguf_model_path"] == str(gguf_path)
+
+
+def test_convert_hf_to_gguf_runs_conversion(tmp_path):
+    source = tmp_path / "test_model"
+    source.mkdir(parents=True, exist_ok=True)
+    env = tmp_path / "llama_env"
+    (env / "bin").mkdir(parents=True, exist_ok=True)
+    (env / "bin" / "python").write_text("")
+    (env / "convert_hf_to_gguf.py").write_text("")
+    (env / "conversion").mkdir(parents=True, exist_ok=True)
+
+    pass_instance = ConvertHfToGGUF.__new__(ConvertHfToGGUF)
+    model = SimpleNamespace(test_model_path=str(source), model_attributes={})
+    config = SimpleNamespace(
+        llama_cpp_env_path=str(env),
+        reference_model_path=str(source),
+        gguf_file_name="model.gguf",
+    )
+
+    with patch("olive.passes.pytorch.convert_hf_to_gguf.subprocess.run") as mock_run:
+        result = pass_instance._run_for_config(model, config, str(tmp_path / "out"))
+
+    assert mock_run.call_count == 1
+    assert Path(result.model_attributes["reference_gguf_model_path"]).name == "model.gguf"

From 230cdecf4e23bb9f4ab720ee0b166e9e6a07e3d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Wed, 1 Jul 2026 12:49:48 +0200
Subject: [PATCH 50/80] refactor

---
 docs/source/how-to/cli/cli-fast-test.md | 113 +-----------------------
 olive/cli/base.py                       |   4 +-
 olive/cli/run.py                        |  13 ---
 3 files changed, 4 insertions(+), 126 deletions(-)

diff --git a/docs/source/how-to/cli/cli-fast-test.md b/docs/source/how-to/cli/cli-fast-test.md
index 0779e065f..8395decae 100644
--- a/docs/source/how-to/cli/cli-fast-test.md
+++ b/docs/source/how-to/cli/cli-fast-test.md
@@ -4,22 +4,8 @@ If you are converting a large language model, it is often useful to validate the
 
 The `--test` option does that for Hugging Face models. Olive keeps the same model architecture, reduces it to a random **2-layer** test model, saves it to the folder you provide, and reuses that folder on later runs.
 
-> **Why 2 layers?**  When `olive run` executes the model-builder pass for the first time it calls
-> `_apply_test_model_config` (in `olive/common/hf/utils.py`) to override every hidden-layer count
-> field (`num_hidden_layers`, `num_layers`, `n_layer`, `n_layers`) to `2` before the random test
-> checkpoint is created.  This keeps the checkpoint small and fast to convert while preserving the
-> model's architecture family (tokeniser, attention pattern, etc.).
->
-> **Note:** `olive optimize --dry_run` only generates the workflow config — it does **not** run any
-> passes or create the test model directory.  The test model is created the first time you run
-> `olive run --test`.  On subsequent `olive run` calls that hit the model-builder cache, Olive
-> automatically falls back to a copy of the test model saved alongside the cached ONNX artifacts,
-> so the test model directory is not required to exist on disk.
-
 This example uses [`Qwen/Qwen3-0.6B`](https://huggingface.co/Qwen/Qwen3-0.6B), but the same pattern works for other supported Hugging Face LLMs.
 
-## Step 1: generate the workflow config
-
 Start by generating the config that Olive will run for the Qwen conversion.
 
 ```bash
@@ -29,33 +15,12 @@ olive optimize \
     --provider CPUExecutionProvider \
     --precision int4 \
     --output_path out/qwen \
-    --dry_run \
     --test out/qwen-test-model
 ```
 
 This creates `out/qwen/config.json` without launching the full conversion yet.
 It also inserts an `OnnxDiscrepancyCheck` pass (if one is not already present) that will compare the generated ONNX model against the 2-layer reference model.
 
-## Step 2: run a fast smoke test with `olive run --test`
-
-Use the generated config with `olive run` and pass `--test` so Olive swaps in the reduced random model.
-
-```bash
-olive run \
-    --config out/qwen/config.json \
-    --test out/qwen-test-model \
-    --output_path out/qwen-test-run
-```
-
-What this does:
-
-- `--test out/qwen-test-model` creates a reduced random Qwen model (2 hidden layers) and saves it in `out/qwen-test-model` on the first run; on later runs Olive reuses the saved test model — or, if the model-builder output is cached and `out/qwen-test-model` no longer exists, it automatically falls back to the copy saved inside the ONNX cache directory
-- `--output_path out/qwen-test-run` gives the smoke test its own output folder, so the generated ONNX artifacts are easy to find
-- Olive marks that output folder as a test-only run and refuses to reuse a non-test conversion folder for `--test`
-- The saved model configuration (`reference_model_config.json`) is written alongside the discrepancy results so you can inspect exactly which config was used
-
-After the smoke test finishes, look under `out/qwen-test-run` for the exported ONNX model and related files.
-
 This is a quick way to confirm that:
 
 - Olive can load the source model
@@ -70,80 +35,6 @@ By default, `--test` evaluates:
 
 - `mae`: maximum absolute error between the ONNX and reference model outputs
 
-Add `speedup` via `--test_metrics speedup` (or `--test_metrics mae speedup`) to also run latency measurement.
-
-> **Note:** `--test_metrics` is always respected even when the config was generated by `olive optimize --dry_run --test`, because Olive updates the existing `OnnxDiscrepancyCheck` settings each time `olive run --test` is invoked.
-
-For example, to run both accuracy and latency checks:
-
-```bash
-olive run \
-    --config out/qwen/config.json \
-    --test out/qwen-test-model \
-    --test_metrics mae,speedup \
-    --output_path out/qwen-test-run
-```
-
-Comma-separated (`mae,speedup`) and space-separated (`mae speedup`) forms are both accepted.
-
-### Optional: compare against llama.cpp
-
-If you have a `llama_env` virtual environment with `llama-cpp-python` installed, you can also compare the generated ONNX model against a llama.cpp GGUF conversion of the same reference model.
-
-```bash
-olive run \
-    --config out/qwen/config.json \
-    --test out/qwen-test-model \
-    --test_metrics mae,speedup \
-    --test_llama_path ./llama_env \
-    --output_path out/qwen-test-run
-```
-
-`--test_llama_path` points to the virtual environment that contains `llama-cpp-python` and `convert_hf_to_gguf.py` (from the llama.cpp repository). When provided, Olive injects a `ConvertHfToGGUF` pass before model conversion and then:
-
-1. Converts the reduced test HuggingFace model to GGUF (`<test_model_path>/model.gguf`) via `convert_hf_to_gguf.py`.
-2. Reuses that GGUF file in `OnnxDiscrepancyCheck` for llama.cpp inference and reports first-token latency and speedup metrics alongside the regular MAE and ONNX speedup results.
-
-All `llama-cpp-python` imports are strictly isolated to the subprocess — the main Olive process never imports them.
-
-To set up the `llama_env` virtual environment:
-
-```bash
-python -m venv llama_env
-llama_env/bin/pip install gguf safetensors transformers sentencepiece protobuf \
-    llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
-git clone --depth=1 --filter=blob:none --sparse \
-    https://github.com/ggerganov/llama.cpp.git /tmp/llama_cpp_repo
-git -C /tmp/llama_cpp_repo sparse-checkout set convert_hf_to_gguf.py conversion
-LLAMA_ENV="$(pwd)/llama_env"
-cp /tmp/llama_cpp_repo/convert_hf_to_gguf.py "$LLAMA_ENV/"
-cp -r /tmp/llama_cpp_repo/conversion "$LLAMA_ENV/"
-```
-
-## Step 3: run the full conversion
-
-Once the smoke test succeeds, rerun the conversion on the full Qwen checkpoint by removing `--test`.
-
-```bash
-olive run \
-    --config out/qwen/config.json \
-    --output_path out/qwen-full
-```
-
-At this point you know the Olive command and the conversion recipe already worked on the lightweight test model, so you can focus on the full-model run instead of debugging both at once.
-
-## Why keep the test model folder?
-
-The saved test model is useful beyond the first smoke test:
-
-- you can rerun the reduced conversion quickly while iterating on options
-- you can reuse the same HF test model later when comparing the Hugging Face model against the exported ONNX model
-- you avoid recreating a new random test checkpoint every time
-
-Even if you delete the test model folder, `OnnxDiscrepancyCheck` will automatically use the copy saved inside the model-builder cache directory (`reference_hf_model/` alongside the ONNX artifacts), so the comparison step continues to work.
-
-## Related docs
+Add `speedup` via `--test_metrics speedup` (or `--test_metrics mae,speedup`) to also run latency measurement.
 
-- [How to use the `olive optimize` command to optimize a Pytorch model](cli-optimize)
-- [How to write a new workflow from scratch](../configure-workflows/build-workflow)
-- [CLI reference](../../reference/cli)
+> **Note:** `--test_metrics` is always respected even when the config was generated by `olive optimize --test`, because Olive updates the existing `OnnxDiscrepancyCheck` settings each time `olive run --test` is invoked.
diff --git a/olive/cli/base.py b/olive/cli/base.py
index 7a3f2db77..72ddc659e 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -128,14 +128,14 @@ def add_discrepancy_check_pass(
     * ``SaveTestModelConfig`` — inserted at the *beginning* of the passes dict so that the
       test-model directory (containing only ``config.json`` and the marker file) is created
       before any other pass runs.  This ensures subsequent passes can find the directory even
-      on the first ``olive run`` after ``olive optimize --dry_run --test``.
+      on the first ``olive run`` after ``olive optimize --test``.
 
     * ``ConvertHfToGGUF`` — inserted after ``SaveTestModelConfig`` when ``llama_env_path`` is
       provided, and converts the test HuggingFace directory to GGUF in advance.
 
     * ``OnnxDiscrepancyCheck`` — appended at the end to compare the ONNX model against the
       reference HuggingFace model.  If an instance is already present in the config (e.g.
-      from a previous ``--dry_run --test`` invocation), its dynamic runtime fields
+      from a previous ``--test`` invocation), its dynamic runtime fields
       (``reference_model_path``, ``report_output_dir``, metric and llama.cpp settings) are
       updated in-place so that the current ``--test_metrics`` and ``--output_path`` values
       always take effect.
diff --git a/olive/cli/run.py b/olive/cli/run.py
index 55c7ffb9b..31ff61657 100644
--- a/olive/cli/run.py
+++ b/olive/cli/run.py
@@ -6,8 +6,6 @@
 
 from olive.cli.base import (
     BaseOliveCLICommand,
-    _flatten_test_metrics,
-    add_discrepancy_check_pass,
     add_hf_test_model_config,
     add_input_model_options,
     add_logging_options,
@@ -15,8 +13,6 @@
     get_input_model_config,
     mark_test_output_path,
     save_discrepancy_check_results,
-    validate_test_output_path,
-    warn_unused_test_metrics,
 )
 from olive.telemetry import action
 
@@ -84,15 +80,6 @@ def run(self):
                 run_config[rc_key] = arg_value
 
         output_path = run_config.get("output_dir") or run_config.get("engine", {}).get("output_dir")
-        validate_test_output_path(output_path, self.args.test)
-        warn_unused_test_metrics(self.args.test, getattr(self.args, "test_metrics", None))
-        if self.args.test not in (None, False):
-            test_metrics = _flatten_test_metrics(getattr(self.args, "test_metrics", None))
-            run_config = add_discrepancy_check_pass(
-                run_config,
-                metrics=test_metrics,
-                llama_env_path=getattr(self.args, "test_llama_path", None),
-            )
         workflow_output = olive_run(
             run_config,
             list_required_packages=self.args.list_required_packages,

From 5dd9bf472957c5e225a09141ec05db6d2882ecc6 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 1 Jul 2026 11:05:08 +0000
Subject: [PATCH 51/80] Keep only Olive config + test model in output_path for
 optimize --test

---
 olive/cli/base.py     | 28 +++++++++++++++++++++++-----
 test/cli/test_base.py |  8 ++++----
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/olive/cli/base.py b/olive/cli/base.py
index 72ddc659e..35f0b88e1 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -305,26 +305,44 @@ def _run_workflow(self):
         )
         Path(self.args.output_path).mkdir(parents=True, exist_ok=True)
 
+        is_test = getattr(self.args, "test", None) not in (None, False)
+
         with tempfile.TemporaryDirectory(prefix="olive-cli-tmp-", dir=self.args.output_path) as tempdir:
             run_config = self._get_run_config(tempdir)
-            if getattr(self.args, "test", None) not in (None, False):
+            if is_test:
                 run_config = add_discrepancy_check_pass(
                     run_config,
                     test_metrics,
                     getattr(self.args, "test_llama_path", None),
                 )
-            if self.args.save_config_file or self.args.dry_run:
+            # In --test mode, always persist the Olive config to <output_path>/config.json.
+            # This must happen before the workflow runs so the model builder's transformers
+            # config.json does not overwrite it (the optimized model and any reference copies
+            # are redirected to a temp working dir below). The only persisted model is then the
+            # small test model saved at the --test path.
+            if self.args.save_config_file or self.args.dry_run or is_test:
                 self._save_config_file(run_config)
             if self.args.dry_run:
-                if getattr(self.args, "test", None) not in (None, False):
+                if is_test:
                     mark_test_output_path(self.args.output_path)
                 print("Dry run mode enabled. Configuration file is generated but no optimization is performed.")
                 return None
+            if is_test:
+                # Treat <output_path> as a report directory: it keeps only the Olive config.json
+                # and discrepancy_check_results.json. Route the optimized ONNX model, its
+                # transformers config.json, and any reference model copies into the temp working
+                # dir so they are discarded and do not clutter <output_path>.
+                work_dir = str(Path(tempdir) / "optimized")
+                run_config["output_dir"] = work_dir
+                for pass_cfg in run_config.get("passes", {}).values():
+                    if isinstance(pass_cfg, dict) and pass_cfg.get("type", "").lower() == "onnxdiscrepancycheck":
+                        pass_cfg["report_output_dir"] = work_dir
             workflow_output = olive_run(run_config)
-            if getattr(self.args, "test", None) not in (None, False):
+            if is_test:
                 mark_test_output_path(self.args.output_path)
                 save_discrepancy_check_results(workflow_output, self.args.output_path)
-            if not workflow_output.has_output_model():
+                print(f"Test report saved at {self.args.output_path}")
+            elif not workflow_output.has_output_model():
                 print("No output model produced. Please check the log for details.")
             else:
                 print(f"Model is saved at {self.args.output_path}")
diff --git a/test/cli/test_base.py b/test/cli/test_base.py
index 47a7619b3..67b3ffc32 100644
--- a/test/cli/test_base.py
+++ b/test/cli/test_base.py
@@ -58,7 +58,7 @@
                 },
                 "load_kwargs": {
                     "trust_remote_code": False,
-                    "attn_implementation": "eager",
+                    "attn_implementation": "sdpa",
                 },
             },
         ),
@@ -76,7 +76,7 @@
                 "model_path": "my_model/my_model",
                 "load_kwargs": {
                     "trust_remote_code": True,
-                    "attn_implementation": "eager",
+                    "attn_implementation": "sdpa",
                 },
             },
         ),
@@ -94,7 +94,7 @@
                 "model_path": "hf_model",
                 "load_kwargs": {
                     "trust_remote_code": False,
-                    "attn_implementation": "eager",
+                    "attn_implementation": "sdpa",
                 },
             },
         ),
@@ -141,7 +141,7 @@
                 "model_path": "hf",
                 "load_kwargs": {
                     "trust_remote_code": False,
-                    "attn_implementation": "eager",
+                    "attn_implementation": "sdpa",
                 },
             },
         ),

From 811ad65a78d26f9a41d49736fab43a4819f921df Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 1 Jul 2026 11:36:27 +0000
Subject: [PATCH 52/80] Keep optimized ONNX model in output_path/model instead
 of temp dir for --test

---
 olive/cli/base.py | 40 +++++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/olive/cli/base.py b/olive/cli/base.py
index 35f0b88e1..8ba611e73 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -317,26 +317,26 @@ def _run_workflow(self):
                 )
             # In --test mode, always persist the Olive config to <output_path>/config.json.
             # This must happen before the workflow runs so the model builder's transformers
-            # config.json does not overwrite it (the optimized model and any reference copies
-            # are redirected to a temp working dir below). The only persisted model is then the
-            # small test model saved at the --test path.
+            # config.json (written into the model subdirectory below) never overwrites it.
+            if is_test:
+                # Treat <output_path> as a report directory holding the Olive config.json and
+                # discrepancy_check_results.json. Save the optimized ONNX model into a "model"
+                # subdirectory so it is preserved on disk (not discarded in a temp directory)
+                # while keeping the Olive config.json at the <output_path> root.
+                model_dir = str(Path(self.args.output_path) / "model")
+                run_config["output_dir"] = model_dir
+                for pass_cfg in run_config.get("passes", {}).values():
+                    if isinstance(pass_cfg, dict) and pass_cfg.get("type", "").lower() == "onnxdiscrepancycheck":
+                        pass_cfg["report_output_dir"] = self.args.output_path
             if self.args.save_config_file or self.args.dry_run or is_test:
-                self._save_config_file(run_config)
+                # In --test mode, keep the Olive config.json at the <output_path> root even
+                # though the workflow output_dir points to the "model" subdirectory.
+                self._save_config_file(run_config, self.args.output_path if is_test else None)
             if self.args.dry_run:
                 if is_test:
                     mark_test_output_path(self.args.output_path)
                 print("Dry run mode enabled. Configuration file is generated but no optimization is performed.")
                 return None
-            if is_test:
-                # Treat <output_path> as a report directory: it keeps only the Olive config.json
-                # and discrepancy_check_results.json. Route the optimized ONNX model, its
-                # transformers config.json, and any reference model copies into the temp working
-                # dir so they are discarded and do not clutter <output_path>.
-                work_dir = str(Path(tempdir) / "optimized")
-                run_config["output_dir"] = work_dir
-                for pass_cfg in run_config.get("passes", {}).values():
-                    if isinstance(pass_cfg, dict) and pass_cfg.get("type", "").lower() == "onnxdiscrepancycheck":
-                        pass_cfg["report_output_dir"] = work_dir
             workflow_output = olive_run(run_config)
             if is_test:
                 mark_test_output_path(self.args.output_path)
@@ -364,9 +364,15 @@ def _parse_extra_options(kv_items):
         return parse_extra_options(kv_items)  # pylint: disable=no-value-for-parameter
 
     @staticmethod
-    def _save_config_file(config: dict):
-        """Save the config file."""
-        config_file_path = Path(config["output_dir"]) / "config.json"
+    def _save_config_file(config: dict, output_dir: Optional[str] = None):
+        """Save the config file.
+
+        By default the config is written to ``<config["output_dir"]>/config.json``. When
+        ``output_dir`` is provided, the config is written to ``<output_dir>/config.json``
+        instead (used in --test mode to keep the Olive config at the report directory root).
+        """
+        target_dir = output_dir if output_dir is not None else config["output_dir"]
+        config_file_path = Path(target_dir) / "config.json"
         with open(config_file_path, "w") as f:
             json.dump(config, f, indent=4)
         print(f"Config file saved at {config_file_path}")

From 99c5e0245c812a494e45f5cde2cf6c448d7a4951 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 1 Jul 2026 11:55:19 +0000
Subject: [PATCH 53/80] Add olive_config.json save + first_token_20/tft/tf5t
 generation metrics

---
 docs/source/how-to/cli/cli-fast-test.md    |  18 ++-
 olive/cli/base.py                          |  29 +++--
 olive/passes/onnx/discrepancy_check.py     | 139 ++++++++++++++++++---
 test/cli/test_base.py                      |   6 +
 test/passes/onnx/test_discrepancy_check.py | 123 ++++++++++++++++++
 5 files changed, 289 insertions(+), 26 deletions(-)

diff --git a/docs/source/how-to/cli/cli-fast-test.md b/docs/source/how-to/cli/cli-fast-test.md
index 8395decae..61a429e6b 100644
--- a/docs/source/how-to/cli/cli-fast-test.md
+++ b/docs/source/how-to/cli/cli-fast-test.md
@@ -18,9 +18,16 @@ olive optimize \
     --test out/qwen-test-model
 ```
 
-This creates `out/qwen/config.json` without launching the full conversion yet.
+Because this example runs without `--dry_run`, it produces:
+
+- `out/qwen/olive_config.json` — the Olive configuration used for the run (named `olive_config.json` so it is never confused with the model's own `config.json`).
+- `out/qwen/model/` — the optimized ONNX model.
+- `out/qwen/discrepancy_check_results.json` — the discrepancy report.
+
 It also inserts an `OnnxDiscrepancyCheck` pass (if one is not already present) that will compare the generated ONNX model against the 2-layer reference model.
 
+> **Note:** When `--dry_run` is used instead, Olive writes `out/qwen/config.json` (so that `olive run --config out/qwen/config.json` works) without launching the conversion yet.
+
 This is a quick way to confirm that:
 
 - Olive can load the source model
@@ -35,6 +42,13 @@ By default, `--test` evaluates:
 
 - `mae`: maximum absolute error between the ONNX and reference model outputs
 
-Add `speedup` via `--test_metrics speedup` (or `--test_metrics mae,speedup`) to also run latency measurement.
+Additional metrics can be requested via `--test_metrics` (space- or comma-separated):
+
+- `speedup`: ONNX-vs-PyTorch inference latency
+- `first_token_20`: compares the first generated token (over a 20-token generation) between ONNX Runtime GenAI and transformers
+- `tft`: time to the first generated token (reported for both ONNX Runtime GenAI and transformers)
+- `tf5t`: time to the first 5 generated tokens (reported for both ONNX Runtime GenAI and transformers)
+
+For example, `--test_metrics mae,speedup,first_token_20,tft,tf5t`. The generation metrics (`first_token_20`, `tft`, `tf5t`) use the optimized ONNX model directory as the ONNX Runtime GenAI model when it contains a `genai_config.json` (as produced by the model builder); if that file is missing and no `genai_model_path` is configured, these metrics are skipped with a warning.
 
 > **Note:** `--test_metrics` is always respected even when the config was generated by `olive optimize --test`, because Olive updates the existing `OnnxDiscrepancyCheck` settings each time `olive run --test` is invoked.
diff --git a/olive/cli/base.py b/olive/cli/base.py
index 8ba611e73..7dcb94e7a 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -23,7 +23,7 @@
 TEST_OUTPUT_MARKER_FILE = "olive_test_output.json"
 
 # Metrics that --test can evaluate via the injected OnnxDiscrepancyCheck pass.
-TEST_METRICS = ("mae", "speedup")
+TEST_METRICS = ("mae", "speedup", "first_token_20", "tft", "tf5t")
 
 
 def _parse_test_metrics(value: str) -> list:
@@ -329,9 +329,17 @@ def _run_workflow(self):
                     if isinstance(pass_cfg, dict) and pass_cfg.get("type", "").lower() == "onnxdiscrepancycheck":
                         pass_cfg["report_output_dir"] = self.args.output_path
             if self.args.save_config_file or self.args.dry_run or is_test:
-                # In --test mode, keep the Olive config.json at the <output_path> root even
-                # though the workflow output_dir points to the "model" subdirectory.
-                self._save_config_file(run_config, self.args.output_path if is_test else None)
+                # In --test mode, keep the Olive config at the <output_path> root even though the
+                # workflow output_dir points to the "model" subdirectory. In every mode the file
+                # name depends only on dry_run: a one-step run writes olive_config.json so it is
+                # never confused with the model's own transformers config.json; dry_run keeps
+                # config.json so `olive run --config <output_path>/config.json` continues to work.
+                config_file_name = "config.json" if self.args.dry_run else "olive_config.json"
+                self._save_config_file(
+                    run_config,
+                    self.args.output_path if is_test else None,
+                    config_file_name,
+                )
             if self.args.dry_run:
                 if is_test:
                     mark_test_output_path(self.args.output_path)
@@ -364,15 +372,17 @@ def _parse_extra_options(kv_items):
         return parse_extra_options(kv_items)  # pylint: disable=no-value-for-parameter
 
     @staticmethod
-    def _save_config_file(config: dict, output_dir: Optional[str] = None):
+    def _save_config_file(config: dict, output_dir: Optional[str] = None, file_name: str = "config.json"):
         """Save the config file.
 
         By default the config is written to ``<config["output_dir"]>/config.json``. When
-        ``output_dir`` is provided, the config is written to ``<output_dir>/config.json``
+        ``output_dir`` is provided, the config is written to ``<output_dir>/<file_name>``
         instead (used in --test mode to keep the Olive config at the report directory root).
+        ``file_name`` controls the config file name (e.g. ``olive_config.json`` for a one-step
+        run so it is never confused with the model's own ``config.json``).
         """
         target_dir = output_dir if output_dir is not None else config["output_dir"]
-        config_file_path = Path(target_dir) / "config.json"
+        config_file_path = Path(target_dir) / file_name
         with open(config_file_path, "w") as f:
             json.dump(config, f, indent=4)
         print(f"Config file saved at {config_file_path}")
@@ -707,7 +717,10 @@ def add_input_model_options(
             nargs="+",
             help=(
                 "Metrics to evaluate during a --test run: 'mae' enforces the max absolute error between the "
-                "ONNX and reference model outputs, and 'speedup' measures ONNX-vs-PyTorch inference latency. "
+                "ONNX and reference model outputs, 'speedup' measures ONNX-vs-PyTorch inference latency, "
+                "'first_token_20' compares the first generated token (over a 20-token generation) between "
+                "ONNX Runtime GenAI and transformers, 'tft' reports the time to the first generated token, and "
+                "'tf5t' reports the time to the first 5 generated tokens. "
                 "Accepts space- or comma-separated values (e.g. 'mae,speedup' or 'mae speedup'). "
                 "Defaults to 'mae'. Only used together with --test."
             ),
diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 29e69a2c4..4fc4c9274 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -232,9 +232,13 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
                 default_value=None,
                 description=(
                     "List of test metrics to evaluate. Accepted values are ``'mae'`` (max absolute error "
-                    "between ONNX and reference PyTorch outputs) and ``'speedup'`` (ONNX-vs-PyTorch "
-                    "inference latency). When set, this field takes precedence over ``timing_iterations`` "
-                    "and ``max_mae``: ``'speedup'`` enables timing, ``'mae'`` enforces the MAE threshold. "
+                    "between ONNX and reference PyTorch outputs), ``'speedup'`` (ONNX-vs-PyTorch "
+                    "inference latency), ``'first_token_20'`` (first generated token comparison over a "
+                    "20-token generation between ONNX Runtime GenAI and transformers), ``'tft'`` (time to "
+                    "the first generated token) and ``'tf5t'`` (time to the first 5 generated tokens). "
+                    "When set, this field takes precedence over ``timing_iterations`` "
+                    "and ``max_mae``: ``'speedup'`` enables timing, ``'mae'`` enforces the MAE threshold, and "
+                    "the generation metrics run the transformers-vs-GenAI comparison. "
                     "Example: ``['mae', 'speedup']``. Set by the CLI ``--test_metrics`` option."
                 ),
             ),
@@ -537,13 +541,17 @@ def _run_for_config(
         # This lets the CLI store a human-readable ["mae", "speedup"] list in the config
         # while still supporting the lower-level timing_iterations / max_mae controls for
         # advanced users and backward compatibility with older configs.
+        requested_metrics = set(config.test_metrics) if config.test_metrics is not None else set()
         if config.test_metrics is not None:
-            effective_timing_iterations = 5 if "speedup" in config.test_metrics else 0
-            effective_max_mae = 0.1 if "mae" in config.test_metrics else None
+            effective_timing_iterations = 5 if "speedup" in requested_metrics else 0
+            effective_max_mae = 0.1 if "mae" in requested_metrics else None
         else:
             effective_timing_iterations = config.timing_iterations
             effective_max_mae = config.max_mae
 
+        # Metrics that require running token generation (transformers vs ONNX Runtime GenAI).
+        generation_metrics = requested_metrics & {"first_token_20", "tft", "tf5t"}
+
         # Measure inference speedup (ONNX vs PyTorch) on the target device
         if effective_timing_iterations > 0:
             timing = self._measure_speedup(
@@ -593,12 +601,81 @@ def _run_for_config(
         else:
             results["status"] = "passed"
 
-        # Generation token sequence comparison (transformers vs ONNX Runtime GenAI)
-        if config.genai_model_path:
-            gen_results = self.compare_generation(config, ref_model, ref_model_path=ref_path)
+        # Generation token sequence comparison (transformers vs ONNX Runtime GenAI).
+        # Runs when an explicit genai_model_path is configured or when any generation-based
+        # test metric (first_token_20 / tft / tf5t) is requested.  In the latter case the
+        # optimized ONNX model directory is used as the GenAI model when it exposes a
+        # genai_config.json (as produced by the ModelBuilder pass).
+        genai_model_path = config.genai_model_path
+        if genai_model_path is None and generation_metrics:
+            model_dir = Path(model.model_path)
+            model_dir = model_dir if model_dir.is_dir() else model_dir.parent
+            if (model_dir / "genai_config.json").is_file():
+                genai_model_path = str(model_dir)
+                logger.info(
+                    "Using optimized ONNX model directory %s as the GenAI model for generation metrics.",
+                    genai_model_path,
+                )
+            else:
+                logger.warning(
+                    "Generation metrics %s requested but no genai_config.json was found in %s; skipping them.",
+                    sorted(generation_metrics),
+                    model_dir,
+                )
+
+        if genai_model_path:
+            # first_token_20 generates 20 tokens; tf5t measures the time to the first 5 tokens.
+            gen_max_new_tokens = 20 if "first_token_20" in generation_metrics else config.generate_max_new_tokens
+            gen_first_n = 5 if "tf5t" in generation_metrics else config.time_to_first_n_tokens
+            gen_results = self.compare_generation(
+                config,
+                ref_model,
+                ref_model_path=ref_path,
+                genai_model_path=genai_model_path,
+                max_new_tokens=gen_max_new_tokens,
+                first_n=gen_first_n,
+            )
             longest_common = gen_results["longest_common_token_sequence"]
             results.update(gen_results)
-            results["genai_model_path"] = config.genai_model_path
+            results["genai_model_path"] = genai_model_path
+
+            # Surface the explicitly requested named metrics for easy inspection.
+            if "first_token_20" in generation_metrics:
+                results["first_token_20"] = {
+                    "transformers_first_token": gen_results.get("transformers_first_token"),
+                    "genai_first_token": gen_results.get("genai_first_token"),
+                    "first_token_matches": gen_results.get("first_token_matches"),
+                    "matching_leading_tokens": longest_common,
+                }
+                logger.info(
+                    "OnnxDiscrepancyCheck first_token_20: matches=%s (transformers=%s, genai=%s), "
+                    "matching_leading_tokens=%s",
+                    gen_results.get("first_token_matches"),
+                    gen_results.get("transformers_first_token"),
+                    gen_results.get("genai_first_token"),
+                    longest_common,
+                )
+            if "tft" in generation_metrics:
+                results["tft"] = {
+                    "transformers_s": gen_results.get("transformers_time_to_first_token_s"),
+                    "genai_s": gen_results.get("genai_time_to_first_token_s"),
+                }
+                logger.info(
+                    "OnnxDiscrepancyCheck tft (time to first token): transformers=%s, genai=%s",
+                    _format_seconds(gen_results.get("transformers_time_to_first_token_s")),
+                    _format_seconds(gen_results.get("genai_time_to_first_token_s")),
+                )
+            if "tf5t" in generation_metrics:
+                results["tf5t"] = {
+                    "transformers_s": gen_results.get("transformers_time_to_first_n_tokens_s"),
+                    "genai_s": gen_results.get("genai_time_to_first_n_tokens_s"),
+                }
+                logger.info(
+                    "OnnxDiscrepancyCheck tf5t (time to first 5 tokens): transformers=%s, genai=%s",
+                    _format_seconds(gen_results.get("transformers_time_to_first_n_tokens_s")),
+                    _format_seconds(gen_results.get("genai_time_to_first_n_tokens_s")),
+                )
+
             if config.min_longest_common_tokens is not None and longest_common < config.min_longest_common_tokens:
                 results["status"] = "failed"
                 gen_failure = (
@@ -713,12 +790,26 @@ def _measure_speedup(
 
         return pytorch_time, onnx_time, speedup
 
-    def compare_generation(self, config: type[BasePassConfig], ref_model, *, ref_model_path: str) -> dict:
+    def compare_generation(
+        self,
+        config: type[BasePassConfig],
+        ref_model,
+        *,
+        ref_model_path: str,
+        genai_model_path: Optional[str] = None,
+        max_new_tokens: Optional[int] = None,
+        first_n: Optional[int] = None,
+    ) -> dict:
         """Run generation on both transformers and GenAI and compare them.
 
-        Returns a dict with the longest common token sequence length and the time-to-first-token
-        and time-to-first-N-tokens latencies (in seconds) for both transformers and ONNX Runtime
-        GenAI, where N is ``config.time_to_first_n_tokens``.
+        Returns a dict with the longest common token sequence length, the first-generated-token
+        match between transformers and ONNX Runtime GenAI, and the time-to-first-token and
+        time-to-first-N-tokens latencies (in seconds) for both, where N is ``first_n``
+        (defaults to ``config.time_to_first_n_tokens``).
+
+        ``genai_model_path``, ``max_new_tokens`` and ``first_n`` override the corresponding
+        config values when provided, which lets the caller request specific metrics such as
+        ``first_token_20`` (20-token generation) or ``tf5t`` (first 5 tokens).
         """
         try:
             import onnxruntime_genai as og
@@ -726,10 +817,12 @@ def compare_generation(self, config: type[BasePassConfig], ref_model, *, ref_mod
             raise ImportError("Please install `onnxruntime-genai` to enable generation comparison.") from exc
         from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList
 
+        genai_model_path = genai_model_path if genai_model_path is not None else config.genai_model_path
         tokenizer = AutoTokenizer.from_pretrained(ref_model_path)
 
-        max_new_tokens = config.generate_max_new_tokens
-        first_n = max(1, min(config.time_to_first_n_tokens, max_new_tokens)) if max_new_tokens > 0 else 0
+        max_new_tokens = config.generate_max_new_tokens if max_new_tokens is None else max_new_tokens
+        first_n_config = config.time_to_first_n_tokens if first_n is None else first_n
+        first_n = max(1, min(first_n_config, max_new_tokens)) if max_new_tokens > 0 else 0
 
         # Transformers generation
         input_ids = tokenizer(config.generate_prompt, return_tensors="pt").input_ids
@@ -777,7 +870,7 @@ def __call__(self, generated_ids, scores, **kwargs) -> bool:
         transformers_tokens = transformers_output[0].cpu().tolist()
 
         # ONNX Runtime GenAI generation
-        genai_model = og.Model(config.genai_model_path)
+        genai_model = og.Model(genai_model_path)
         genai_tokenizer = og.Tokenizer(genai_model)
         genai_input_ids = genai_tokenizer.encode(config.generate_prompt)
 
@@ -787,6 +880,7 @@ def __call__(self, generated_ids, scores, **kwargs) -> bool:
         generator = og.Generator(genai_model, params)
         generator.append_tokens([genai_input_ids])
         genai_tokens = list(genai_input_ids)
+        genai_prompt_token_count = len(genai_input_ids)
         genai_ttft = None
         genai_ttfn = None
         num_generated = 0
@@ -803,9 +897,21 @@ def __call__(self, generated_ids, scores, **kwargs) -> bool:
 
         longest_common = _longest_common_token_sequence(transformers_tokens, genai_tokens)
 
+        # First generated token comparison (transformers vs ONNX Runtime GenAI).
+        transformers_first_token = (
+            transformers_tokens[prompt_token_count] if len(transformers_tokens) > prompt_token_count else None
+        )
+        genai_first_token = (
+            genai_tokens[genai_prompt_token_count] if len(genai_tokens) > genai_prompt_token_count else None
+        )
+        first_token_matches = transformers_first_token is not None and transformers_first_token == genai_first_token
+
         gen_results = {
             "longest_common_token_sequence": longest_common,
             "time_to_first_n_tokens": first_n,
+            "transformers_first_token": transformers_first_token,
+            "genai_first_token": genai_first_token,
+            "first_token_matches": first_token_matches,
             "transformers_time_to_first_token_s": transformers_ttft,
             "transformers_time_to_first_n_tokens_s": transformers_ttfn,
             "genai_time_to_first_token_s": genai_ttft,
@@ -816,6 +922,7 @@ def __call__(self, generated_ids, scores, **kwargs) -> bool:
             f"OnnxDiscrepancyCheck generation comparison: "
             f"transformers_len={len(transformers_tokens)}, genai_len={len(genai_tokens)}, "
             f"longest_common_token_sequence={longest_common}, "
+            f"first_token_matches={first_token_matches}, "
             f"transformers_ttft={_format_seconds(transformers_ttft)}, "
             f"transformers_time_to_first_{first_n}_tokens={_format_seconds(transformers_ttfn)}, "
             f"genai_ttft={_format_seconds(genai_ttft)}, "
diff --git a/test/cli/test_base.py b/test/cli/test_base.py
index 67b3ffc32..80f8af186 100644
--- a/test/cli/test_base.py
+++ b/test/cli/test_base.py
@@ -520,6 +520,12 @@ def test_parse_test_metrics_single():
     assert _parse_test_metrics("mae") == ["mae"]
 
 
+def test_parse_test_metrics_accepts_generation_metrics():
+    from olive.cli.base import _parse_test_metrics
+
+    assert _parse_test_metrics("first_token_20,tft,tf5t") == ["first_token_20", "tft", "tf5t"]
+
+
 def test_parse_test_metrics_invalid_raises():
     import argparse
 
diff --git a/test/passes/onnx/test_discrepancy_check.py b/test/passes/onnx/test_discrepancy_check.py
index 7286e3ab0..03338a3ef 100644
--- a/test/passes/onnx/test_discrepancy_check.py
+++ b/test/passes/onnx/test_discrepancy_check.py
@@ -241,6 +241,129 @@ def test_compare_generation_with_zero_max_new_tokens(self):
         assert result["transformers_time_to_first_token_s"] is None
         assert result["transformers_time_to_first_n_tokens_s"] is None
 
+    def test_compare_generation_reports_first_token_match(self):
+        """first_token_matches is True when both first generated tokens are identical."""
+        import torch
+
+        from olive.passes.onnx.discrepancy_check import OnnxDiscrepancyCheck
+
+        config = MagicMock()
+        config.reference_model_path = "mock_model"
+        config.genai_model_path = None
+        config.generate_prompt = "Hello world"
+        config.generate_max_new_tokens = 10
+        config.time_to_first_n_tokens = 5
+
+        mock_tokenizer = MagicMock()
+        mock_tokenizer.return_value = MagicMock(input_ids=torch.tensor([[1, 2, 3]]))
+
+        mock_ref_model = MagicMock()
+        mock_ref_model.device = torch.device("cpu")
+        # First generated token (after the 3-token prompt) is 10.
+        mock_ref_model.generate.return_value = torch.tensor([[1, 2, 3, 10, 11, 12]])
+
+        mock_og = MagicMock()
+        mock_og.Model.return_value = MagicMock()
+        mock_genai_tokenizer = MagicMock()
+        mock_og.Tokenizer.return_value = mock_genai_tokenizer
+        mock_genai_tokenizer.encode.return_value = [1, 2, 3]
+        mock_og.GeneratorParams.return_value = MagicMock()
+
+        mock_generator = MagicMock()
+        # GenAI first generated token is also 10 -> match.
+        genai_new_tokens = [10, 99, 99]
+        call_count = [0]
+
+        def is_done_side_effect():
+            return call_count[0] >= len(genai_new_tokens)
+
+        def get_next_tokens_side_effect():
+            token = genai_new_tokens[call_count[0]]
+            call_count[0] += 1
+            return [token]
+
+        mock_generator.is_done = is_done_side_effect
+        mock_generator.get_next_tokens = get_next_tokens_side_effect
+        mock_og.Generator.return_value = mock_generator
+
+        with (
+            patch.dict(sys.modules, {"onnxruntime_genai": mock_og}),
+            patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer),
+        ):
+            pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck)
+            result = pass_instance.compare_generation(
+                config,
+                mock_ref_model,
+                ref_model_path=config.reference_model_path,
+                genai_model_path="explicit_genai_dir",
+                max_new_tokens=20,
+                first_n=5,
+            )
+
+        # The explicit genai_model_path override is used for og.Model.
+        mock_og.Model.assert_called_once_with("explicit_genai_dir")
+        # The max_new_tokens override is forwarded to transformers.generate.
+        assert mock_ref_model.generate.call_args.kwargs["max_new_tokens"] == 20
+        assert result["transformers_first_token"] == 10
+        assert result["genai_first_token"] == 10
+        assert result["first_token_matches"] is True
+
+    def test_compare_generation_reports_first_token_mismatch(self):
+        """first_token_matches is False when the first generated tokens differ."""
+        import torch
+
+        from olive.passes.onnx.discrepancy_check import OnnxDiscrepancyCheck
+
+        config = MagicMock()
+        config.reference_model_path = "mock_model"
+        config.genai_model_path = "mock_genai_model"
+        config.generate_prompt = "Hello"
+        config.generate_max_new_tokens = 10
+        config.time_to_first_n_tokens = 5
+
+        mock_tokenizer = MagicMock()
+        mock_tokenizer.return_value = MagicMock(input_ids=torch.tensor([[1, 2]]))
+
+        mock_ref_model = MagicMock()
+        mock_ref_model.device = torch.device("cpu")
+        mock_ref_model.generate.return_value = torch.tensor([[1, 2, 30, 31]])
+
+        mock_og = MagicMock()
+        mock_og.Model.return_value = MagicMock()
+        mock_genai_tokenizer = MagicMock()
+        mock_og.Tokenizer.return_value = mock_genai_tokenizer
+        mock_genai_tokenizer.encode.return_value = [1, 2]
+        mock_og.GeneratorParams.return_value = MagicMock()
+
+        mock_generator = MagicMock()
+        genai_new_tokens = [40, 41]
+        call_count = [0]
+
+        def is_done_side_effect():
+            return call_count[0] >= len(genai_new_tokens)
+
+        def get_next_tokens_side_effect():
+            token = genai_new_tokens[call_count[0]]
+            call_count[0] += 1
+            return [token]
+
+        mock_generator.is_done = is_done_side_effect
+        mock_generator.get_next_tokens = get_next_tokens_side_effect
+        mock_og.Generator.return_value = mock_generator
+
+        with (
+            patch.dict(sys.modules, {"onnxruntime_genai": mock_og}),
+            patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer),
+        ):
+            pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck)
+            result = pass_instance.compare_generation(
+                config, mock_ref_model, ref_model_path=config.reference_model_path
+            )
+
+        assert result["transformers_first_token"] == 30
+        assert result["genai_first_token"] == 40
+        assert result["first_token_matches"] is False
+
 
 class TestWeightDtypeInference:
     """Unit tests for ONNX weight dtype inference used to match the reference model precision."""

From 50f7a4f1fd7109e37dfb1000cb50bf162c177e03 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 1 Jul 2026 15:58:45 +0000
Subject: [PATCH 54/80] Fix fast test: set GPTQ group_size=32 for tiny test
 model (hidden_size 64)

---
 test/cli/test_cli_test_model_smoke.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/cli/test_cli_test_model_smoke.py b/test/cli/test_cli_test_model_smoke.py
index dc9a2e305..68c1b0abb 100644
--- a/test/cli/test_cli_test_model_smoke.py
+++ b/test/cli/test_cli_test_model_smoke.py
@@ -72,6 +72,9 @@ def _save_local_tiny_llama(model_path: Path):
 
 def _set_offline_gptq_data_config(config_path: Path):
     config = json.loads(config_path.read_text())
+    # The tiny test model has hidden_size 64, so the default GPTQ group_size of 128
+    # is too large (in_features must be divisible by group_size). Use a small group_size.
+    config["passes"]["gptq"]["group_size"] = 32
     config["passes"]["gptq"]["data_config"] = {
         "name": "test_gptq_dummy_data",
         "type": "DummyDataContainer",

From 31ecf23f63278e3a457ef9fb6a445aa2b13f2c59 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Wed, 1 Jul 2026 18:10:22 +0200
Subject: [PATCH 55/80] documentation

---
 docs/source/how-to/cli/cli-fast-test.md | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/docs/source/how-to/cli/cli-fast-test.md b/docs/source/how-to/cli/cli-fast-test.md
index 61a429e6b..b7f8b9ceb 100644
--- a/docs/source/how-to/cli/cli-fast-test.md
+++ b/docs/source/how-to/cli/cli-fast-test.md
@@ -26,22 +26,6 @@ Because this example runs without `--dry_run`, it produces:
 
 It also inserts an `OnnxDiscrepancyCheck` pass (if one is not already present) that will compare the generated ONNX model against the 2-layer reference model.
 
-> **Note:** When `--dry_run` is used instead, Olive writes `out/qwen/config.json` (so that `olive run --config out/qwen/config.json` works) without launching the conversion yet.
-
-This is a quick way to confirm that:
-
-- Olive can load the source model
-- the selected optimization recipe is valid for your setup
-- the conversion path completes before you run the full model
-
-If you omit the folder and just pass `--test`, `olive run` will save the reduced model under `<output_path>/test_model`.
-
-### Optional: choose which `--test` metrics to run
-
-By default, `--test` evaluates:
-
-- `mae`: maximum absolute error between the ONNX and reference model outputs
-
 Additional metrics can be requested via `--test_metrics` (space- or comma-separated):
 
 - `speedup`: ONNX-vs-PyTorch inference latency

From 37a4e48d2d354a973c4582cc90f67a2e7626e7e3 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 1 Jul 2026 16:12:03 +0000
Subject: [PATCH 56/80] Move attn_impl to SaveTestModelConfig;
 OnnxDiscrepancyCheck uses saved model config

---
 olive/passes/onnx/discrepancy_check.py        | 20 ++++-----
 .../passes/pytorch/save_test_model_config.py  | 42 +++++++++++++++----
 2 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 4fc4c9274..5e24bc753 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -314,16 +314,6 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
                     "below this threshold, the pass fails."
                 ),
             ),
-            "attn_impl": PassConfigParam(
-                type_=Optional[str],
-                default_value="sdpa",
-                description=(
-                    "Attention implementation to use when loading the reference HuggingFace model via "
-                    "``AutoModelForCausalLM.from_pretrained``. Passed as ``attn_implementation`` to "
-                    "the model loader. Common values are ``'eager'``, ``'sdpa'``, and ``'flash_attention_2'``. "
-                    "Defaults to ``'sdpa'``."
-                ),
-            ),
             "llama_cpp": PassConfigParam(
                 type_=bool,
                 default_value=False,
@@ -425,8 +415,16 @@ def _run_for_config(
                 f"Got architectures={architectures}"
             )
 
-        ref_model = AutoModelForCausalLM.from_pretrained(ref_path, config=ref_cfg, attn_implementation=config.attn_impl)
+        # The attention implementation is baked into the reference model's config.json
+        # (as ``_attn_implementation``) by the SaveTestModelConfig pass, so it is picked up
+        # automatically here without needing to pass ``attn_implementation`` explicitly.
+        ref_model = AutoModelForCausalLM.from_pretrained(ref_path, config=ref_cfg)
         ref_model.eval()
+        logger.info(
+            "Loaded reference model from %s with attn_implementation=%s",
+            ref_path,
+            getattr(ref_cfg, "_attn_implementation", None),
+        )
 
         # Determine the floating-point dtype used by the ONNX model weights and
         # cast the reference PyTorch model to match, so the comparison uses the
diff --git a/olive/passes/pytorch/save_test_model_config.py b/olive/passes/pytorch/save_test_model_config.py
index 850b51fc1..f22d78388 100644
--- a/olive/passes/pytorch/save_test_model_config.py
+++ b/olive/passes/pytorch/save_test_model_config.py
@@ -3,6 +3,7 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 import logging
+from typing import Optional
 
 from olive.hardware.accelerator import AcceleratorSpec
 from olive.model import HfModelHandler
@@ -30,11 +31,24 @@ class SaveTestModelConfig(Pass):
 
     @classmethod
     def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassConfigParam]:
-        return {}
+        return {
+            "attn_impl": PassConfigParam(
+                type_=Optional[str],
+                default_value="sdpa",
+                description=(
+                    "Attention implementation baked into the saved test model's ``config.json`` "
+                    "(written as ``_attn_implementation``). Downstream passes such as "
+                    "``OnnxDiscrepancyCheck`` that load this reference model will use it. "
+                    "Common values are ``'eager'``, ``'sdpa'``, and ``'flash_attention_2'``. "
+                    "Defaults to ``'sdpa'``. When ``None`` the transformers default is used."
+                ),
+            ),
+        }
 
     def _run_for_config(
         self, model: HfModelHandler, config: type[BasePassConfig], output_model_path: str
     ) -> HfModelHandler:
+        import json
         from pathlib import Path
 
         from olive.common.hf.utils import is_test_model_dir
@@ -53,13 +67,23 @@ def _run_for_config(
         _has_weights = is_test_model_dir(test_model_dir) and (
             any(test_model_dir.glob("*.safetensors")) or any(test_model_dir.glob("pytorch_model*.bin"))
         )
-        if _has_weights:
-            logger.debug("Test model already saved at %s — skipping.", test_model_path)
-            return model
+        if not _has_weights:
+            logger.info("Saving test random model to %s", test_model_path)
+            # load_model calls load_model_from_task which creates a random-initialised model
+            # from the reduced config and persists it (weights + config.json + marker) to
+            # test_model_path on the first call.
+            model.load_model(cache_model=False)
+        else:
+            logger.debug("Test model already saved at %s — skipping model save.", test_model_path)
 
-        logger.info("Saving test random model to %s", test_model_path)
-        # load_model calls load_model_from_task which creates a random-initialised model
-        # from the reduced config and persists it (weights + config.json + marker) to
-        # test_model_path on the first call.
-        model.load_model(cache_model=False)
+        # Bake the attention implementation into the saved config.json so downstream passes
+        # (e.g. OnnxDiscrepancyCheck) that load this reference model use the same setting.
+        if config.attn_impl:
+            config_json_path = test_model_dir / "config.json"
+            if config_json_path.is_file():
+                config_data = json.loads(config_json_path.read_text())
+                if config_data.get("_attn_implementation") != config.attn_impl:
+                    config_data["_attn_implementation"] = config.attn_impl
+                    config_json_path.write_text(json.dumps(config_data, indent=2))
+                    logger.info("Set _attn_implementation=%s in %s", config.attn_impl, config_json_path)
         return model

From d44373fccc2ca272b86986b6d42c529ca80d0760 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 1 Jul 2026 16:30:58 +0000
Subject: [PATCH 57/80] Fix int32 JSON serialization error in
 OnnxDiscrepancyCheck results

---
 olive/passes/onnx/discrepancy_check.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 5e24bc753..1f56c65d3 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -19,6 +19,21 @@
 logger = logging.getLogger(__name__)
 
 
+def _json_sanitize(obj):
+    """Recursively convert numpy scalars/arrays to native Python types for JSON serialization."""
+    import numpy as np
+
+    if isinstance(obj, dict):
+        return {key: _json_sanitize(value) for key, value in obj.items()}
+    if isinstance(obj, (list, tuple)):
+        return [_json_sanitize(item) for item in obj]
+    if isinstance(obj, np.generic):
+        return obj.item()
+    if isinstance(obj, np.ndarray):
+        return obj.tolist()
+    return obj
+
+
 def _infer_shape(dynamic_shape, known_values=None):
     default_values = {
         "batch_size": 1,
@@ -705,6 +720,7 @@ def _run_for_config(
                 results.setdefault("failures", []).append(f"llama.cpp comparison failed: {exc}")
 
         # Save results to disk
+        results = _json_sanitize(results)
         report_path = Path(report_dir) / "discrepancy_check_results.json"
         report_path.parent.mkdir(parents=True, exist_ok=True)
         report_path.write_text(json.dumps(results, indent=2))

From 86466e8f28faf5ce103b5ec196dfcd543de6b32a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Thu, 2 Jul 2026 10:45:03 +0200
Subject: [PATCH 58/80] split long method

---
 olive/passes/onnx/discrepancy_check.py | 263 +++++++++++++++----------
 1 file changed, 162 insertions(+), 101 deletions(-)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 1f56c65d3..553050cb6 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -365,37 +365,68 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
     def _run_for_config(
         self, model: ONNXModelHandler, config: type[BasePassConfig], output_model_path: str
     ) -> ONNXModelHandler:
-        import torch
+        dataloader, io_config = self._prepare_dataloader(model)
+        ref_model, ref_cfg, ref_path = self._load_reference_model(model, config)
+
+        device, execution_provider, torch_device, weight_dtype = self._resolve_execution_device(model)
+        ref_model = self._cast_reference_model(ref_model, weight_dtype, torch_device)
+
+        report_dir = self._save_reference_artifacts(ref_model, ref_cfg, config, output_model_path)
+
+        session = model.prepare_session(
+            device=device,
+            execution_providers=[execution_provider] if execution_provider else None,
+        )
+
+        results = self._compute_logits_discrepancy(ref_model, session, dataloader, io_config, torch_device)
+
+        effective_timing_iterations, effective_max_mae, generation_metrics = self._resolve_metric_settings(config)
+
+        self._run_speedup_measurement(
+            ref_model, session, dataloader, io_config, torch_device, config, effective_timing_iterations, results
+        )
+
+        self._check_error_thresholds(config, results, effective_max_mae)
+
+        self._run_generation_comparison(model, config, ref_model, ref_path, generation_metrics, results)
+
+        self._run_llama_cpp_comparison(model, config, ref_model, ref_path, report_dir, results)
+
+        self._save_results(model, results, report_dir)
+        return model
 
+    def _prepare_dataloader(self, model: ONNXModelHandler):
         from olive.common.config_utils import validate_config
-        from olive.common.utils import format_data
         from olive.data.template import dummy_data_config_template
         from olive.model.config.io_config import is_io_config_static
 
         io_config = model.io_config
-        if io_config:
-            if is_io_config_static(io_config):
-                input_shapes = io_config.get("input_shapes")
-            else:
-                input_shapes = []
-                known = {}
-                for shape in io_config.get("input_shapes"):
-                    new_shape = _infer_shape(shape, known)
-                    input_shapes.append(new_shape)
-                    known.update(dict(zip(shape, new_shape)))
-            data_config = dummy_data_config_template(
-                input_shapes, io_config.get("input_names"), io_config.get("input_types")
-            )
-            data_config = validate_config(data_config, DataConfig)
-            data_config.load_dataset_config.params["max_samples"] = 1
-        else:
+        if not io_config:
             raise RuntimeError(
                 f"Model IO config is missing for {model.model_path}; cannot generate dummy inputs for discrepancy check."
             )
+
+        if is_io_config_static(io_config):
+            input_shapes = io_config.get("input_shapes")
+        else:
+            input_shapes = []
+            known = {}
+            for shape in io_config.get("input_shapes"):
+                new_shape = _infer_shape(shape, known)
+                input_shapes.append(new_shape)
+                known.update(dict(zip(shape, new_shape)))
+        data_config = dummy_data_config_template(
+            input_shapes, io_config.get("input_names"), io_config.get("input_types")
+        )
+        data_config = validate_config(data_config, DataConfig)
+        data_config.load_dataset_config.params["max_samples"] = 1
+
         # Create dataloader
         dc = data_config.to_data_container()
         dataloader = dc.create_dataloader()
+        return dataloader, io_config
 
+    def _load_reference_model(self, model: ONNXModelHandler, config: type[BasePassConfig]):
         # Load reference PyTorch model
         from transformers import AutoConfig, AutoModelForCausalLM
 
@@ -440,6 +471,10 @@ def _run_for_config(
             ref_path,
             getattr(ref_cfg, "_attn_implementation", None),
         )
+        return ref_model, ref_cfg, ref_path
+
+    def _resolve_execution_device(self, model: ONNXModelHandler):
+        import torch
 
         # Determine the floating-point dtype used by the ONNX model weights and
         # cast the reference PyTorch model to match, so the comparison uses the
@@ -464,6 +499,11 @@ def _run_for_config(
         torch_device = torch.device("cpu")
         if device == Device.GPU and torch.cuda.is_available():
             torch_device = torch.device("cuda")
+        return device, execution_provider, torch_device, weight_dtype
+
+    def _cast_reference_model(self, ref_model, weight_dtype, torch_device):
+        import torch
+
         if weight_dtype is not None and torch_device.type == "cpu" and weight_dtype in (torch.float16, torch.bfloat16):
             logger.info(
                 "OnnxDiscrepancyCheck skipping reference model cast to %s on CPU because the dtype is not supported.",
@@ -478,7 +518,9 @@ def _run_for_config(
             )
         else:
             ref_model = ref_model.to(torch_device)
+        return ref_model
 
+    def _save_reference_artifacts(self, ref_model, ref_cfg, config: type[BasePassConfig], output_model_path: str):
         # Save reference PyTorch model for direct comparison
         report_dir = config.report_output_dir or output_model_path
         report_dir_path = Path(report_dir)
@@ -493,11 +535,12 @@ def _run_for_config(
         config_save_path.parent.mkdir(parents=True, exist_ok=True)
         config_save_path.write_text(ref_cfg.to_json_string())
         logger.info("Saved reference model config to %s", config_save_path)
+        return report_dir
 
-        session = model.prepare_session(
-            device=device,
-            execution_providers=[execution_provider] if execution_provider else None,
-        )
+    def _compute_logits_discrepancy(self, ref_model, session, dataloader, io_config, torch_device):
+        import torch
+
+        from olive.common.utils import format_data
 
         # Run inference on both and compare
         all_max_abs_diff = []
@@ -549,7 +592,9 @@ def _run_for_config(
             f"elements_above_0.01={count_above_0_01}/{total_elements}"
         )
         logger.info(summary)
+        return results
 
+    def _resolve_metric_settings(self, config: type[BasePassConfig]):
         # Resolve effective metric settings: test_metrics takes precedence when set.
         # This lets the CLI store a human-readable ["mae", "speedup"] list in the config
         # while still supporting the lower-level timing_iterations / max_mae controls for
@@ -564,7 +609,11 @@ def _run_for_config(
 
         # Metrics that require running token generation (transformers vs ONNX Runtime GenAI).
         generation_metrics = requested_metrics & {"first_token_20", "tft", "tf5t"}
+        return effective_timing_iterations, effective_max_mae, generation_metrics
 
+    def _run_speedup_measurement(
+        self, ref_model, session, dataloader, io_config, torch_device, config, effective_timing_iterations, results
+    ):
         # Measure inference speedup (ONNX vs PyTorch) on the target device
         if effective_timing_iterations > 0:
             timing = self._measure_speedup(
@@ -593,6 +642,11 @@ def _run_for_config(
                 effective_timing_iterations,
             )
 
+    def _check_error_thresholds(self, config: type[BasePassConfig], results, effective_max_mae):
+        max_abs_error = results["max_abs_error"]
+        count_above_0_1 = results["elements_above_0_1"]
+        count_above_0_01 = results["elements_above_0_01"]
+
         # Check thresholds
         failures = []
         if effective_max_mae is not None and max_abs_error > effective_max_mae:
@@ -614,6 +668,9 @@ def _run_for_config(
         else:
             results["status"] = "passed"
 
+    def _run_generation_comparison(
+        self, model: ONNXModelHandler, config, ref_model, ref_path, generation_metrics, results
+    ):
         # Generation token sequence comparison (transformers vs ONNX Runtime GenAI).
         # Runs when an explicit genai_model_path is configured or when any generation-based
         # test metric (first_token_20 / tft / tf5t) is requested.  In the latter case the
@@ -636,89 +693,94 @@ def _run_for_config(
                     model_dir,
                 )
 
-        if genai_model_path:
-            # first_token_20 generates 20 tokens; tf5t measures the time to the first 5 tokens.
-            gen_max_new_tokens = 20 if "first_token_20" in generation_metrics else config.generate_max_new_tokens
-            gen_first_n = 5 if "tf5t" in generation_metrics else config.time_to_first_n_tokens
-            gen_results = self.compare_generation(
-                config,
-                ref_model,
-                ref_model_path=ref_path,
-                genai_model_path=genai_model_path,
-                max_new_tokens=gen_max_new_tokens,
-                first_n=gen_first_n,
+        if not genai_model_path:
+            return
+
+        # first_token_20 generates 20 tokens; tf5t measures the time to the first 5 tokens.
+        gen_max_new_tokens = 20 if "first_token_20" in generation_metrics else config.generate_max_new_tokens
+        gen_first_n = 5 if "tf5t" in generation_metrics else config.time_to_first_n_tokens
+        gen_results = self.compare_generation(
+            config,
+            ref_model,
+            ref_model_path=ref_path,
+            genai_model_path=genai_model_path,
+            max_new_tokens=gen_max_new_tokens,
+            first_n=gen_first_n,
+        )
+        longest_common = gen_results["longest_common_token_sequence"]
+        results.update(gen_results)
+        results["genai_model_path"] = genai_model_path
+
+        # Surface the explicitly requested named metrics for easy inspection.
+        if "first_token_20" in generation_metrics:
+            results["first_token_20"] = {
+                "transformers_first_token": gen_results.get("transformers_first_token"),
+                "genai_first_token": gen_results.get("genai_first_token"),
+                "first_token_matches": gen_results.get("first_token_matches"),
+                "matching_leading_tokens": longest_common,
+            }
+            logger.info(
+                "OnnxDiscrepancyCheck first_token_20: matches=%s (transformers=%s, genai=%s), "
+                "matching_leading_tokens=%s",
+                gen_results.get("first_token_matches"),
+                gen_results.get("transformers_first_token"),
+                gen_results.get("genai_first_token"),
+                longest_common,
+            )
+        if "tft" in generation_metrics:
+            results["tft"] = {
+                "transformers_s": gen_results.get("transformers_time_to_first_token_s"),
+                "genai_s": gen_results.get("genai_time_to_first_token_s"),
+            }
+            logger.info(
+                "OnnxDiscrepancyCheck tft (time to first token): transformers=%s, genai=%s",
+                _format_seconds(gen_results.get("transformers_time_to_first_token_s")),
+                _format_seconds(gen_results.get("genai_time_to_first_token_s")),
+            )
+        if "tf5t" in generation_metrics:
+            results["tf5t"] = {
+                "transformers_s": gen_results.get("transformers_time_to_first_n_tokens_s"),
+                "genai_s": gen_results.get("genai_time_to_first_n_tokens_s"),
+            }
+            logger.info(
+                "OnnxDiscrepancyCheck tf5t (time to first 5 tokens): transformers=%s, genai=%s",
+                _format_seconds(gen_results.get("transformers_time_to_first_n_tokens_s")),
+                _format_seconds(gen_results.get("genai_time_to_first_n_tokens_s")),
             )
-            longest_common = gen_results["longest_common_token_sequence"]
-            results.update(gen_results)
-            results["genai_model_path"] = genai_model_path
-
-            # Surface the explicitly requested named metrics for easy inspection.
-            if "first_token_20" in generation_metrics:
-                results["first_token_20"] = {
-                    "transformers_first_token": gen_results.get("transformers_first_token"),
-                    "genai_first_token": gen_results.get("genai_first_token"),
-                    "first_token_matches": gen_results.get("first_token_matches"),
-                    "matching_leading_tokens": longest_common,
-                }
-                logger.info(
-                    "OnnxDiscrepancyCheck first_token_20: matches=%s (transformers=%s, genai=%s), "
-                    "matching_leading_tokens=%s",
-                    gen_results.get("first_token_matches"),
-                    gen_results.get("transformers_first_token"),
-                    gen_results.get("genai_first_token"),
-                    longest_common,
-                )
-            if "tft" in generation_metrics:
-                results["tft"] = {
-                    "transformers_s": gen_results.get("transformers_time_to_first_token_s"),
-                    "genai_s": gen_results.get("genai_time_to_first_token_s"),
-                }
-                logger.info(
-                    "OnnxDiscrepancyCheck tft (time to first token): transformers=%s, genai=%s",
-                    _format_seconds(gen_results.get("transformers_time_to_first_token_s")),
-                    _format_seconds(gen_results.get("genai_time_to_first_token_s")),
-                )
-            if "tf5t" in generation_metrics:
-                results["tf5t"] = {
-                    "transformers_s": gen_results.get("transformers_time_to_first_n_tokens_s"),
-                    "genai_s": gen_results.get("genai_time_to_first_n_tokens_s"),
-                }
-                logger.info(
-                    "OnnxDiscrepancyCheck tf5t (time to first 5 tokens): transformers=%s, genai=%s",
-                    _format_seconds(gen_results.get("transformers_time_to_first_n_tokens_s")),
-                    _format_seconds(gen_results.get("genai_time_to_first_n_tokens_s")),
-                )
 
-            if config.min_longest_common_tokens is not None and longest_common < config.min_longest_common_tokens:
-                results["status"] = "failed"
-                gen_failure = (
-                    f"Longest common token sequence length {longest_common} is below "
-                    f"threshold {config.min_longest_common_tokens}"
-                )
-                results.setdefault("failures", []).append(gen_failure)
-                logger.error("ONNX model discrepancy check FAILED: %s", gen_failure)
+        if config.min_longest_common_tokens is not None and longest_common < config.min_longest_common_tokens:
+            results["status"] = "failed"
+            gen_failure = (
+                f"Longest common token sequence length {longest_common} is below "
+                f"threshold {config.min_longest_common_tokens}"
+            )
+            results.setdefault("failures", []).append(gen_failure)
+            logger.error("ONNX model discrepancy check FAILED: %s", gen_failure)
 
+    def _run_llama_cpp_comparison(self, model: ONNXModelHandler, config, ref_model, ref_path, report_dir, results):
         # llama.cpp comparison: convert reference model to GGUF and compare latencies
-        if config.llama_cpp:
-            preconverted_gguf_path = None
-            if model.model_attributes:
-                preconverted_gguf_path = model.model_attributes.get("reference_gguf_model_path")
-            try:
-                llama_results = self.compare_llama_cpp(
-                    config,
-                    ref_model,
-                    output_dir=report_dir,
-                    pytorch_latency_s=results.get("pytorch_latency_s"),
-                    onnx_latency_s=results.get("onnx_latency_s"),
-                    ref_model_path=ref_path,
-                    preconverted_gguf_path=preconverted_gguf_path,
-                )
-                results.update(llama_results)
-            except Exception as exc:
-                logger.exception("OnnxDiscrepancyCheck llama.cpp comparison failed.")
-                results["status"] = "failed"
-                results.setdefault("failures", []).append(f"llama.cpp comparison failed: {exc}")
+        if not config.llama_cpp:
+            return
+        preconverted_gguf_path = None
+        if model.model_attributes:
+            preconverted_gguf_path = model.model_attributes.get("reference_gguf_model_path")
+        try:
+            llama_results = self.compare_llama_cpp(
+                config,
+                ref_model,
+                output_dir=report_dir,
+                pytorch_latency_s=results.get("pytorch_latency_s"),
+                onnx_latency_s=results.get("onnx_latency_s"),
+                ref_model_path=ref_path,
+                preconverted_gguf_path=preconverted_gguf_path,
+            )
+            results.update(llama_results)
+        except Exception as exc:
+            logger.exception("OnnxDiscrepancyCheck llama.cpp comparison failed.")
+            results["status"] = "failed"
+            results.setdefault("failures", []).append(f"llama.cpp comparison failed: {exc}")
 
+    def _save_results(self, model: ONNXModelHandler, results, report_dir):
         # Save results to disk
         results = _json_sanitize(results)
         report_path = Path(report_dir) / "discrepancy_check_results.json"
@@ -730,7 +792,6 @@ def _run_for_config(
         model_attributes = dict(model.model_attributes) if model.model_attributes else {}
         model_attributes["discrepancy_check_results"] = results
         model.model_attributes = model_attributes
-        return model
 
     def _measure_speedup(
         self, ref_model, session, dataloader, io_config, torch_device, warmup_iterations, timing_iterations

From e4931cac03557c97e33c5647e30ea4e64dd68c7b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Thu, 2 Jul 2026 11:38:16 +0200
Subject: [PATCH 59/80] remove test path

---
 olive/cli/base.py                     | 15 +++++----------
 test/cli/test_base.py                 |  7 ++++---
 test/cli/test_cli.py                  |  6 ++----
 test/cli/test_cli_test_model_smoke.py | 18 +++++++-----------
 4 files changed, 18 insertions(+), 28 deletions(-)

diff --git a/olive/cli/base.py b/olive/cli/base.py
index 7dcb94e7a..0d428f96b 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -401,14 +401,11 @@ def add_hf_test_model_config(input_model: dict, test_value, output_path: Optiona
     if test_value in (None, False):
         return input_model
 
-    test_model_output_path = test_value
     # Use 2 layers to keep the test model fast and lightweight while preserving the original architecture family.
     input_model["test_model_config"] = {"hidden_layers": 2}
-    if test_model_output_path is True:
-        if not output_path:
-            raise ValueError("--test requires an explicit folder when output_path is not available.")
-        test_model_output_path = str(Path(output_path) / "test_model")
-    input_model["test_model_path"] = test_model_output_path
+    if not output_path:
+        raise ValueError("--test requires --output_path to store the generated reference model.")
+    input_model["test_model_path"] = str(Path(output_path) / "reference_hf_model")
     return input_model
 
 
@@ -703,12 +700,10 @@ def add_input_model_options(
         )
         model_group.add_argument(
             "--test",
-            type=str,
-            nargs="?",
-            const=True,
+            action="store_true",
             help=(
                 "Use a randomly initialized test model with the same Hugging Face architecture and 2 hidden layers. "
-                "Optionally provide a folder where the generated test model should be saved and reused."
+                "The generated reference model is saved under <output_path>/reference_hf_model and reused."
             ),
         )
         model_group.add_argument(
diff --git a/test/cli/test_base.py b/test/cli/test_base.py
index 80f8af186..428bf7409 100644
--- a/test/cli/test_base.py
+++ b/test/cli/test_base.py
@@ -237,13 +237,14 @@ def test_get_input_model_config_hf_test_model(_):
         task="text-generation",
         model_script=None,
         script_dir=None,
-        test="saved_test_model",
+        test=True,
+        output_path="out_dir",
     )
 
     config = get_input_model_config(args)
 
     assert config["test_model_config"] == {"hidden_layers": 2}
-    assert config["test_model_path"] == "saved_test_model"
+    assert config["test_model_path"] == str(Path("out_dir") / "reference_hf_model")
 
 
 @patch("huggingface_hub.repo_exists", return_value=True)
@@ -257,7 +258,7 @@ def test_get_input_model_config_hf_test_model_requires_path_without_output_path(
         test=True,
     )
 
-    with pytest.raises(ValueError, match=r"--test requires an explicit folder when output_path is not available\."):
+    with pytest.raises(ValueError, match=r"--test requires --output_path to store the generated reference model\."):
         get_input_model_config(args)
 
 
diff --git a/test/cli/test_cli.py b/test/cli/test_cli.py
index 5e666b94f..5e35462e8 100644
--- a/test/cli/test_cli.py
+++ b/test/cli/test_cli.py
@@ -173,7 +173,7 @@ def test_workflow_run_command_with_test_override(mock_run, tmp_path):
 
     cli_main(command_args)
 
-    test_model_path = str(tmp_path / "output" / "test_model")
+    test_model_path = str(tmp_path / "output" / "reference_hf_model")
     output_dir = str(tmp_path / "output")
     mock_run.assert_called_once_with(
         {
@@ -302,13 +302,11 @@ def test_finetune_command(_, mock_run, tmp_path):
 @patch("huggingface_hub.repo_exists", return_value=True)
 def test_optimize_command_test_model_config(_, tmp_path):
     output_dir = tmp_path / "output_dir"
-    test_model_dir = tmp_path / "saved_test_model"
     command_args = [
         "optimize",
         "-m",
         "dummy-model-id",
         "--test",
-        str(test_model_dir),
         "--dry_run",
         "-o",
         str(output_dir),
@@ -318,7 +316,7 @@ def test_optimize_command_test_model_config(_, tmp_path):
 
     config = json.loads((output_dir / "config.json").read_text())
     assert config["input_model"]["test_model_config"] == {"hidden_layers": 2}
-    assert config["input_model"]["test_model_path"] == str(test_model_dir)
+    assert config["input_model"]["test_model_path"] == str(output_dir / "reference_hf_model")
     assert json.loads((output_dir / TEST_OUTPUT_MARKER_FILE).read_text())["type"] == "olive_hf_test_output"
 
 
diff --git a/test/cli/test_cli_test_model_smoke.py b/test/cli/test_cli_test_model_smoke.py
index 68c1b0abb..e0f694ec0 100644
--- a/test/cli/test_cli_test_model_smoke.py
+++ b/test/cli/test_cli_test_model_smoke.py
@@ -103,8 +103,8 @@ def _run_documented_test_model_smoke_flow(tmp_path: Path, model_id: str):
     model_name = model_id.replace("/", "--")
     model_path = tmp_path / "models" / model_name
     config_output_dir = tmp_path / f"{model_name}-test"
-    test_model_dir = tmp_path / f"{model_name}-test-model"
     run_output_dir = tmp_path / f"{model_name}-test-run"
+    test_model_dir = run_output_dir / "reference_hf_model"
 
     _save_local_tiny_llama(model_path)
     # optimize -m arnir0/Tiny-LLM --device cpu --provider CPUExecutionProvider --precision int4 --output_path dump --dry_run
@@ -128,14 +128,13 @@ def _run_documented_test_model_smoke_flow(tmp_path: Path, model_id: str):
     config_path = config_output_dir / "config.json"
     assert config_path.exists()
     _set_offline_gptq_data_config(config_path)
-    # run --config dump/config.json --test dump/test --output_path dump/run
+    # run --config dump/config.json --test --output_path dump/run
     _run_cli_main(
         [
             "run",
             "--config",
             str(config_path),
             "--test",
-            str(test_model_dir),
             "--output_path",
             str(run_output_dir),
         ]
@@ -214,14 +213,13 @@ def _assert_discrepancy(self, tmp_path: Path):
                         self.fail(f"Unknown exporter: {exporter!r}")
 
     @staticmethod
-    def _run_discrepancy_with_test(config_path: Path, test_model_dir: Path, run_output_dir: Path):
+    def _run_discrepancy_with_test(config_path: Path, run_output_dir: Path):
         _run_cli_main(
             [
                 "run",
                 "--config",
                 str(config_path),
                 "--test",
-                str(test_model_dir),
                 "--output_path",
                 str(run_output_dir),
             ]
@@ -231,7 +229,6 @@ def _assert_discrepancy_model_builder(self, tmp_path: Path, model_id: str):
         model_name = model_id.replace("/", "--")
         model_path = tmp_path / "models" / f"{model_name}-disc"
         config_output_dir = tmp_path / f"{model_name}-disc-cfg"
-        test_model_dir = tmp_path / f"{model_name}-disc-test-model"
         run_output_dir = tmp_path / f"{model_name}-disc-run"
 
         _save_local_tiny_llama(model_path)
@@ -257,13 +254,13 @@ def _assert_discrepancy_model_builder(self, tmp_path: Path, model_id: str):
         _set_offline_gptq_data_config(config_path)
 
         # Run with --test; OnnxDiscrepancyCheck is auto-injected and reports discrepancy metrics
-        self._run_discrepancy_with_test(config_path, test_model_dir, run_output_dir)
+        self._run_discrepancy_with_test(config_path, run_output_dir)
 
     def _assert_discrepancy_mobius(self, tmp_path: Path, model_id: str):
         model_name = model_id.replace("/", "--")
         model_path = tmp_path / "models" / f"{model_name}-mobius-disc"
-        test_model_dir = tmp_path / f"{model_name}-mobius-disc-test-model"
         run_output_dir = tmp_path / f"{model_name}-mobius-disc-run"
+        test_model_dir = run_output_dir / "reference_hf_model"
 
         _save_local_tiny_llama(model_path)
 
@@ -295,12 +292,11 @@ def _assert_discrepancy_mobius(self, tmp_path: Path, model_id: str):
         config_path = tmp_path / f"{model_name}-mobius-disc-config.json"
         config_path.write_text(json.dumps(run_config, indent=2))
 
-        self._run_discrepancy_with_test(config_path, test_model_dir, run_output_dir)
+        self._run_discrepancy_with_test(config_path, run_output_dir)
 
     def _assert_discrepancy_torch_export(self, tmp_path: Path, model_id: str):
         model_name = model_id.replace("/", "--")
         model_path = tmp_path / "models" / f"{model_name}-torch-disc"
-        test_model_dir = tmp_path / f"{model_name}-torch-disc-test-model"
         run_output_dir = tmp_path / f"{model_name}-torch-disc-run"
 
         _save_local_tiny_llama(model_path)
@@ -328,7 +324,7 @@ def _assert_discrepancy_torch_export(self, tmp_path: Path, model_id: str):
         }
         config_path = tmp_path / f"{model_name}-torch-disc-config.json"
         config_path.write_text(json.dumps(run_config, indent=2))
-        self._run_discrepancy_with_test(config_path, test_model_dir, run_output_dir)
+        self._run_discrepancy_with_test(config_path, run_output_dir)
 
     def _assert_file_size_below_limit(self, path: Path):
         assert path.exists()

From bc0f6dd504d5de47e7f85150e5eb5f2c3b5b8653 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Thu, 2 Jul 2026 11:53:57 +0200
Subject: [PATCH 60/80] reduce the number of saved models

---
 olive/common/hf/utils.py               | 23 ++++++++++++++++++---
 olive/passes/onnx/discrepancy_check.py |  8 ++++----
 olive/passes/onnx/model_builder.py     | 18 ++---------------
 test/cli/test_cli_test_model_smoke.py  |  3 +++
 test/common/test_hf.py                 | 28 ++++++++++++++++++++++++++
 5 files changed, 57 insertions(+), 23 deletions(-)

diff --git a/olive/common/hf/utils.py b/olive/common/hf/utils.py
index 74326e83f..c8cc39ece 100644
--- a/olive/common/hf/utils.py
+++ b/olive/common/hf/utils.py
@@ -102,11 +102,24 @@ def _load_test_model(model_class: type, model_config: "PretrainedConfig", trust_
     return model_class.from_config(model_config, **from_config_kwargs)
 
 
-def _save_test_model(model: "PreTrainedModel", output_dir: str, test_model_config: Optional[dict[str, Any]] = None):
+def _save_test_model(
+    model: "PreTrainedModel",
+    output_dir: str,
+    test_model_config: Optional[dict[str, Any]] = None,
+    model_name_or_path: Optional[str] = None,
+):
     output_path = Path(output_dir)
     output_path.mkdir(parents=True, exist_ok=True)
     logger.info("Saving generated test model to %s", output_path)
     model.save_pretrained(str(output_path))
+    if model_name_or_path:
+        # Save the reference tokenizer alongside the weights so the test model directory is
+        # self-contained (e.g. for OnnxDiscrepancyCheck and ONNX Runtime GenAI generation).
+        try:
+            tokenizer = get_tokenizer(model_name_or_path)
+            save_tokenizer(tokenizer, str(output_path))
+        except Exception as e:  # pylint: disable=broad-except
+            logger.debug("Could not save tokenizer for test model from %r: %s", model_name_or_path, e)
     _write_test_model_marker(output_path, test_model_config)
 
 
@@ -206,12 +219,16 @@ def load_model_from_task(
                         model = from_pretrained(model_class, test_model_path, "model", **kwargs)
                     else:
                         model = _load_test_model(model_class, model_config, kwargs.get("trust_remote_code"))
-                        _save_test_model(model, test_model_path, test_model_config)
+                        _save_test_model(
+                            model, test_model_path, test_model_config, model_name_or_path=model_name_or_path
+                        )
                 else:
                     _validate_path(test_model_dir, test_model_path)
                     model = _load_test_model(model_class, model_config, kwargs.get("trust_remote_code"))
                     if test_model_path:
-                        _save_test_model(model, test_model_path, test_model_config)
+                        _save_test_model(
+                            model, test_model_path, test_model_config, model_name_or_path=model_name_or_path
+                        )
             else:
                 model = from_pretrained(model_class, model_name_or_path, "model", **kwargs)
             logger.debug("Loaded model %s with name_or_path %s", model_class, model_name_or_path)
diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 553050cb6..365c69e13 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -431,10 +431,10 @@ def _load_reference_model(self, model: ONNXModelHandler, config: type[BasePassCo
         from transformers import AutoConfig, AutoModelForCausalLM
 
         # Resolve the reference model path.  Use the configured path if it exists as a local
-        # directory; otherwise fall back to the ``reference_hf_model`` copy that ModelBuilder
-        # saves alongside the ONNX output.  That copy is written on the first successful build
-        # and is preserved across engine cache hits, so OnnxDiscrepancyCheck keeps working even
-        # when the original ``test_model_path`` (e.g. ``out/tiny-test``) has been deleted.
+        # directory; otherwise fall back to a ``reference_hf_model`` directory saved alongside the
+        # ONNX output.  The reference model is normally kept at ``<output_path>/reference_hf_model``
+        # (written by SaveTestModelConfig / the test-model flow) and persists across engine cache
+        # hits, so this fallback only triggers if the configured path has been removed.
         ref_path = config.reference_model_path
         if not Path(ref_path).is_dir():
             hf_ref_dir = (model.model_attributes or {}).get("hf_reference_model_dir", "reference_hf_model")
diff --git a/olive/passes/onnx/model_builder.py b/olive/passes/onnx/model_builder.py
index e52b8c97e..5321c13d1 100644
--- a/olive/passes/onnx/model_builder.py
+++ b/olive/passes/onnx/model_builder.py
@@ -9,7 +9,6 @@
 import json
 import logging
 import os
-import shutil
 from enum import IntEnum
 from pathlib import Path
 from typing import Any, ClassVar, Union
@@ -380,18 +379,6 @@ def _run_for_config(
                     output_model_filepath.parent,
                 )
 
-        # When a test model was used as the build input, save a copy alongside the generated
-        # ONNX files so it survives across engine cache hits.  OnnxDiscrepancyCheck will fall
-        # back to this copy when ``reference_model_path`` (e.g. ``out/tiny-test``) no longer
-        # exists on disk — for example on subsequent ``olive run`` invocations that hit the
-        # engine output cache and never re-execute this pass.
-        if not metadata_only and model.test_model_config:
-            ref_copy_path = output_model_filepath.parent / "reference_hf_model"
-            if not ref_copy_path.exists():
-                shutil.copytree(model_path, str(ref_copy_path))
-                logger.debug("Saved reference HF model copy to %s", ref_copy_path)
-            model_attributes["hf_reference_model_dir"] = "reference_hf_model"
-
         # add additional files generated by model builder to model_attributes
         additional_files = model_attributes.get("additional_files") or []
         if metadata_only:
@@ -405,9 +392,8 @@ def _run_for_config(
             primary_model_paths = {str(fp) for fp in primary_onnx_files}
             model_attributes["additional_files"] = sorted(
                 set(additional_files)
-                # all files in the output directory except the model, model.data files,
-                # and the reference HF model copy (handled separately via model_attributes)
-                | {str(fp) for fp in output_model_filepath.parent.iterdir() if fp.name != "reference_hf_model"}
+                # all files in the output directory except the model and model.data files
+                | {str(fp) for fp in output_model_filepath.parent.iterdir()}
                 - primary_model_paths
                 - {f"{path}.data" for path in primary_model_paths}
             )
diff --git a/test/cli/test_cli_test_model_smoke.py b/test/cli/test_cli_test_model_smoke.py
index e0f694ec0..7bef9a703 100644
--- a/test/cli/test_cli_test_model_smoke.py
+++ b/test/cli/test_cli_test_model_smoke.py
@@ -162,6 +162,9 @@ def _assert_smoke_flows(self, tmp_path: Path):
             "config.json",
             "generation_config.json",
             "model.safetensors",
+            "special_tokens_map.json",
+            "tokenizer.json",
+            "tokenizer_config.json",
             TEST_MODEL_MARKER_FILE,
         }
         expected_run_output_files = {
diff --git a/test/common/test_hf.py b/test/common/test_hf.py
index c3095382f..4204606e0 100644
--- a/test/common/test_hf.py
+++ b/test/common/test_hf.py
@@ -54,6 +54,34 @@ def test_load_model_from_task_test_model_config(model_config, hidden_layers_attr
     assert getattr(mock_model_class.from_config.call_args.args[0], hidden_layers_attr) == 2
 
 
+def test_load_model_from_task_test_model_config_saves_tokenizer(tmp_path):
+    """The reference tokenizer should be saved into the test model directory."""
+    model_config = BertConfig(num_hidden_layers=12)  # pylint: disable=unexpected-keyword-arg
+    created_model = MagicMock()
+    test_model_path = tmp_path / "saved_test_model"
+    mock_tokenizer = MagicMock()
+
+    with (
+        patch("transformers.pipelines.check_task") as mock_check_task,
+        patch("olive.common.hf.utils.from_pretrained", return_value=model_config),
+        patch("olive.common.hf.utils.get_tokenizer", return_value=mock_tokenizer) as mock_get_tokenizer,
+        patch("olive.common.hf.utils.save_tokenizer") as mock_save_tokenizer,
+    ):
+        mock_model_class = MagicMock()
+        mock_model_class.from_config.return_value = created_model
+        mock_check_task.return_value = ("text-classification", {"pt": (mock_model_class,)}, None)
+
+        load_model_from_task(
+            "text-classification",
+            "dummy-model",
+            test_model_config={"num_hidden_layers": 2},
+            test_model_path=str(test_model_path),
+        )
+
+    mock_get_tokenizer.assert_called_once_with("dummy-model")
+    mock_save_tokenizer.assert_called_once_with(mock_tokenizer, str(test_model_path))
+
+
 def test_load_model_from_task_test_model_config_fails_without_fallback():
     model_config = BertConfig(num_hidden_layers=12)  # pylint: disable=unexpected-keyword-arg
 

From 6e9ad3e71f97686aaeb76837b50f806728598ea8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Thu, 2 Jul 2026 12:38:04 +0200
Subject: [PATCH 61/80] simplifies how models are saved

---
 olive/passes/onnx/discrepancy_check.py | 39 ++++----------------------
 1 file changed, 5 insertions(+), 34 deletions(-)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 365c69e13..798a4c2bb 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -230,18 +230,10 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
                 type_=Optional[str],
                 default_value=None,
                 description=(
-                    "Directory where discrepancy check results and reference model are saved. "
+                    "Directory where discrepancy check results are saved. "
                     "If not specified, results are written to the pass cache directory."
                 ),
             ),
-            "save_reference_model_state_dict": PassConfigParam(
-                type_=bool,
-                default_value=False,
-                description=(
-                    "Save the reference PyTorch model weights (state_dict) alongside the results. "
-                    "This allows direct comparison between the reference and optimized models."
-                ),
-            ),
             "test_metrics": PassConfigParam(
                 type_=Optional[list[str]],
                 default_value=None,
@@ -366,12 +358,12 @@ def _run_for_config(
         self, model: ONNXModelHandler, config: type[BasePassConfig], output_model_path: str
     ) -> ONNXModelHandler:
         dataloader, io_config = self._prepare_dataloader(model)
-        ref_model, ref_cfg, ref_path = self._load_reference_model(model, config)
+        ref_model, ref_path = self._load_reference_model(model, config)
 
         device, execution_provider, torch_device, weight_dtype = self._resolve_execution_device(model)
         ref_model = self._cast_reference_model(ref_model, weight_dtype, torch_device)
 
-        report_dir = self._save_reference_artifacts(ref_model, ref_cfg, config, output_model_path)
+        report_dir = self._resolve_report_dir(config, output_model_path)
 
         session = model.prepare_session(
             device=device,
@@ -471,7 +463,7 @@ def _load_reference_model(self, model: ONNXModelHandler, config: type[BasePassCo
             ref_path,
             getattr(ref_cfg, "_attn_implementation", None),
         )
-        return ref_model, ref_cfg, ref_path
+        return ref_model, ref_path
 
     def _resolve_execution_device(self, model: ONNXModelHandler):
         import torch
@@ -520,21 +512,11 @@ def _cast_reference_model(self, ref_model, weight_dtype, torch_device):
             ref_model = ref_model.to(torch_device)
         return ref_model
 
-    def _save_reference_artifacts(self, ref_model, ref_cfg, config: type[BasePassConfig], output_model_path: str):
-        # Save reference PyTorch model for direct comparison
+    def _resolve_report_dir(self, config: type[BasePassConfig], output_model_path: str):
         report_dir = config.report_output_dir or output_model_path
         report_dir_path = Path(report_dir)
         if report_dir_path.suffix and not report_dir_path.is_dir():
             report_dir = str(report_dir_path.parent)
-        if config.save_reference_model_state_dict:
-            self._export_reference_model(ref_model, report_dir)
-
-        # Save the (potentially modified) model config alongside the results so the
-        # exact configuration used for this test run is always reproducible.
-        config_save_path = Path(report_dir) / "reference_model_config.json"
-        config_save_path.parent.mkdir(parents=True, exist_ok=True)
-        config_save_path.write_text(ref_cfg.to_json_string())
-        logger.info("Saved reference model config to %s", config_save_path)
         return report_dir
 
     def _compute_logits_discrepancy(self, ref_model, session, dataloader, io_config, torch_device):
@@ -1195,14 +1177,3 @@ def compare_llama_cpp(
         )
 
         return results
-
-    def _export_reference_model(self, ref_model, output_model_path: str):
-        """Save the reference PyTorch model weights for direct comparison."""
-        import torch
-
-        output_dir = Path(output_model_path)
-        output_dir.mkdir(parents=True, exist_ok=True)
-
-        ref_pt_path = output_dir / "reference_model.pt"
-        torch.save(ref_model.state_dict(), str(ref_pt_path))
-        logger.info("Reference PyTorch model saved to %s", ref_pt_path)

From 30fe727afa70e6e6b5b948c27f15728f05439359 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Thu, 2 Jul 2026 12:59:49 +0200
Subject: [PATCH 62/80] use the same metrics for llama-cpp

---
 olive/passes/onnx/discrepancy_check.py     | 39 +++++++++++++++++-----
 test/passes/onnx/test_discrepancy_check.py | 19 ++++++-----
 2 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 798a4c2bb..c07fe4253 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -304,7 +304,7 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
                 default_value=32,
                 description="Maximum number of new tokens to generate for the token sequence comparison.",
             ),
-            "time_to_first_n_tokens": PassConfigParam(
+            "first_n_tokens_timed": PassConfigParam(
                 type_=int,
                 default_value=5,
                 description=(
@@ -382,7 +382,9 @@ def _run_for_config(
 
         self._run_generation_comparison(model, config, ref_model, ref_path, generation_metrics, results)
 
-        self._run_llama_cpp_comparison(model, config, ref_model, ref_path, report_dir, results)
+        self._run_llama_cpp_comparison(
+            model, config, ref_model, ref_path, report_dir, generation_metrics, results
+        )
 
         self._save_results(model, results, report_dir)
         return model
@@ -680,7 +682,7 @@ def _run_generation_comparison(
 
         # first_token_20 generates 20 tokens; tf5t measures the time to the first 5 tokens.
         gen_max_new_tokens = 20 if "first_token_20" in generation_metrics else config.generate_max_new_tokens
-        gen_first_n = 5 if "tf5t" in generation_metrics else config.time_to_first_n_tokens
+        gen_first_n = 5 if "tf5t" in generation_metrics else config.first_n_tokens_timed
         gen_results = self.compare_generation(
             config,
             ref_model,
@@ -739,7 +741,9 @@ def _run_generation_comparison(
             results.setdefault("failures", []).append(gen_failure)
             logger.error("ONNX model discrepancy check FAILED: %s", gen_failure)
 
-    def _run_llama_cpp_comparison(self, model: ONNXModelHandler, config, ref_model, ref_path, report_dir, results):
+    def _run_llama_cpp_comparison(
+        self, model: ONNXModelHandler, config, ref_model, ref_path, report_dir, generation_metrics, results
+    ):
         # llama.cpp comparison: convert reference model to GGUF and compare latencies
         if not config.llama_cpp:
             return
@@ -757,6 +761,25 @@ def _run_llama_cpp_comparison(self, model: ONNXModelHandler, config, ref_model,
                 preconverted_gguf_path=preconverted_gguf_path,
             )
             results.update(llama_results)
+
+            # Surface the llama.cpp vs transformers first-token comparison alongside the
+            # transformers vs GenAI comparison when first_token_20 is requested.
+            if "first_token_20" in generation_metrics:
+                first_token_20 = results.setdefault("first_token_20", {})
+                transformers_first_token = llama_results.get("llama_cpp_pytorch_first_token_id")
+                llama_first_token = llama_results.get("llama_cpp_first_token_id")
+                first_token_20.setdefault("transformers_first_token", transformers_first_token)
+                first_token_20["llama_cpp_first_token"] = llama_first_token
+                first_token_20["llama_cpp_first_token_matches"] = llama_results.get(
+                    "llama_cpp_first_token_matches_pytorch"
+                )
+                logger.info(
+                    "OnnxDiscrepancyCheck first_token_20 (llama.cpp): matches=%s "
+                    "(transformers=%s, llama_cpp=%s)",
+                    llama_results.get("llama_cpp_first_token_matches_pytorch"),
+                    transformers_first_token,
+                    llama_first_token,
+                )
         except Exception as exc:
             logger.exception("OnnxDiscrepancyCheck llama.cpp comparison failed.")
             results["status"] = "failed"
@@ -862,7 +885,7 @@ def compare_generation(
         Returns a dict with the longest common token sequence length, the first-generated-token
         match between transformers and ONNX Runtime GenAI, and the time-to-first-token and
         time-to-first-N-tokens latencies (in seconds) for both, where N is ``first_n``
-        (defaults to ``config.time_to_first_n_tokens``).
+        (defaults to ``config.first_n_tokens_timed``).
 
         ``genai_model_path``, ``max_new_tokens`` and ``first_n`` override the corresponding
         config values when provided, which lets the caller request specific metrics such as
@@ -878,7 +901,7 @@ def compare_generation(
         tokenizer = AutoTokenizer.from_pretrained(ref_model_path)
 
         max_new_tokens = config.generate_max_new_tokens if max_new_tokens is None else max_new_tokens
-        first_n_config = config.time_to_first_n_tokens if first_n is None else first_n
+        first_n_config = config.first_n_tokens_timed if first_n is None else first_n
         first_n = max(1, min(first_n_config, max_new_tokens)) if max_new_tokens > 0 else 0
 
         # Transformers generation
@@ -965,7 +988,7 @@ def __call__(self, generated_ids, scores, **kwargs) -> bool:
 
         gen_results = {
             "longest_common_token_sequence": longest_common,
-            "time_to_first_n_tokens": first_n,
+            "first_n_tokens_timed": first_n,
             "transformers_first_token": transformers_first_token,
             "genai_first_token": genai_first_token,
             "first_token_matches": first_token_matches,
@@ -1091,7 +1114,7 @@ def compare_llama_cpp(
         pytorch_first_token_id = int(gen_out[0, -1].item())
 
         max_new_tokens = config.generate_max_new_tokens
-        first_n = max(1, min(config.time_to_first_n_tokens, max_new_tokens)) if max_new_tokens > 0 else 1
+        first_n = max(1, min(config.first_n_tokens_timed, max_new_tokens)) if max_new_tokens > 0 else 1
 
         output_dir_path = Path(output_dir)
         output_dir_path.mkdir(parents=True, exist_ok=True)
diff --git a/test/passes/onnx/test_discrepancy_check.py b/test/passes/onnx/test_discrepancy_check.py
index 03338a3ef..077facb86 100644
--- a/test/passes/onnx/test_discrepancy_check.py
+++ b/test/passes/onnx/test_discrepancy_check.py
@@ -62,7 +62,7 @@ def test_compare_generation_returns_common_prefix_length(self):
         config.genai_model_path = "mock_genai_model"
         config.generate_prompt = "Hello world"
         config.generate_max_new_tokens = 10
-        config.time_to_first_n_tokens = 5
+        config.first_n_tokens_timed = 5
 
         # Mock transformers tokenizer and model
         mock_tokenizer = MagicMock()
@@ -114,7 +114,7 @@ def get_next_tokens_side_effect():
         # Common prefix: [1, 2, 3, 10, 11] = 5 tokens before divergence
         assert result["longest_common_token_sequence"] == 5
         # Latency metrics are exposed for both transformers and ONNX Runtime GenAI.
-        assert result["time_to_first_n_tokens"] == 5
+        assert result["first_n_tokens_timed"] == 5
         for key in (
             "transformers_time_to_first_token_s",
             "transformers_time_to_first_n_tokens_s",
@@ -135,7 +135,7 @@ def test_compare_generation_fully_matching(self):
         config.genai_model_path = "mock_genai_model"
         config.generate_prompt = "Test"
         config.generate_max_new_tokens = 5
-        config.time_to_first_n_tokens = 5
+        config.first_n_tokens_timed = 5
 
         mock_tokenizer = MagicMock()
         mock_tokenizer.return_value = MagicMock(input_ids=torch.tensor([[10, 20]]))
@@ -181,7 +181,7 @@ def get_next_tokens_side_effect():
         mock_generator.append_tokens.assert_called_once_with([[10, 20]])
         # All 5 tokens match
         assert result["longest_common_token_sequence"] == 5
-        assert result["time_to_first_n_tokens"] == 5
+        assert result["first_n_tokens_timed"] == 5
         for key in (
             "transformers_time_to_first_token_s",
             "transformers_time_to_first_n_tokens_s",
@@ -206,7 +206,7 @@ def test_compare_generation_with_zero_max_new_tokens(self):
         config.genai_model_path = "mock_genai_model"
         config.generate_prompt = "Test"
         config.generate_max_new_tokens = 0
-        config.time_to_first_n_tokens = 5
+        config.first_n_tokens_timed = 5
 
         mock_tokenizer = MagicMock()
         mock_tokenizer.return_value = MagicMock(input_ids=torch.tensor([[10, 20]]))
@@ -237,7 +237,7 @@ def test_compare_generation_with_zero_max_new_tokens(self):
 
         assert mock_ref_model.generate.call_count == 1
         assert mock_ref_model.generate.call_args.kwargs["max_new_tokens"] == 0
-        assert result["time_to_first_n_tokens"] == 0
+        assert result["first_n_tokens_timed"] == 0
         assert result["transformers_time_to_first_token_s"] is None
         assert result["transformers_time_to_first_n_tokens_s"] is None
 
@@ -252,7 +252,7 @@ def test_compare_generation_reports_first_token_match(self):
         config.genai_model_path = None
         config.generate_prompt = "Hello world"
         config.generate_max_new_tokens = 10
-        config.time_to_first_n_tokens = 5
+        config.first_n_tokens_timed = 5
 
         mock_tokenizer = MagicMock()
         mock_tokenizer.return_value = MagicMock(input_ids=torch.tensor([[1, 2, 3]]))
@@ -319,7 +319,7 @@ def test_compare_generation_reports_first_token_mismatch(self):
         config.genai_model_path = "mock_genai_model"
         config.generate_prompt = "Hello"
         config.generate_max_new_tokens = 10
-        config.time_to_first_n_tokens = 5
+        config.first_n_tokens_timed = 5
 
         mock_tokenizer = MagicMock()
         mock_tokenizer.return_value = MagicMock(input_ids=torch.tensor([[1, 2]]))
@@ -552,7 +552,7 @@ def _make_config(self):
         config.reference_model_path = "mock_model"
         config.generate_prompt = "Hello world"
         config.generate_max_new_tokens = 10
-        config.time_to_first_n_tokens = 5
+        config.first_n_tokens_timed = 5
         config.llama_cpp_env_path = "/mock/llama_env"
         return config
 
@@ -770,3 +770,4 @@ def test_compare_llama_cpp_uses_preconverted_gguf(self, tmp_path):
         assert result["llama_cpp_first_token_id"] == 7
         mock_convert_script.assert_not_called()
         assert mock_subprocess_run.call_count == 1
+

From 3e4d9ddec630606094c00ef44723daa20d1183d5 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 2 Jul 2026 11:12:31 +0000
Subject: [PATCH 63/80] Make special_tokens_map.json optional in fast test
 model smoke assertion

---
 test/cli/test_cli_test_model_smoke.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/test/cli/test_cli_test_model_smoke.py b/test/cli/test_cli_test_model_smoke.py
index 7bef9a703..e9fd6874e 100644
--- a/test/cli/test_cli_test_model_smoke.py
+++ b/test/cli/test_cli_test_model_smoke.py
@@ -162,11 +162,13 @@ def _assert_smoke_flows(self, tmp_path: Path):
             "config.json",
             "generation_config.json",
             "model.safetensors",
-            "special_tokens_map.json",
             "tokenizer.json",
             "tokenizer_config.json",
             TEST_MODEL_MARKER_FILE,
         }
+        # Some transformers versions additionally emit special_tokens_map.json when saving the
+        # tokenizer; treat it as optional so the assertion is version independent.
+        optional_test_model_files = {"special_tokens_map.json"}
         expected_run_output_files = {
             "config.json",
             "genai_config.json",
@@ -181,7 +183,9 @@ def _assert_smoke_flows(self, tmp_path: Path):
             with self.subTest(model_id=model_id):
                 config_path, test_model_dir, run_output_dir = _run_documented_test_model_smoke_flow(tmp_path, model_id)
                 assert config_path.exists()
-                assert self._list_relative_files(test_model_dir) == expected_test_model_files
+                assert (
+                    self._list_relative_files(test_model_dir) - optional_test_model_files == expected_test_model_files
+                )
                 run_output_files = self._list_relative_files(run_output_dir)
                 assert expected_run_output_files.issubset(run_output_files)
                 self._assert_file_size_below_limit(test_model_dir / "model.safetensors")

From e685ff19d30ec16bf90971bf2f13e309a169497f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Thu, 2 Jul 2026 13:21:39 +0200
Subject: [PATCH 64/80] use the same metrics for llama-cpp

---
 olive/passes/onnx/discrepancy_check.py     | 33 ++++++++++++++++------
 test/passes/onnx/test_discrepancy_check.py |  3 ++
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index c07fe4253..a57c00ce8 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -773,12 +773,16 @@ def _run_llama_cpp_comparison(
                 first_token_20["llama_cpp_first_token_matches"] = llama_results.get(
                     "llama_cpp_first_token_matches_pytorch"
                 )
+                first_token_20["llama_cpp_matching_leading_tokens"] = llama_results.get(
+                    "llama_cpp_longest_common_token_sequence"
+                )
                 logger.info(
                     "OnnxDiscrepancyCheck first_token_20 (llama.cpp): matches=%s "
-                    "(transformers=%s, llama_cpp=%s)",
+                    "(transformers=%s, llama_cpp=%s), matching_leading_tokens=%s",
                     llama_results.get("llama_cpp_first_token_matches_pytorch"),
                     transformers_first_token,
                     llama_first_token,
+                    llama_results.get("llama_cpp_longest_common_token_sequence"),
                 )
         except Exception as exc:
             logger.exception("OnnxDiscrepancyCheck llama.cpp comparison failed.")
@@ -1107,15 +1111,20 @@ def compare_llama_cpp(
         encoded = tokenizer(config.generate_prompt, return_tensors="pt")
         prompt_token_ids: list[int] = encoded["input_ids"][0].tolist()
 
-        # Run one-token generation with transformers to get the reference first token
-        input_ids = torch.tensor([prompt_token_ids]).to(ref_model.device)
-        with torch.no_grad():
-            gen_out = ref_model.generate(input_ids, max_new_tokens=1, do_sample=False)
-        pytorch_first_token_id = int(gen_out[0, -1].item())
-
         max_new_tokens = config.generate_max_new_tokens
         first_n = max(1, min(config.first_n_tokens_timed, max_new_tokens)) if max_new_tokens > 0 else 1
 
+        # Run generation with transformers to get the reference first token and the leading
+        # token sequence used for the longest-common-token comparison against llama.cpp.
+        input_ids = torch.tensor([prompt_token_ids]).to(ref_model.device)
+        with torch.no_grad():
+            gen_out = ref_model.generate(input_ids, max_new_tokens=max(1, max_new_tokens), do_sample=False)
+        pytorch_tokens: list[int] = gen_out[0].cpu().tolist()
+        prompt_token_count = len(prompt_token_ids)
+        pytorch_first_token_id = (
+            pytorch_tokens[prompt_token_count] if len(pytorch_tokens) > prompt_token_count else None
+        )
+
         output_dir_path = Path(output_dir)
         output_dir_path.mkdir(parents=True, exist_ok=True)
         model_dir = str(output_dir_path / "hf_model")
@@ -1165,10 +1174,16 @@ def compare_llama_cpp(
         llama_out: dict = json.loads(proc.stdout)
 
         llama_first_token_id: Optional[int] = llama_out.get("first_token_id")
+        llama_generated_tokens: list[int] = llama_out.get("generated_tokens") or []
         llama_ttft: Optional[float] = llama_out.get("ttft")
         llama_ttfn: Optional[float] = llama_out.get("ttfn")
         llama_total: Optional[float] = llama_out.get("total_time")
 
+        # Longest common leading token sequence between transformers and llama.cpp,
+        # measured from the beginning of the (identical) prompt through the generated tokens.
+        llama_tokens = prompt_token_ids + llama_generated_tokens
+        llama_longest_common = _longest_common_token_sequence(pytorch_tokens, llama_tokens)
+
         # Speedup: compare llama.cpp TTFT with single-pass PyTorch / ONNX latency
         llama_speedup_vs_pytorch: Optional[float] = (
             pytorch_latency_s / llama_ttft if (pytorch_latency_s is not None and llama_ttft) else None
@@ -1181,6 +1196,7 @@ def compare_llama_cpp(
             "llama_cpp_pytorch_first_token_id": pytorch_first_token_id,
             "llama_cpp_first_token_id": llama_first_token_id,
             "llama_cpp_first_token_matches_pytorch": llama_first_token_id == pytorch_first_token_id,
+            "llama_cpp_longest_common_token_sequence": llama_longest_common,
             "llama_cpp_ttft_s": llama_ttft,
             "llama_cpp_ttfn_s": llama_ttfn,
             "llama_cpp_total_time_s": llama_total,
@@ -1190,8 +1206,9 @@ def compare_llama_cpp(
 
         logger.info(
             "OnnxDiscrepancyCheck llama.cpp comparison: first_token_matches_pytorch=%s, "
-            "ttft=%s, ttfn=%s, total=%s, speedup_vs_pytorch=%s, speedup_vs_onnx=%s",
+            "matching_leading_tokens=%s, ttft=%s, ttfn=%s, total=%s, speedup_vs_pytorch=%s, speedup_vs_onnx=%s",
             results["llama_cpp_first_token_matches_pytorch"],
+            llama_longest_common,
             _format_seconds(llama_ttft),
             _format_seconds(llama_ttfn),
             _format_seconds(llama_total),
diff --git a/test/passes/onnx/test_discrepancy_check.py b/test/passes/onnx/test_discrepancy_check.py
index 077facb86..f26416dfa 100644
--- a/test/passes/onnx/test_discrepancy_check.py
+++ b/test/passes/onnx/test_discrepancy_check.py
@@ -651,6 +651,7 @@ def test_compare_llama_cpp_returns_expected_metrics(self, tmp_path):
             "llama_cpp_first_token_id",
             "llama_cpp_pytorch_first_token_id",
             "llama_cpp_first_token_matches_pytorch",
+            "llama_cpp_longest_common_token_sequence",
             "llama_cpp_ttft_s",
             "llama_cpp_ttfn_s",
             "llama_cpp_total_time_s",
@@ -660,6 +661,8 @@ def test_compare_llama_cpp_returns_expected_metrics(self, tmp_path):
         assert expected_keys <= set(result.keys())
 
         assert result["llama_cpp_first_token_id"] == 42
+        # Prompt [1, 2, 3] + first generated token 42 match transformers [1, 2, 3, 42] = 4 leading tokens.
+        assert result["llama_cpp_longest_common_token_sequence"] == 4
         assert result["llama_cpp_ttft_s"] == pytest.approx(0.05)
         assert result["llama_cpp_ttfn_s"] == pytest.approx(0.25)
         assert result["llama_cpp_total_time_s"] == pytest.approx(0.50)

From 3490d1caaeace14bb9bce1fc504c34b2cfde5682 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Thu, 2 Jul 2026 13:36:35 +0200
Subject: [PATCH 65/80] fix the maximum number of tokens to look at

---
 olive/passes/onnx/discrepancy_check.py     | 16 +++++++++++-----
 test/passes/onnx/test_discrepancy_check.py |  4 ++--
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index a57c00ce8..93e19bc1c 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -751,6 +751,9 @@ def _run_llama_cpp_comparison(
         if model.model_attributes:
             preconverted_gguf_path = model.model_attributes.get("reference_gguf_model_path")
         try:
+            # first_token_20 restricts the comparison to a 20-token generation, mirroring the
+            # transformers vs GenAI path so no more than 20 generated tokens are validated.
+            gen_max_new_tokens = 20 if "first_token_20" in generation_metrics else config.generate_max_new_tokens
             llama_results = self.compare_llama_cpp(
                 config,
                 ref_model,
@@ -759,6 +762,7 @@ def _run_llama_cpp_comparison(
                 onnx_latency_s=results.get("onnx_latency_s"),
                 ref_model_path=ref_path,
                 preconverted_gguf_path=preconverted_gguf_path,
+                max_new_tokens=gen_max_new_tokens,
             )
             results.update(llama_results)
 
@@ -1081,6 +1085,7 @@ def compare_llama_cpp(
         *,
         ref_model_path: str,
         preconverted_gguf_path: Optional[str] = None,
+        max_new_tokens: Optional[int] = None,
     ) -> dict:
         """Convert the reference model to GGUF and compare inference with llama.cpp.
 
@@ -1111,7 +1116,7 @@ def compare_llama_cpp(
         encoded = tokenizer(config.generate_prompt, return_tensors="pt")
         prompt_token_ids: list[int] = encoded["input_ids"][0].tolist()
 
-        max_new_tokens = config.generate_max_new_tokens
+        max_new_tokens = config.generate_max_new_tokens if max_new_tokens is None else max_new_tokens
         first_n = max(1, min(config.first_n_tokens_timed, max_new_tokens)) if max_new_tokens > 0 else 1
 
         # Run generation with transformers to get the reference first token and the leading
@@ -1179,10 +1184,11 @@ def compare_llama_cpp(
         llama_ttfn: Optional[float] = llama_out.get("ttfn")
         llama_total: Optional[float] = llama_out.get("total_time")
 
-        # Longest common leading token sequence between transformers and llama.cpp,
-        # measured from the beginning of the (identical) prompt through the generated tokens.
-        llama_tokens = prompt_token_ids + llama_generated_tokens
-        llama_longest_common = _longest_common_token_sequence(pytorch_tokens, llama_tokens)
+        # Longest common leading token sequence between transformers and llama.cpp, measured over
+        # the generated tokens only (the prompt is shared and identical).  This bounds the count by
+        # ``max_new_tokens`` so, e.g., first_token_20 never validates more than 20 generated tokens.
+        pytorch_generated_tokens = pytorch_tokens[prompt_token_count:]
+        llama_longest_common = _longest_common_token_sequence(pytorch_generated_tokens, llama_generated_tokens)
 
         # Speedup: compare llama.cpp TTFT with single-pass PyTorch / ONNX latency
         llama_speedup_vs_pytorch: Optional[float] = (
diff --git a/test/passes/onnx/test_discrepancy_check.py b/test/passes/onnx/test_discrepancy_check.py
index f26416dfa..63e5a135a 100644
--- a/test/passes/onnx/test_discrepancy_check.py
+++ b/test/passes/onnx/test_discrepancy_check.py
@@ -661,8 +661,8 @@ def test_compare_llama_cpp_returns_expected_metrics(self, tmp_path):
         assert expected_keys <= set(result.keys())
 
         assert result["llama_cpp_first_token_id"] == 42
-        # Prompt [1, 2, 3] + first generated token 42 match transformers [1, 2, 3, 42] = 4 leading tokens.
-        assert result["llama_cpp_longest_common_token_sequence"] == 4
+        # Generated-only comparison: transformers generated [42] vs llama.cpp [42, 43, ...] = 1 match.
+        assert result["llama_cpp_longest_common_token_sequence"] == 1
         assert result["llama_cpp_ttft_s"] == pytest.approx(0.05)
         assert result["llama_cpp_ttfn_s"] == pytest.approx(0.25)
         assert result["llama_cpp_total_time_s"] == pytest.approx(0.50)

From de87ad1673a8578cbd1190751ce96ded0549b495 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Thu, 2 Jul 2026 13:56:28 +0200
Subject: [PATCH 66/80] fix weights for config-only test model directories

---
 olive/common/hf/utils.py               | 15 ++++--
 olive/passes/onnx/model_builder.py     |  9 +++-
 test/passes/onnx/test_model_builder.py | 68 ++++++++++++++++++++++++++
 3 files changed, 86 insertions(+), 6 deletions(-)

diff --git a/olive/common/hf/utils.py b/olive/common/hf/utils.py
index c8cc39ece..6ff49a518 100644
--- a/olive/common/hf/utils.py
+++ b/olive/common/hf/utils.py
@@ -42,6 +42,16 @@ def is_test_model_dir(output_dir: Union[str, Path]) -> bool:
     return marker.get("type") == "olive_hf_test_model"
 
 
+def has_test_model_weights(output_dir: Union[str, Path]) -> bool:
+    """Return True if *output_dir* contains persisted model weight shards.
+
+    A config-only test-model directory (created by ``save_test_model_config`` during
+    ``--dry_run --test``) has a ``config.json`` and marker file but no weight shards yet.
+    """
+    output_path = Path(output_dir)
+    return any(output_path.glob("*.safetensors")) or any(output_path.glob("pytorch_model*.bin"))
+
+
 def _write_test_model_marker(output_dir: Union[str, Path], test_model_config: Optional[dict[str, Any]] = None):
     marker_path = _get_test_model_marker_path(output_dir)
     marker_path.write_text(
@@ -212,10 +222,7 @@ def load_model_from_task(
                     # and a marker file but no weight shards yet.  In that case, create a random
                     # model from the saved config and persist the weights so subsequent loads
                     # can use the saved directory directly.
-                    _has_weights = any(test_model_dir.glob("*.safetensors")) or any(
-                        test_model_dir.glob("pytorch_model*.bin")
-                    )
-                    if _has_weights:
+                    if has_test_model_weights(test_model_dir):
                         model = from_pretrained(model_class, test_model_path, "model", **kwargs)
                     else:
                         model = _load_test_model(model_class, model_config, kwargs.get("trust_remote_code"))
diff --git a/olive/passes/onnx/model_builder.py b/olive/passes/onnx/model_builder.py
index 5321c13d1..d862fe214 100644
--- a/olive/passes/onnx/model_builder.py
+++ b/olive/passes/onnx/model_builder.py
@@ -18,7 +18,7 @@
 from huggingface_hub.constants import HF_HUB_CACHE
 from packaging import version
 
-from olive.common.hf.utils import is_test_model_dir
+from olive.common.hf.utils import has_test_model_weights, is_test_model_dir
 from olive.constants import Precision
 from olive.hardware.accelerator import AcceleratorSpec, Device
 from olive.hardware.constants import ExecutionProvider
@@ -249,7 +249,12 @@ def _run_for_config(
                         "ModelBuilder requires test_model_path to be set when test_model_config is provided. "
                         "Please specify the path where the test model should be saved."
                     )
-                if not is_test_model_dir(model.test_model_path):
+                # Materialize the reference weights when the test-model directory is missing or only
+                # contains a config (as created by SaveTestModelConfig).  This guarantees the ONNX
+                # model is built from the exact same saved weights that OnnxDiscrepancyCheck later
+                # loads as the reference; otherwise the model builder would initialize its own
+                # weights and the discrepancy check would compare against a different model.
+                if not is_test_model_dir(model.test_model_path) or not has_test_model_weights(model.test_model_path):
                     model.load_model(cache_model=False)
                 model_path = str(Path(model.test_model_path).resolve())
             # provide the model path as input path, model builder uses input_path for quantized models
diff --git a/test/passes/onnx/test_model_builder.py b/test/passes/onnx/test_model_builder.py
index 0f71535db..634cc762a 100644
--- a/test/passes/onnx/test_model_builder.py
+++ b/test/passes/onnx/test_model_builder.py
@@ -187,6 +187,74 @@ def fake_create_model(*_, **kwargs):
     assert fake_builder.create_model.call_args.kwargs["input_path"] == str(test_model_path)
 
 
+def test_model_builder_materializes_weights_for_config_only_test_dir(tmp_path):
+    """A config-only test-model dir (no weights) must still trigger weight materialization.
+
+    Otherwise the model builder would initialize its own weights and the ONNX model would not
+    match the reference model that OnnxDiscrepancyCheck later loads from the same directory.
+    """
+    from olive.common.hf.utils import TEST_MODEL_MARKER_FILE
+
+    test_model_path = tmp_path / "reference_hf_model"
+    output_folder = tmp_path / "output_model"
+
+    # Pre-create a config-only Olive test-model directory: marker + config.json, but no weights.
+    test_model_path.mkdir(parents=True, exist_ok=True)
+    (test_model_path / "config.json").write_text("{}")
+    (test_model_path / TEST_MODEL_MARKER_FILE).write_text(
+        json.dumps({"type": "olive_hf_test_model", "test_model_config": {}})
+    )
+
+    mock_cfg = MagicMock()
+    mock_cfg.to_dict.return_value = {}
+    with patch.object(HfModelHandler, "get_hf_model_config", return_value=mock_cfg):
+        input_model = HfModelHandler(
+            model_path=TINY_RANDOM_LLAMA_MODEL_ID,
+            test_model_config={"hidden_layers": 2},
+            test_model_path=str(test_model_path),
+        )
+
+    def materialize_weights(*args, **kwargs):
+        (test_model_path / "model.safetensors").write_text("weights")
+        return MagicMock()
+
+    def fake_create_model(*_, **kwargs):
+        output_dir = Path(kwargs["output_dir"])
+        (output_dir / kwargs["filename"]).write_text("dummy onnx file")
+        (output_dir / "genai_config.json").write_text("{}")
+
+    fake_builder = types.ModuleType("onnxruntime_genai.models.builder")
+    fake_builder.create_model = MagicMock(side_effect=fake_create_model)
+    fake_models = types.ModuleType("onnxruntime_genai.models")
+    fake_models.builder = fake_builder
+    fake_ort_genai = types.ModuleType("onnxruntime_genai")
+    fake_ort_genai.models = fake_models
+    fake_ort_genai.__version__ = "0.0.0"
+
+    p = create_pass_from_dict(ModelBuilder, {"precision": "fp32"}, disable_search=True)
+
+    with (
+        patch.object(ModelBuilder, "maybe_patch_quant"),
+        patch.dict(
+            sys.modules,
+            {
+                "onnxruntime_genai": fake_ort_genai,
+                "onnxruntime_genai.models": fake_models,
+                "onnxruntime_genai.models.builder": fake_builder,
+            },
+        ),
+        patch.object(input_model, "load_model", side_effect=materialize_weights) as mock_load_model,
+        patch.object(input_model, "save_metadata", return_value=[]),
+    ):
+        output_model = p.run(input_model, output_folder)
+
+    assert isinstance(output_model, ONNXModelHandler)
+    # Weights were missing, so load_model must be called to persist them into the shared dir.
+    assert mock_load_model.call_count == 1
+    # The ONNX model is built from the shared test-model directory (same weights as the reference).
+    assert fake_builder.create_model.call_args.kwargs["input_path"] == str(test_model_path.resolve())
+
+
 def test_model_builder_apply_annotations_on_single_file_fallback(tmp_path, monkeypatch):
     def fake_create_model(
         model_name, input_path, output_dir, precision, execution_provider, cache_dir, filename, **kwargs

From ff1a77ea6a9bda207e662b56641f30471fe93ed9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Thu, 2 Jul 2026 14:36:55 +0200
Subject: [PATCH 67/80] compare generated tokens only (capped at 20)

---
 olive/passes/onnx/discrepancy_check.py     | 29 ++++++++++++++++------
 test/passes/onnx/test_discrepancy_check.py |  9 ++++---
 2 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 93e19bc1c..90acabf62 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -35,11 +35,17 @@ def _json_sanitize(obj):
 
 
 def _infer_shape(dynamic_shape, known_values=None):
+    # Use an empty past-KV cache (past_sequence_length=0) so the discrepancy check is a clean
+    # prefill comparison.  The dummy dataloader passes ``past_key_values.<i>.key/value`` tensors,
+    # but HuggingFace ``forward`` does not accept those dotted names as keyword arguments and
+    # silently drops them, so the reference model would run without a cache while the ONNX model
+    # would consume a (bogus, all-ones) cache -- producing a large, meaningless discrepancy.
+    # Keeping the past length at 0 makes both models perform the same prefill over ``input_ids``.
     default_values = {
         "batch_size": 1,
-        "past_sequence_length": 2,
-        "total_sequence_length": 3,
-        "sequence_length": 1,
+        "past_sequence_length": 0,
+        "sequence_length": 8,
+        "total_sequence_length": 8,
     }
     if known_values:
         default_values.update(known_values)
@@ -957,10 +963,13 @@ def __call__(self, generated_ids, scores, **kwargs) -> bool:
             transformers_ttfn = None
         transformers_tokens = transformers_output[0].cpu().tolist()
 
-        # ONNX Runtime GenAI generation
+        # ONNX Runtime GenAI generation.  Feed GenAI the exact same prompt token ids produced by the
+        # transformers tokenizer (including any special/BOS tokens) rather than re-encoding with the
+        # GenAI tokenizer.  ``og.Tokenizer.encode`` does not add special tokens by default, so
+        # re-encoding would drop the BOS token that transformers adds, giving the two models different
+        # inputs and a spurious first-token mismatch even when the models are numerically identical.
         genai_model = og.Model(genai_model_path)
-        genai_tokenizer = og.Tokenizer(genai_model)
-        genai_input_ids = genai_tokenizer.encode(config.generate_prompt)
+        genai_input_ids = input_ids[0].cpu().tolist()
 
         params = og.GeneratorParams(genai_model)
         params.set_search_options(max_length=len(genai_input_ids) + max_new_tokens, do_sample=False)
@@ -983,7 +992,13 @@ def __call__(self, generated_ids, scores, **kwargs) -> bool:
                 genai_ttfn = time.perf_counter() - start
         del generator
 
-        longest_common = _longest_common_token_sequence(transformers_tokens, genai_tokens)
+        # Longest common leading token sequence between transformers and ONNX Runtime GenAI, measured
+        # over the generated tokens only (the prompt is shared and identical since GenAI is fed the same
+        # token ids).  This bounds the count by ``max_new_tokens`` so, e.g., first_token_20 never
+        # validates more than 20 generated tokens.
+        transformers_generated_tokens = transformers_tokens[prompt_token_count:]
+        genai_generated_tokens = genai_tokens[genai_prompt_token_count:]
+        longest_common = _longest_common_token_sequence(transformers_generated_tokens, genai_generated_tokens)
 
         # First generated token comparison (transformers vs ONNX Runtime GenAI).
         transformers_first_token = (
diff --git a/test/passes/onnx/test_discrepancy_check.py b/test/passes/onnx/test_discrepancy_check.py
index 63e5a135a..991309ace 100644
--- a/test/passes/onnx/test_discrepancy_check.py
+++ b/test/passes/onnx/test_discrepancy_check.py
@@ -111,8 +111,9 @@ def get_next_tokens_side_effect():
             )
 
         mock_generator.append_tokens.assert_called_once_with([[1, 2, 3]])
-        # Common prefix: [1, 2, 3, 10, 11] = 5 tokens before divergence
-        assert result["longest_common_token_sequence"] == 5
+        # Generated-only common prefix: transformers [10, 11, 12, 13] vs genai [10, 11, 99, 99]
+        # matches on [10, 11] = 2 tokens before divergence (shared prompt is excluded).
+        assert result["longest_common_token_sequence"] == 2
         # Latency metrics are exposed for both transformers and ONNX Runtime GenAI.
         assert result["first_n_tokens_timed"] == 5
         for key in (
@@ -179,8 +180,8 @@ def get_next_tokens_side_effect():
             )
 
         mock_generator.append_tokens.assert_called_once_with([[10, 20]])
-        # All 5 tokens match
-        assert result["longest_common_token_sequence"] == 5
+        # All 3 generated tokens match (shared prompt is excluded)
+        assert result["longest_common_token_sequence"] == 3
         assert result["first_n_tokens_timed"] == 5
         for key in (
             "transformers_time_to_first_token_s",

From d476af604f9f883b07d182a7ed083a55eba50bde Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Thu, 2 Jul 2026 14:55:30 +0200
Subject: [PATCH 68/80] second token

---
 docs/source/how-to/cli/cli-fast-test.md    | 20 +++++----
 olive/passes/onnx/discrepancy_check.py     | 48 ++++++++++++++++++++--
 test/passes/onnx/test_discrepancy_check.py | 15 +++++++
 3 files changed, 71 insertions(+), 12 deletions(-)

diff --git a/docs/source/how-to/cli/cli-fast-test.md b/docs/source/how-to/cli/cli-fast-test.md
index b7f8b9ceb..edabf964f 100644
--- a/docs/source/how-to/cli/cli-fast-test.md
+++ b/docs/source/how-to/cli/cli-fast-test.md
@@ -2,7 +2,7 @@
 
 If you are converting a large language model, it is often useful to validate the Olive command, environment, and conversion recipe on a much smaller model before spending time on the full checkpoint.
 
-The `--test` option does that for Hugging Face models. Olive keeps the same model architecture, reduces it to a random **2-layer** test model, saves it to the folder you provide, and reuses that folder on later runs.
+The `--test` option does that for Hugging Face models. Olive keeps the same model architecture, reduces it to a random **2-layer** test model, saves it under `<output_path>/reference_hf_model`, and reuses that folder on later runs.
 
 This example uses [`Qwen/Qwen3-0.6B`](https://huggingface.co/Qwen/Qwen3-0.6B), but the same pattern works for other supported Hugging Face LLMs.
 
@@ -15,24 +15,28 @@ olive optimize \
     --provider CPUExecutionProvider \
     --precision int4 \
     --output_path out/qwen \
-    --test out/qwen-test-model
+    --test
 ```
 
 Because this example runs without `--dry_run`, it produces:
 
 - `out/qwen/olive_config.json` — the Olive configuration used for the run (named `olive_config.json` so it is never confused with the model's own `config.json`).
+- `out/qwen/reference_hf_model/` — the randomly initialized 2-layer reference model (weights, tokenizer, and `config.json`) that the ONNX model is compared against. It is created on the first run and reused afterwards.
 - `out/qwen/model/` — the optimized ONNX model.
 - `out/qwen/discrepancy_check_results.json` — the discrepancy report.
 
-It also inserts an `OnnxDiscrepancyCheck` pass (if one is not already present) that will compare the generated ONNX model against the 2-layer reference model.
+It also inserts an `OnnxDiscrepancyCheck` pass (if one is not already present) that compares the generated ONNX model against the 2-layer reference model. This pass only **loads** the reference model; it never saves a model or config itself (the reference model is materialized earlier in the workflow).
 
 Additional metrics can be requested via `--test_metrics` (space- or comma-separated):
 
-- `speedup`: ONNX-vs-PyTorch inference latency
-- `first_token_20`: compares the first generated token (over a 20-token generation) between ONNX Runtime GenAI and transformers
-- `tft`: time to the first generated token (reported for both ONNX Runtime GenAI and transformers)
-- `tf5t`: time to the first 5 generated tokens (reported for both ONNX Runtime GenAI and transformers)
+- `mae`: enforces the max absolute error between the ONNX and reference logits (clean prefill with an empty KV cache, so both models run the identical forward pass). This is the default when `--test_metrics` is omitted.
+- `speedup`: ONNX-vs-PyTorch inference latency.
+- `first_token_20`: generates 20 tokens and compares the outputs of ONNX Runtime GenAI and transformers. It reports `first_token_matches` and `second_token_matches` (whether the first and second generated tokens are identical, along with the token ids for each backend) and `matching_leading_tokens` — the number of leading generated tokens that match. `matching_leading_tokens` is measured over the **generated** tokens only (the shared prompt is excluded) and is therefore capped at the 20-token generation length.
+- `tft`: time to the first generated token (reported for both ONNX Runtime GenAI and transformers).
+- `tf5t`: time to the first 5 generated tokens (reported for both ONNX Runtime GenAI and transformers).
 
-For example, `--test_metrics mae,speedup,first_token_20,tft,tf5t`. The generation metrics (`first_token_20`, `tft`, `tf5t`) use the optimized ONNX model directory as the ONNX Runtime GenAI model when it contains a `genai_config.json` (as produced by the model builder).
+For example, `--test_metrics mae,speedup,first_token_20,tft,tf5t`. The generation metrics (`first_token_20`, `tft`, `tf5t`) use the optimized ONNX model directory as the ONNX Runtime GenAI model when it contains a `genai_config.json` (as produced by the model builder). To keep tokenization consistent, ONNX Runtime GenAI is fed the exact token ids produced by the transformers tokenizer (including any BOS/special tokens) rather than re-encoding the prompt itself.
+
+You can additionally compare against llama.cpp by passing `--test_llama_path <llama_env>` (a virtual environment with llama.cpp installed). When provided, `first_token_20` also reports `llama_cpp_first_token_matches`, `llama_cpp_second_token_matches`, and `llama_cpp_matching_leading_tokens`, again measured over generated tokens only and capped at the generation length.
 
 > **Note:** `--test_metrics` is always respected even when the config was generated by `olive optimize --test`, because Olive updates the existing `OnnxDiscrepancyCheck` settings each time `olive run --test` is invoked.
diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 90acabf62..a5a750f8d 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -707,14 +707,20 @@ def _run_generation_comparison(
                 "transformers_first_token": gen_results.get("transformers_first_token"),
                 "genai_first_token": gen_results.get("genai_first_token"),
                 "first_token_matches": gen_results.get("first_token_matches"),
+                "transformers_second_token": gen_results.get("transformers_second_token"),
+                "genai_second_token": gen_results.get("genai_second_token"),
+                "second_token_matches": gen_results.get("second_token_matches"),
                 "matching_leading_tokens": longest_common,
             }
             logger.info(
-                "OnnxDiscrepancyCheck first_token_20: matches=%s (transformers=%s, genai=%s), "
-                "matching_leading_tokens=%s",
+                "OnnxDiscrepancyCheck first_token_20: first_token_matches=%s (transformers=%s, genai=%s), "
+                "second_token_matches=%s (transformers=%s, genai=%s), matching_leading_tokens=%s",
                 gen_results.get("first_token_matches"),
                 gen_results.get("transformers_first_token"),
                 gen_results.get("genai_first_token"),
+                gen_results.get("second_token_matches"),
+                gen_results.get("transformers_second_token"),
+                gen_results.get("genai_second_token"),
                 longest_common,
             )
         if "tft" in generation_metrics:
@@ -783,15 +789,26 @@ def _run_llama_cpp_comparison(
                 first_token_20["llama_cpp_first_token_matches"] = llama_results.get(
                     "llama_cpp_first_token_matches_pytorch"
                 )
+                first_token_20.setdefault(
+                    "transformers_second_token", llama_results.get("llama_cpp_pytorch_second_token_id")
+                )
+                first_token_20["llama_cpp_second_token"] = llama_results.get("llama_cpp_second_token_id")
+                first_token_20["llama_cpp_second_token_matches"] = llama_results.get(
+                    "llama_cpp_second_token_matches_pytorch"
+                )
                 first_token_20["llama_cpp_matching_leading_tokens"] = llama_results.get(
                     "llama_cpp_longest_common_token_sequence"
                 )
                 logger.info(
-                    "OnnxDiscrepancyCheck first_token_20 (llama.cpp): matches=%s "
-                    "(transformers=%s, llama_cpp=%s), matching_leading_tokens=%s",
+                    "OnnxDiscrepancyCheck first_token_20 (llama.cpp): first_token_matches=%s "
+                    "(transformers=%s, llama_cpp=%s), second_token_matches=%s (transformers=%s, llama_cpp=%s), "
+                    "matching_leading_tokens=%s",
                     llama_results.get("llama_cpp_first_token_matches_pytorch"),
                     transformers_first_token,
                     llama_first_token,
+                    llama_results.get("llama_cpp_second_token_matches_pytorch"),
+                    llama_results.get("llama_cpp_pytorch_second_token_id"),
+                    llama_results.get("llama_cpp_second_token_id"),
                     llama_results.get("llama_cpp_longest_common_token_sequence"),
                 )
         except Exception as exc:
@@ -1009,12 +1026,26 @@ def __call__(self, generated_ids, scores, **kwargs) -> bool:
         )
         first_token_matches = transformers_first_token is not None and transformers_first_token == genai_first_token
 
+        # Second generated token comparison (transformers vs ONNX Runtime GenAI).
+        transformers_second_token = (
+            transformers_tokens[prompt_token_count + 1] if len(transformers_tokens) > prompt_token_count + 1 else None
+        )
+        genai_second_token = (
+            genai_tokens[genai_prompt_token_count + 1] if len(genai_tokens) > genai_prompt_token_count + 1 else None
+        )
+        second_token_matches = (
+            transformers_second_token is not None and transformers_second_token == genai_second_token
+        )
+
         gen_results = {
             "longest_common_token_sequence": longest_common,
             "first_n_tokens_timed": first_n,
             "transformers_first_token": transformers_first_token,
             "genai_first_token": genai_first_token,
             "first_token_matches": first_token_matches,
+            "transformers_second_token": transformers_second_token,
+            "genai_second_token": genai_second_token,
+            "second_token_matches": second_token_matches,
             "transformers_time_to_first_token_s": transformers_ttft,
             "transformers_time_to_first_n_tokens_s": transformers_ttfn,
             "genai_time_to_first_token_s": genai_ttft,
@@ -1144,6 +1175,9 @@ def compare_llama_cpp(
         pytorch_first_token_id = (
             pytorch_tokens[prompt_token_count] if len(pytorch_tokens) > prompt_token_count else None
         )
+        pytorch_second_token_id = (
+            pytorch_tokens[prompt_token_count + 1] if len(pytorch_tokens) > prompt_token_count + 1 else None
+        )
 
         output_dir_path = Path(output_dir)
         output_dir_path.mkdir(parents=True, exist_ok=True)
@@ -1195,6 +1229,7 @@ def compare_llama_cpp(
 
         llama_first_token_id: Optional[int] = llama_out.get("first_token_id")
         llama_generated_tokens: list[int] = llama_out.get("generated_tokens") or []
+        llama_second_token_id: Optional[int] = llama_generated_tokens[1] if len(llama_generated_tokens) > 1 else None
         llama_ttft: Optional[float] = llama_out.get("ttft")
         llama_ttfn: Optional[float] = llama_out.get("ttfn")
         llama_total: Optional[float] = llama_out.get("total_time")
@@ -1217,6 +1252,11 @@ def compare_llama_cpp(
             "llama_cpp_pytorch_first_token_id": pytorch_first_token_id,
             "llama_cpp_first_token_id": llama_first_token_id,
             "llama_cpp_first_token_matches_pytorch": llama_first_token_id == pytorch_first_token_id,
+            "llama_cpp_pytorch_second_token_id": pytorch_second_token_id,
+            "llama_cpp_second_token_id": llama_second_token_id,
+            "llama_cpp_second_token_matches_pytorch": (
+                pytorch_second_token_id is not None and llama_second_token_id == pytorch_second_token_id
+            ),
             "llama_cpp_longest_common_token_sequence": llama_longest_common,
             "llama_cpp_ttft_s": llama_ttft,
             "llama_cpp_ttfn_s": llama_ttfn,
diff --git a/test/passes/onnx/test_discrepancy_check.py b/test/passes/onnx/test_discrepancy_check.py
index 991309ace..a3f9d78e2 100644
--- a/test/passes/onnx/test_discrepancy_check.py
+++ b/test/passes/onnx/test_discrepancy_check.py
@@ -308,6 +308,10 @@ def get_next_tokens_side_effect():
         assert result["transformers_first_token"] == 10
         assert result["genai_first_token"] == 10
         assert result["first_token_matches"] is True
+        # transformers generated [10, 11, 12] and genai [10, 99, 99] -> second tokens differ.
+        assert result["transformers_second_token"] == 11
+        assert result["genai_second_token"] == 99
+        assert result["second_token_matches"] is False
 
     def test_compare_generation_reports_first_token_mismatch(self):
         """first_token_matches is False when the first generated tokens differ."""
@@ -364,6 +368,10 @@ def get_next_tokens_side_effect():
         assert result["transformers_first_token"] == 30
         assert result["genai_first_token"] == 40
         assert result["first_token_matches"] is False
+        # transformers generated [30, 31] and genai [40, 41] -> second tokens differ too.
+        assert result["transformers_second_token"] == 31
+        assert result["genai_second_token"] == 41
+        assert result["second_token_matches"] is False
 
 
 class TestWeightDtypeInference:
@@ -652,6 +660,9 @@ def test_compare_llama_cpp_returns_expected_metrics(self, tmp_path):
             "llama_cpp_first_token_id",
             "llama_cpp_pytorch_first_token_id",
             "llama_cpp_first_token_matches_pytorch",
+            "llama_cpp_second_token_id",
+            "llama_cpp_pytorch_second_token_id",
+            "llama_cpp_second_token_matches_pytorch",
             "llama_cpp_longest_common_token_sequence",
             "llama_cpp_ttft_s",
             "llama_cpp_ttfn_s",
@@ -662,6 +673,10 @@ def test_compare_llama_cpp_returns_expected_metrics(self, tmp_path):
         assert expected_keys <= set(result.keys())
 
         assert result["llama_cpp_first_token_id"] == 42
+        # transformers generated only one token ([42]), so there is no reference second token.
+        assert result["llama_cpp_pytorch_second_token_id"] is None
+        assert result["llama_cpp_second_token_id"] == 43
+        assert result["llama_cpp_second_token_matches_pytorch"] is False
         # Generated-only comparison: transformers generated [42] vs llama.cpp [42, 43, ...] = 1 match.
         assert result["llama_cpp_longest_common_token_sequence"] == 1
         assert result["llama_cpp_ttft_s"] == pytest.approx(0.05)

From 74130ca39848fed7685b90468d007853ad0afdc5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@users.noreply.github.com>
Date: Thu, 2 Jul 2026 15:23:08 +0200
Subject: [PATCH 69/80] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 olive/passes/pytorch/convert_hf_to_gguf.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/olive/passes/pytorch/convert_hf_to_gguf.py b/olive/passes/pytorch/convert_hf_to_gguf.py
index cc8765382..9654a7c57 100644
--- a/olive/passes/pytorch/convert_hf_to_gguf.py
+++ b/olive/passes/pytorch/convert_hf_to_gguf.py
@@ -47,7 +47,12 @@ def _get_python_executable(env_path: Path) -> str:
     def _run_for_config(
         self, model: HfModelHandler, config: type[BasePassConfig], output_model_path: str
     ) -> HfModelHandler:
-        source_path = Path(model.test_model_path or config.reference_model_path or "")
+        source = model.test_model_path or config.reference_model_path
+        if not source:
+            logger.info("ConvertHfToGGUF skipped: no source model directory was provided.")
+            return model
+
+        source_path = Path(source)
         if not source_path.is_dir():
             logger.info("ConvertHfToGGUF skipped: source model directory does not exist: %s", source_path)
             return model

From b3224f552e7ceb7aa199acfb3ee91f6cd4a4fd19 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 2 Jul 2026 13:33:24 +0000
Subject: [PATCH 70/80] Fix CLI to set max_mae and timing_iterations in
 OnnxDiscrepancyCheck pass config

---
 olive/cli/base.py    | 9 ++++-----
 olive/cli/run.py     | 6 ++++++
 test/cli/test_cli.py | 3 ++-
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/olive/cli/base.py b/olive/cli/base.py
index 0d428f96b..5b3a27299 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -227,9 +227,8 @@ def add_discrepancy_check_pass(
             # reset test_metrics to the default, discarding any speedup setting that was
             # written by a prior ``olive optimize --dry_run --test_metrics mae,speedup``.
             if metrics_explicit:
-                # Store the human-readable test_metrics list so users can see what is
-                # being evaluated by inspecting config.json (e.g. "test_metrics": ["speedup"]).
-                pass_cfg["test_metrics"] = sorted(selected_metrics)
+                pass_cfg["max_mae"] = 0.1 if "mae" in selected_metrics else None
+                pass_cfg["timing_iterations"] = 5 if "speedup" in selected_metrics else 0
             # Enable llama.cpp when a venv path is provided.
             if llama_env_path:
                 pass_cfg["llama_cpp"] = True
@@ -245,8 +244,8 @@ def add_discrepancy_check_pass(
         "type": "OnnxDiscrepancyCheck",
         "reference_model_path": reference_model_path,
         "report_output_dir": report_dir,
-        # Store the human-readable metric list so users can inspect what will be evaluated.
-        "test_metrics": sorted(selected_metrics),
+        "max_mae": 0.1 if "mae" in selected_metrics else None,
+        "timing_iterations": 5 if "speedup" in selected_metrics else 0,
     }
     # Enable llama.cpp comparison when a venv path is provided.
     if llama_env_path:
diff --git a/olive/cli/run.py b/olive/cli/run.py
index 31ff61657..a0a91e811 100644
--- a/olive/cli/run.py
+++ b/olive/cli/run.py
@@ -6,6 +6,8 @@
 
 from olive.cli.base import (
     BaseOliveCLICommand,
+    _flatten_test_metrics,
+    add_discrepancy_check_pass,
     add_hf_test_model_config,
     add_input_model_options,
     add_logging_options,
@@ -13,6 +15,7 @@
     get_input_model_config,
     mark_test_output_path,
     save_discrepancy_check_results,
+    validate_test_output_path,
 )
 from olive.telemetry import action
 
@@ -69,7 +72,10 @@ def run(self):
             output_path = (
                 self.args.output_path or run_config.get("output_dir") or run_config.get("engine", {}).get("output_dir")
             )
+            validate_test_output_path(output_path, self.args.test)
             run_config["input_model"] = add_hf_test_model_config(input_model, self.args.test, output_path)
+            test_metrics = _flatten_test_metrics(getattr(self.args, "test_metrics", None))
+            run_config = add_discrepancy_check_pass(run_config, test_metrics)
 
         for arg_key, rc_key in [("output_path", "output_dir"), ("log_level", "log_severity_level")]:
             if (arg_value := getattr(self.args, arg_key)) is not None:
diff --git a/test/cli/test_cli.py b/test/cli/test_cli.py
index 5e35462e8..b1a6cc56a 100644
--- a/test/cli/test_cli.py
+++ b/test/cli/test_cli.py
@@ -186,13 +186,14 @@ def test_workflow_run_command_with_test_override(mock_run, tmp_path):
             },
             "output_dir": output_dir,
             "passes": {
+                "save_test_model_config": {"type": "SaveTestModelConfig"},
                 "discrepancy_check": {
                     "type": "OnnxDiscrepancyCheck",
                     "reference_model_path": test_model_path,
                     "report_output_dir": output_dir,
                     "max_mae": 0.1,
                     "timing_iterations": 0,
-                }
+                },
             },
         },
         list_required_packages=False,

From f3327e9cf88a640f5460f6fc8506ec2fc7e574d7 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 2 Jul 2026 13:42:33 +0000
Subject: [PATCH 71/80] Fix discrepancy check metric defaults

---
 olive/cli/base.py    | 22 ++++++++--------------
 test/cli/test_cli.py |  1 +
 2 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/olive/cli/base.py b/olive/cli/base.py
index 5b3a27299..a64a9bf47 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -205,11 +205,8 @@ def add_discrepancy_check_pass(
     if report_dir and Path(report_dir).suffix and not Path(report_dir).is_dir():
         report_dir = str(Path(report_dir).parent)
 
-    # Only apply metric-related changes when the caller explicitly provided --test_metrics.
-    # When metrics is None (not supplied by the user), metric settings already present in
-    # the config (e.g. from a previous --dry_run --test_metrics run) are left untouched.
-    metrics_explicit = metrics is not None
-    selected_metrics = set(metrics) if metrics_explicit else {"mae"}
+    selected_metrics = list(metrics) if metrics is not None else ["mae"]
+    selected_metric_set = set(selected_metrics)
 
     # --- OnnxDiscrepancyCheck pass ---
     # If the pass already exists, update the dynamic runtime fields rather than re-creating it from
@@ -222,13 +219,9 @@ def add_discrepancy_check_pass(
             pass_cfg["reference_model_path"] = reference_model_path
             if report_dir is not None:
                 pass_cfg["report_output_dir"] = report_dir
-            # Only modify metric settings when --test_metrics was explicitly provided.
-            # Without this guard a bare ``olive run --test`` (no --test_metrics) would
-            # reset test_metrics to the default, discarding any speedup setting that was
-            # written by a prior ``olive optimize --dry_run --test_metrics mae,speedup``.
-            if metrics_explicit:
-                pass_cfg["max_mae"] = 0.1 if "mae" in selected_metrics else None
-                pass_cfg["timing_iterations"] = 5 if "speedup" in selected_metrics else 0
+            pass_cfg["test_metrics"] = selected_metrics
+            pass_cfg["max_mae"] = 0.1 if "mae" in selected_metric_set else None
+            pass_cfg["timing_iterations"] = 5 if "speedup" in selected_metric_set else 0
             # Enable llama.cpp when a venv path is provided.
             if llama_env_path:
                 pass_cfg["llama_cpp"] = True
@@ -244,8 +237,9 @@ def add_discrepancy_check_pass(
         "type": "OnnxDiscrepancyCheck",
         "reference_model_path": reference_model_path,
         "report_output_dir": report_dir,
-        "max_mae": 0.1 if "mae" in selected_metrics else None,
-        "timing_iterations": 5 if "speedup" in selected_metrics else 0,
+        "test_metrics": selected_metrics,
+        "max_mae": 0.1 if "mae" in selected_metric_set else None,
+        "timing_iterations": 5 if "speedup" in selected_metric_set else 0,
     }
     # Enable llama.cpp comparison when a venv path is provided.
     if llama_env_path:
diff --git a/test/cli/test_cli.py b/test/cli/test_cli.py
index b1a6cc56a..aeaeb5538 100644
--- a/test/cli/test_cli.py
+++ b/test/cli/test_cli.py
@@ -191,6 +191,7 @@ def test_workflow_run_command_with_test_override(mock_run, tmp_path):
                     "type": "OnnxDiscrepancyCheck",
                     "reference_model_path": test_model_path,
                     "report_output_dir": output_dir,
+                    "test_metrics": ["mae"],
                     "max_mae": 0.1,
                     "timing_iterations": 0,
                 },

From b3890e6b16db72e63e2e8ec277265da85ec61a29 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Thu, 2 Jul 2026 15:54:39 +0200
Subject: [PATCH 72/80] seed random test model weights for reproducibility

---
 olive/common/hf/utils.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/olive/common/hf/utils.py b/olive/common/hf/utils.py
index 6ff49a518..3c47c1860 100644
--- a/olive/common/hf/utils.py
+++ b/olive/common/hf/utils.py
@@ -101,7 +101,17 @@ def _apply_test_model_config(
 
 
 def _load_test_model(model_class: type, model_config: "PretrainedConfig", trust_remote_code: Optional[bool] = None):
-    """Instantiate a random-initialized HF model from config for test mode."""
+    """Instantiate a random-initialized HF model from config for test mode.
+
+    The random weights are seeded so the generated test model is deterministic and reproducible
+    across runs.  Without a fixed seed each run draws different weights; on a randomly-initialized
+    model many logits are near-tied, so the unavoidable ~1e-5 fp32 difference between the PyTorch
+    reference and the exported ONNX model can flip a greedy argmax and make generation-based test
+    metrics (e.g. ``first_token_20`` / ``matching_leading_tokens``) non-deterministic.  Seeding
+    removes that flakiness and makes the discrepancy check reproducible.
+    """
+    import torch
+
     from_config_signature = inspect.signature(model_class.from_config)
     supports_trust_remote_code = "trust_remote_code" in from_config_signature.parameters or any(
         parameter.kind == inspect.Parameter.VAR_KEYWORD for parameter in from_config_signature.parameters.values()
@@ -109,6 +119,7 @@ def _load_test_model(model_class: type, model_config: "PretrainedConfig", trust_
     from_config_kwargs = {}
     if supports_trust_remote_code and trust_remote_code is not None:
         from_config_kwargs["trust_remote_code"] = trust_remote_code
+    torch.manual_seed(0)
     return model_class.from_config(model_config, **from_config_kwargs)
 
 

From 9d04b063a66aec2f0a98a7e3c9fdc63a2ed912ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Thu, 2 Jul 2026 16:10:30 +0200
Subject: [PATCH 73/80] lint

---
 olive/passes/onnx/discrepancy_check.py     | 4 +---
 test/passes/onnx/test_discrepancy_check.py | 1 -
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index a5a750f8d..8ab3611d4 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -1033,9 +1033,7 @@ def __call__(self, generated_ids, scores, **kwargs) -> bool:
         genai_second_token = (
             genai_tokens[genai_prompt_token_count + 1] if len(genai_tokens) > genai_prompt_token_count + 1 else None
         )
-        second_token_matches = (
-            transformers_second_token is not None and transformers_second_token == genai_second_token
-        )
+        second_token_matches = transformers_second_token is not None and transformers_second_token == genai_second_token
 
         gen_results = {
             "longest_common_token_sequence": longest_common,
diff --git a/test/passes/onnx/test_discrepancy_check.py b/test/passes/onnx/test_discrepancy_check.py
index a3f9d78e2..d19e437b7 100644
--- a/test/passes/onnx/test_discrepancy_check.py
+++ b/test/passes/onnx/test_discrepancy_check.py
@@ -789,4 +789,3 @@ def test_compare_llama_cpp_uses_preconverted_gguf(self, tmp_path):
         assert result["llama_cpp_first_token_id"] == 7
         mock_convert_script.assert_not_called()
         assert mock_subprocess_run.call_count == 1
-

From 71737090c061e1e70957a619ddbf83606beb9252 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Thu, 2 Jul 2026 16:21:17 +0200
Subject: [PATCH 74/80] lint: put _run_llama_cpp_comparison call on one line

---
 olive/passes/onnx/discrepancy_check.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py
index 8ab3611d4..822ef03da 100644
--- a/olive/passes/onnx/discrepancy_check.py
+++ b/olive/passes/onnx/discrepancy_check.py
@@ -388,9 +388,7 @@ def _run_for_config(
 
         self._run_generation_comparison(model, config, ref_model, ref_path, generation_metrics, results)
 
-        self._run_llama_cpp_comparison(
-            model, config, ref_model, ref_path, report_dir, generation_metrics, results
-        )
+        self._run_llama_cpp_comparison(model, config, ref_model, ref_path, report_dir, generation_metrics, results)
 
         self._save_results(model, results, report_dir)
         return model

From bc078c131a53595592c6cf962d7c80fe9a58769a Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 2 Jul 2026 14:46:29 +0000
Subject: [PATCH 75/80] test: cover qwen3 fast smoke flow

---
 test/cli/test_cli_test_model_smoke.py | 49 ++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/test/cli/test_cli_test_model_smoke.py b/test/cli/test_cli_test_model_smoke.py
index e9fd6874e..71b1aef46 100644
--- a/test/cli/test_cli_test_model_smoke.py
+++ b/test/cli/test_cli_test_model_smoke.py
@@ -26,6 +26,7 @@
     "local/tiny-random-llama-b",
     "mistralai/Mistral-7B-Instruct-v0.3",
     "microsoft/Phi-3-mini-4k-instruct",
+    "Qwen/Qwen3-8B",
 )
 MAX_ARTIFACT_SIZE_BYTES = 1024 * 1024
 
@@ -44,7 +45,7 @@ def _save_local_tiny_llama(model_path: Path):
         LlamaConfig.from_dict(
             {
                 "vocab_size": 32,
-                "hidden_size": 64,
+                "hidden_size": 128,
                 "intermediate_size": 128,
                 "num_hidden_layers": 2,
                 "num_attention_heads": 8,
@@ -70,9 +71,49 @@ def _save_local_tiny_llama(model_path: Path):
     ).save_pretrained(model_path)
 
 
+def _save_local_tiny_qwen3(model_path: Path):
+    from transformers import PreTrainedTokenizerFast, Qwen3Config, Qwen3ForCausalLM
+
+    model = Qwen3ForCausalLM(
+        Qwen3Config(
+            vocab_size=32,
+            hidden_size=64,
+            intermediate_size=128,
+            num_hidden_layers=2,
+            num_attention_heads=4,
+            num_key_value_heads=4,
+            head_dim=16,
+            max_position_embeddings=64,
+            tie_word_embeddings=False,
+        )
+    )
+    model.save_pretrained(model_path)
+
+    tokenizer = Tokenizer(
+        WordLevel(
+            vocab={"<pad>": 0, "<bos>": 1, "<eos>": 2, "hello": 3, "world": 4},
+            unk_token="<pad>",
+        )
+    )
+    tokenizer.pre_tokenizer = Whitespace()
+    PreTrainedTokenizerFast(
+        tokenizer_object=tokenizer,
+        bos_token="<bos>",
+        eos_token="<eos>",
+        pad_token="<pad>",
+    ).save_pretrained(model_path)
+
+
+def _save_local_tiny_model(model_id: str, model_path: Path):
+    if model_id.startswith("Qwen/"):
+        _save_local_tiny_qwen3(model_path)
+    else:
+        _save_local_tiny_llama(model_path)
+
+
 def _set_offline_gptq_data_config(config_path: Path):
     config = json.loads(config_path.read_text())
-    # The tiny test model has hidden_size 64, so the default GPTQ group_size of 128
+    # The tiny smoke-test fixtures use small hidden sizes, so the default GPTQ group_size of 128
     # is too large (in_features must be divisible by group_size). Use a small group_size.
     config["passes"]["gptq"]["group_size"] = 32
     config["passes"]["gptq"]["data_config"] = {
@@ -106,7 +147,7 @@ def _run_documented_test_model_smoke_flow(tmp_path: Path, model_id: str):
     run_output_dir = tmp_path / f"{model_name}-test-run"
     test_model_dir = run_output_dir / "reference_hf_model"
 
-    _save_local_tiny_llama(model_path)
+    _save_local_tiny_model(model_id, model_path)
     # optimize -m arnir0/Tiny-LLM --device cpu --provider CPUExecutionProvider --precision int4 --output_path dump --dry_run
     _run_cli_main(
         [
@@ -238,7 +279,7 @@ def _assert_discrepancy_model_builder(self, tmp_path: Path, model_id: str):
         config_output_dir = tmp_path / f"{model_name}-disc-cfg"
         run_output_dir = tmp_path / f"{model_name}-disc-run"
 
-        _save_local_tiny_llama(model_path)
+        _save_local_tiny_model(model_id, model_path)
         _run_cli_main(
             [
                 "optimize",

From a793f43bc9112c86d0b22e4914b7f422e5cb6897 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Thu, 2 Jul 2026 16:53:17 +0200
Subject: [PATCH 76/80] add logging info

---
 olive/common/hf/utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/olive/common/hf/utils.py b/olive/common/hf/utils.py
index 3c47c1860..6e4919df9 100644
--- a/olive/common/hf/utils.py
+++ b/olive/common/hf/utils.py
@@ -120,7 +120,9 @@ def _load_test_model(model_class: type, model_config: "PretrainedConfig", trust_
     if supports_trust_remote_code and trust_remote_code is not None:
         from_config_kwargs["trust_remote_code"] = trust_remote_code
     torch.manual_seed(0)
-    return model_class.from_config(model_config, **from_config_kwargs)
+    model = model_class.from_config(model_config, **from_config_kwargs)
+    logger.info("Generating test model class %s", type(model))
+    return model
 
 
 def _save_test_model(

From fc3428d00e26bd3caf1f2636d2afe0d03a06b91c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 2 Jul 2026 15:33:22 +0000
Subject: [PATCH 77/80] Fix Qwen3 smoke test lint

---
 test/cli/test_cli_test_model_smoke.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/test/cli/test_cli_test_model_smoke.py b/test/cli/test_cli_test_model_smoke.py
index 71b1aef46..9b9e7911e 100644
--- a/test/cli/test_cli_test_model_smoke.py
+++ b/test/cli/test_cli_test_model_smoke.py
@@ -75,16 +75,18 @@ def _save_local_tiny_qwen3(model_path: Path):
     from transformers import PreTrainedTokenizerFast, Qwen3Config, Qwen3ForCausalLM
 
     model = Qwen3ForCausalLM(
-        Qwen3Config(
-            vocab_size=32,
-            hidden_size=64,
-            intermediate_size=128,
-            num_hidden_layers=2,
-            num_attention_heads=4,
-            num_key_value_heads=4,
-            head_dim=16,
-            max_position_embeddings=64,
-            tie_word_embeddings=False,
+        Qwen3Config.from_dict(
+            {
+                "vocab_size": 32,
+                "hidden_size": 64,
+                "intermediate_size": 128,
+                "num_hidden_layers": 2,
+                "num_attention_heads": 4,
+                "num_key_value_heads": 4,
+                "head_dim": 16,
+                "max_position_embeddings": 64,
+                "tie_word_embeddings": False,
+            }
         )
     )
     model.save_pretrained(model_path)

From ea9c047080e0955c818721c7bbc76871304f8f2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Thu, 2 Jul 2026 18:18:23 +0200
Subject: [PATCH 78/80] fix attn implementation

---
 olive/common/hf/utils.py | 42 ++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/olive/common/hf/utils.py b/olive/common/hf/utils.py
index 6e4919df9..12b40bf01 100644
--- a/olive/common/hf/utils.py
+++ b/olive/common/hf/utils.py
@@ -100,26 +100,30 @@ def _apply_test_model_config(
     return model_config
 
 
-def _load_test_model(model_class: type, model_config: "PretrainedConfig", trust_remote_code: Optional[bool] = None):
+def _load_test_model(
+    model_class: type,
+    model_config: "PretrainedConfig",
+    trust_remote_code: Optional[bool] = None,
+    attn_implementation: Optional[str] = None,
+):
     """Instantiate a random-initialized HF model from config for test mode.
 
-    The random weights are seeded so the generated test model is deterministic and reproducible
-    across runs.  Without a fixed seed each run draws different weights; on a randomly-initialized
-    model many logits are near-tied, so the unavoidable ~1e-5 fp32 difference between the PyTorch
-    reference and the exported ONNX model can flip a greedy argmax and make generation-based test
-    metrics (e.g. ``first_token_20`` / ``matching_leading_tokens``) non-deterministic.  Seeding
-    removes that flakiness and makes the discrepancy check reproducible.
+    ``attn_implementation`` (e.g. ``"sdpa"``, forwarded from the model's ``load_kwargs``) is passed
+    through to ``from_config`` so the random test model uses the requested attention implementation
+    rather than relying on the transformers default (which can be ``"eager"`` on some versions).
+    This keeps the generated test model consistent with the base/reference model.
     """
-    import torch
-
     from_config_signature = inspect.signature(model_class.from_config)
-    supports_trust_remote_code = "trust_remote_code" in from_config_signature.parameters or any(
+    accepts_var_keyword = any(
         parameter.kind == inspect.Parameter.VAR_KEYWORD for parameter in from_config_signature.parameters.values()
     )
     from_config_kwargs = {}
-    if supports_trust_remote_code and trust_remote_code is not None:
+    if (accepts_var_keyword or "trust_remote_code" in from_config_signature.parameters) and trust_remote_code is not None:
         from_config_kwargs["trust_remote_code"] = trust_remote_code
-    torch.manual_seed(0)
+    if (
+        accepts_var_keyword or "attn_implementation" in from_config_signature.parameters
+    ) and attn_implementation is not None:
+        from_config_kwargs["attn_implementation"] = attn_implementation
     model = model_class.from_config(model_config, **from_config_kwargs)
     logger.info("Generating test model class %s", type(model))
     return model
@@ -238,13 +242,23 @@ def load_model_from_task(
                     if has_test_model_weights(test_model_dir):
                         model = from_pretrained(model_class, test_model_path, "model", **kwargs)
                     else:
-                        model = _load_test_model(model_class, model_config, kwargs.get("trust_remote_code"))
+                        model = _load_test_model(
+                            model_class,
+                            model_config,
+                            kwargs.get("trust_remote_code"),
+                            attn_implementation=kwargs.get("attn_implementation"),
+                        )
                         _save_test_model(
                             model, test_model_path, test_model_config, model_name_or_path=model_name_or_path
                         )
                 else:
                     _validate_path(test_model_dir, test_model_path)
-                    model = _load_test_model(model_class, model_config, kwargs.get("trust_remote_code"))
+                    model = _load_test_model(
+                        model_class,
+                        model_config,
+                        kwargs.get("trust_remote_code"),
+                        attn_implementation=kwargs.get("attn_implementation"),
+                    )
                     if test_model_path:
                         _save_test_model(
                             model, test_model_path, test_model_config, model_name_or_path=model_name_or_path

From 19e3466eb5ea8b9b499f65eda1f6f95fe7d55c4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@microsoft.com>
Date: Thu, 2 Jul 2026 18:49:37 +0200
Subject: [PATCH 79/80] lint

---
 olive/common/hf/utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/olive/common/hf/utils.py b/olive/common/hf/utils.py
index 12b40bf01..69c8b3b7a 100644
--- a/olive/common/hf/utils.py
+++ b/olive/common/hf/utils.py
@@ -118,7 +118,9 @@ def _load_test_model(
         parameter.kind == inspect.Parameter.VAR_KEYWORD for parameter in from_config_signature.parameters.values()
     )
     from_config_kwargs = {}
-    if (accepts_var_keyword or "trust_remote_code" in from_config_signature.parameters) and trust_remote_code is not None:
+    if (
+        accepts_var_keyword or "trust_remote_code" in from_config_signature.parameters
+    ) and trust_remote_code is not None:
         from_config_kwargs["trust_remote_code"] = trust_remote_code
     if (
         accepts_var_keyword or "attn_implementation" in from_config_signature.parameters

From bae341a5f96fb815ad8ca84215d251e5aeed0fe2 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 2 Jul 2026 17:40:50 +0000
Subject: [PATCH 80/80] Fix CLI attn implementation expectations

---
 test/cli/test_cli.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/cli/test_cli.py b/test/cli/test_cli.py
index aeaeb5538..6b7a09849 100644
--- a/test/cli/test_cli.py
+++ b/test/cli/test_cli.py
@@ -141,7 +141,7 @@ def test_workflow_run_command_with_overrides(mock_repo_exists, mock_run, tmp_pat
             "input_model": {
                 "type": "HfModel",
                 "model_path": "hf-internal-testing/tiny-random-LlamaForCausalLM",
-                "load_kwargs": {"attn_implementation": "eager", "trust_remote_code": False},
+                "load_kwargs": {"attn_implementation": "sdpa", "trust_remote_code": False},
             },
             "engine": {},
             "output_dir": str(Path("new_output_path").resolve()),
@@ -163,7 +163,7 @@ def test_workflow_run_command_with_test_override(mock_run, tmp_path):
                 "input_model": {
                     "type": "HfModel",
                     "model_path": "hf-internal-testing/tiny-random-LlamaForCausalLM",
-                    "load_kwargs": {"attn_implementation": "eager", "trust_remote_code": False},
+                    "load_kwargs": {"attn_implementation": "sdpa", "trust_remote_code": False},
                 },
                 "output_dir": str(tmp_path / "output"),
             }
@@ -180,7 +180,7 @@ def test_workflow_run_command_with_test_override(mock_run, tmp_path):
             "input_model": {
                 "type": "HfModel",
                 "model_path": "hf-internal-testing/tiny-random-LlamaForCausalLM",
-                "load_kwargs": {"attn_implementation": "eager", "trust_remote_code": False},
+                "load_kwargs": {"attn_implementation": "sdpa", "trust_remote_code": False},
                 "test_model_config": {"hidden_layers": 2},
                 "test_model_path": test_model_path,
             },
@@ -214,7 +214,7 @@ def test_workflow_run_command_with_test_rejects_non_test_output_dir(tmp_path):
                 "input_model": {
                     "type": "HfModel",
                     "model_path": "hf-internal-testing/tiny-random-LlamaForCausalLM",
-                    "load_kwargs": {"attn_implementation": "eager", "trust_remote_code": False},
+                    "load_kwargs": {"attn_implementation": "sdpa", "trust_remote_code": False},
                 },
                 "output_dir": str(output_dir),
             }