diff --git a/.github/workflows/test-model-fast.yml b/.github/workflows/test-model-fast.yml index 20af9b524..fb03db2e1 100644 --- a/.github/workflows/test-model-fast.yml +++ b/.github/workflows/test-model-fast.yml @@ -30,6 +30,18 @@ jobs: python -m pip install -r requirements.txt python -m pip install -r test/requirements-test-cpu.txt + - name: Create llama_env and install llama-cpp-python + run: | + LLAMA_ENV="$(pwd)/llama_env" + python -m venv "$LLAMA_ENV" + "$LLAMA_ENV/bin/pip" install --upgrade pip + "$LLAMA_ENV/bin/pip" install gguf safetensors llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu + "$LLAMA_ENV/bin/pip" install transformers sentencepiece protobuf tabulate gguf + git clone --depth=1 --filter=blob:none --sparse https://github.com/ggerganov/llama.cpp.git /tmp/llama_cpp_repo + git -C /tmp/llama_cpp_repo sparse-checkout set convert_hf_to_gguf.py conversion --skip-checks + cp /tmp/llama_cpp_repo/convert_hf_to_gguf.py "$LLAMA_ENV/" + cp -r /tmp/llama_cpp_repo/conversion "$LLAMA_ENV/" + - name: pip freeze run: | python -m pip freeze diff --git a/docs/source/how-to/cli/cli-fast-test.md b/docs/source/how-to/cli/cli-fast-test.md index 49fa13055..edabf964f 100644 --- a/docs/source/how-to/cli/cli-fast-test.md +++ b/docs/source/how-to/cli/cli-fast-test.md @@ -2,12 +2,10 @@ If you are converting a large language model, it is often useful to validate the Olive command, environment, and conversion recipe on a much smaller model before spending time on the full checkpoint. -The `--test` option does that for Hugging Face models. Olive keeps the same model architecture, reduces it to a random 2-layer test model, saves it to the folder you provide, and reuses that folder on later runs. +The `--test` option does that for Hugging Face models. Olive keeps the same model architecture, reduces it to a random **2-layer** test model, saves it under `<output_path>/reference_hf_model`, and reuses that folder on later runs. 
This example uses [`Qwen/Qwen3-0.6B`](https://huggingface.co/Qwen/Qwen3-0.6B), but the same pattern works for other supported Hugging Face LLMs. -## Step 1: generate the workflow config - Start by generating the config that Olive will run for the Qwen conversion. ```bash @@ -17,61 +15,28 @@ olive optimize \ --provider CPUExecutionProvider \ --precision int4 \ --output_path out/qwen \ - --dry_run -``` - -This creates `out/qwen/config.json` without launching the full conversion yet. - -## Step 2: run a fast smoke test with `olive run --test` - -Use the generated config with `olive run` and pass `--test` so Olive swaps in a reduced random Qwen model. - -```bash -olive run \ - --config out/qwen/config.json \ - --test out/qwen-test-model \ - --output_path out/qwen-test-run + --test ``` -What this does: - -- `--test out/qwen-test-model` creates a reduced random Qwen model and saves it in `out/qwen-test-model` -- later runs reuse the same saved test model instead of recreating it -- `--output_path out/qwen-test-run` gives the smoke test its own output folder, so the generated ONNX artifacts are easy to find -- Olive marks that output folder as a test-only run and refuses to reuse a non-test conversion folder for `--test` +Because this example runs without `--dry_run`, it produces: -After the smoke test finishes, look under `out/qwen-test-run` for the exported ONNX model and related files. - -This is a quick way to confirm that: - -- Olive can load the source model -- the selected optimization recipe is valid for your setup -- the conversion path completes before you run the full model - -If you omit the folder and just pass `--test`, `olive run` will save the reduced model under `/test_model`. - -## Step 3: run the full conversion - -Once the smoke test succeeds, rerun the conversion on the full Qwen checkpoint by removing `--test`. 
- -```bash -olive run \ - --config out/qwen/config.json \ - --output_path out/qwen-full -``` +- `out/qwen/olive_config.json` — the Olive configuration used for the run (named `olive_config.json` so it is never confused with the model's own `config.json`). +- `out/qwen/reference_hf_model/` — the randomly initialized 2-layer reference model (weights, tokenizer, and `config.json`) that the ONNX model is compared against. It is created on the first run and reused afterwards. +- `out/qwen/model/` — the optimized ONNX model. +- `out/qwen/discrepancy_check_results.json` — the discrepancy report. -At this point you know the Olive command and the conversion recipe already worked on the lightweight test model, so you can focus on the full-model run instead of debugging both at once. +It also inserts an `OnnxDiscrepancyCheck` pass (if one is not already present) that compares the generated ONNX model against the 2-layer reference model. This pass only **loads** the reference model; it never saves a model or config itself (the reference model is materialized earlier in the workflow). -## Why keep the test model folder? +Additional metrics can be requested via `--test_metrics` (space- or comma-separated): -The saved test model is useful beyond the first smoke test: +- `mae`: enforces the max absolute error between the ONNX and reference logits (clean prefill with an empty KV cache, so both models run the identical forward pass). This is the default when `--test_metrics` is omitted. +- `speedup`: ONNX-vs-PyTorch inference latency. +- `first_token_20`: generates 20 tokens and compares the outputs of ONNX Runtime GenAI and transformers. It reports `first_token_matches` and `second_token_matches` (whether the first and second generated tokens are identical, along with the token ids for each backend) and `matching_leading_tokens` — the number of leading generated tokens that match. 
`matching_leading_tokens` is measured over the **generated** tokens only (the shared prompt is excluded) and is therefore capped at the 20-token generation length. +- `tft`: time to the first generated token (reported for both ONNX Runtime GenAI and transformers). +- `tf5t`: time to the first 5 generated tokens (reported for both ONNX Runtime GenAI and transformers). -- you can rerun the reduced conversion quickly while iterating on options -- you can reuse the same HF test model later when comparing the Hugging Face model against the exported ONNX model -- you avoid recreating a new random test checkpoint every time +For example, `--test_metrics mae,speedup,first_token_20,tft,tf5t`. The generation metrics (`first_token_20`, `tft`, `tf5t`) use the optimized ONNX model directory as the ONNX Runtime GenAI model when it contains a `genai_config.json` (as produced by the model builder). To keep tokenization consistent, ONNX Runtime GenAI is fed the exact token ids produced by the transformers tokenizer (including any BOS/special tokens) rather than re-encoding the prompt itself. -## Related docs +You can additionally compare against llama.cpp by passing `--test_llama_path <llama_env_path>` (a virtual environment with llama.cpp installed). When provided, `first_token_20` also reports `llama_cpp_first_token_matches`, `llama_cpp_second_token_matches`, and `llama_cpp_matching_leading_tokens`, again measured over generated tokens only and capped at the generation length. -- [How to use the `olive optimize` command to optimize a Pytorch model](cli-optimize) -- [How to write a new workflow from scratch](../configure-workflows/build-workflow) -- [CLI reference](../../reference/cli) +> **Note:** `--test_metrics` is always respected even when the config was generated by `olive optimize --test`, because Olive updates the existing `OnnxDiscrepancyCheck` settings each time `olive run --test` is invoked. 
# Metrics that --test can evaluate via the injected OnnxDiscrepancyCheck pass.
TEST_METRICS = ("mae", "speedup", "first_token_20", "tft", "tf5t")


def _parse_test_metrics(value: str) -> list[str]:
    """Parse one command-line token into a list of validated test metric names.

    The token may hold several names separated by commas and/or whitespace, so
    ``'mae'``, ``'mae,speedup'`` and ``'mae speedup'`` are all accepted.

    Args:
        value: Raw token handed over by ``argparse``.

    Returns:
        The metric names found in ``value``, in their original order. An empty or
        separator-only token yields an empty list.

    Raises:
        argparse.ArgumentTypeError: If any name is not listed in ``TEST_METRICS``.
    """
    import argparse

    # ``str.split()`` without arguments already strips whitespace and drops empty pieces.
    names = value.replace(",", " ").split()
    invalid = [name for name in names if name not in TEST_METRICS]
    if invalid:
        raise argparse.ArgumentTypeError(f"invalid choice(s): {invalid!r} (choose from {list(TEST_METRICS)})")
    return names


def _flatten_test_metrics(raw) -> Optional[list]:
    """Flatten the nested list argparse builds for ``--test_metrics``.

    ``argparse`` calls the ``type`` function once per token, so with ``nargs="+"``
    ``--test_metrics mae,speedup`` yields ``[["mae", "speedup"]]`` and
    ``--test_metrics mae speedup`` yields ``[["mae"], ["speedup"]]``. Both forms are
    flattened to ``["mae", "speedup"]``.

    Repeated names (for example ``--test_metrics mae mae,speedup``) are collapsed to
    their first occurrence so the same metric is never requested twice.

    Args:
        raw: The parsed argument value: ``None``, a list of lists, or a list of names.

    Returns:
        The flat, de-duplicated list of metric names, or ``None`` when ``raw`` is
        ``None`` or contains no names.
    """
    if not raw:
        return None
    flat = []
    for entry in raw:
        flat.extend(entry if isinstance(entry, list) else [entry])
    # dict.fromkeys keeps first-seen order while removing duplicates.
    return list(dict.fromkeys(flat)) or None


def warn_unused_test_metrics(test, metrics: Optional[list], llama_path: Optional[str] = None) -> None:
    """Warn when --test_metrics or --test_llama_path is provided without --test, since it has no effect.

    Args:
        test: Value of the ``--test`` option; ``None`` and ``False`` mean "not enabled".
        metrics: Flattened ``--test_metrics`` selection, if any.
        llama_path: Value of ``--test_llama_path``, if any.
    """
    if test not in (None, False):
        return
    if metrics:
        logger.warning("--test_metrics is ignored because --test is not enabled.")
    if llama_path:
        logger.warning("--test_llama_path is ignored because --test is not enabled.")


def _is_pass_of_type(pass_cfg, type_name: str) -> bool:
    """Return True when ``pass_cfg`` is a pass dict whose ``type`` equals ``type_name`` (lower-case)."""
    if not isinstance(pass_cfg, dict):
        return False
    pass_type = pass_cfg.get("type", "")
    # A missing, ``None`` or otherwise non-string type simply does not match.
    return isinstance(pass_type, str) and pass_type.lower() == type_name


def _find_pass_of_type(passes: dict, type_name: str) -> Optional[dict]:
    """Return the first pass config in ``passes`` of the given lower-case type, or ``None``."""
    return next((cfg for cfg in passes.values() if _is_pass_of_type(cfg, type_name)), None)


def _insert_gguf_pass(passes: dict, gguf_cfg: dict) -> dict:
    """Return a copy of ``passes`` with ``gguf_cfg`` placed right after the first SaveTestModelConfig pass.

    If no SaveTestModelConfig pass is found, the GGUF pass is placed first instead.
    """
    reordered = {}
    inserted = False
    for name, cfg in passes.items():
        reordered[name] = cfg
        if not inserted and _is_pass_of_type(cfg, "savetestmodelconfig"):
            reordered["convert_hf_to_gguf"] = gguf_cfg
            inserted = True
    if not inserted:
        reordered = {"convert_hf_to_gguf": gguf_cfg, **reordered}
    return reordered


def add_discrepancy_check_pass(
    run_config: dict, metrics: Optional[list] = None, llama_env_path: Optional[str] = None
) -> dict:
    """Inject or update test-related passes when --test is active.

    Nothing is changed when ``run_config["input_model"]`` has no ``test_model_path``.

    Managed passes:

    * ``SaveTestModelConfig`` — inserted at the *beginning* of the passes dict when no pass of
      that type is present, so it is positioned before any other pass.

    * ``ConvertHfToGGUF`` — only when ``llama_env_path`` is provided. Inserted right after
      ``SaveTestModelConfig``; if a pass of that type already exists, its
      ``llama_cpp_env_path`` and ``reference_model_path`` are refreshed instead.

    * ``OnnxDiscrepancyCheck`` — appended at the end as ``discrepancy_check``. If an instance
      is already present in the config (e.g. from a previous ``--test`` invocation), its
      runtime fields (``reference_model_path``, ``report_output_dir``, metric settings and,
      when ``llama_env_path`` is given, the llama.cpp settings) are updated in place so the
      current ``--test_metrics`` and ``--output_path`` values take effect.

    Args:
        run_config: Workflow configuration; it is modified in place and also returned.
        metrics: Names of the test metrics to evaluate (see ``TEST_METRICS``). ``None`` selects
            only ``"mae"``. ``max_mae`` is set to ``0.1`` only when ``"mae"`` is selected and
            ``timing_iterations`` to ``5`` only when ``"speedup"`` is selected.
        llama_env_path: Path to the llama_env virtual environment used for llama.cpp inference.
            When provided, ``llama_cpp`` is enabled on the discrepancy pass and the path is
            forwarded as ``llama_cpp_env_path``.

    Returns:
        The same ``run_config`` object.
    """
    passes = run_config.get("passes", {})

    # The reference model is the test model configured on the input model.
    reference_model_path = run_config.get("input_model", {}).get("test_model_path")
    if not reference_model_path:
        return run_config
    # Resolve to absolute path so ORT GenAI and transformers always find a local
    # directory rather than treating a relative path like "out/my-test" as a
    # HuggingFace "org/repo" model identifier.
    reference_model_path = str(Path(reference_model_path).resolve())

    # --- SaveTestModelConfig pass (injected at the beginning) ---
    if _find_pass_of_type(passes, "savetestmodelconfig") is None:
        logger.debug("Adding SaveTestModelConfig pass at the beginning of the passes dict")
        passes = {"save_test_model_config": {"type": "SaveTestModelConfig"}, **passes}
        run_config["passes"] = passes

    # --- ConvertHfToGGUF pass (optional, only with --test_llama_path) ---
    if llama_env_path:
        gguf_settings = {"llama_cpp_env_path": llama_env_path, "reference_model_path": reference_model_path}
        existing_gguf = _find_pass_of_type(passes, "converthftogguf")
        if existing_gguf is None:
            passes = _insert_gguf_pass(passes, {"type": "ConvertHfToGGUF", **gguf_settings})
            run_config["passes"] = passes
        else:
            existing_gguf.update(gguf_settings)

    # Determine output directory for discrepancy results. A value that looks like a file
    # (has a suffix and is not an existing directory) is replaced by its parent directory.
    report_dir = run_config.get("output_dir") or run_config.get("engine", {}).get("output_dir")
    if report_dir and Path(report_dir).suffix and not Path(report_dir).is_dir():
        report_dir = str(Path(report_dir).parent)

    selected_metrics = list(metrics) if metrics is not None else ["mae"]
    metric_settings = {
        "test_metrics": selected_metrics,
        "max_mae": 0.1 if "mae" in selected_metrics else None,
        "timing_iterations": 5 if "speedup" in selected_metrics else 0,
    }
    # Enable llama.cpp comparison only when a venv path is provided.
    llama_settings = {"llama_cpp": True, "llama_cpp_env_path": llama_env_path} if llama_env_path else {}

    # --- OnnxDiscrepancyCheck pass ---
    # If the pass already exists, update its runtime fields rather than re-creating it, so a
    # config saved by an earlier --test invocation still honours the current options.
    existing_check = _find_pass_of_type(passes, "onnxdiscrepancycheck")
    if existing_check is not None:
        existing_check["reference_model_path"] = reference_model_path
        if report_dir is not None:
            existing_check["report_output_dir"] = report_dir
        existing_check.update(metric_settings)
        existing_check.update(llama_settings)
        logger.debug("Updated existing OnnxDiscrepancyCheck pass with reference_model_path=%s", reference_model_path)
        return run_config

    logger.debug("Adding OnnxDiscrepancyCheck pass with reference_model_path=%s", reference_model_path)
    passes["discrepancy_check"] = {
        "type": "OnnxDiscrepancyCheck",
        "reference_model_path": reference_model_path,
        "report_output_dir": report_dir,
        **metric_settings,
        **llama_settings,
    }
    run_config["passes"] = passes
    return run_config
+ model_dir = str(Path(self.args.output_path) / "model") + run_config["output_dir"] = model_dir + for pass_cfg in run_config.get("passes", {}).values(): + if isinstance(pass_cfg, dict) and pass_cfg.get("type", "").lower() == "onnxdiscrepancycheck": + pass_cfg["report_output_dir"] = self.args.output_path + if self.args.save_config_file or self.args.dry_run or is_test: + # In --test mode, keep the Olive config at the root even though the + # workflow output_dir points to the "model" subdirectory. When dry_run is not + # enabled (a one-step run) save it as olive_config.json so it is never confused + # with the model's own transformers config.json; dry_run keeps config.json so + # `olive run --config /config.json` continues to work. + config_file_name = "config.json" if self.args.dry_run else "olive_config.json" + self._save_config_file( + run_config, + self.args.output_path if is_test else None, + config_file_name, + ) if self.args.dry_run: - if getattr(self.args, "test", None) not in (None, False): + if is_test: mark_test_output_path(self.args.output_path) print("Dry run mode enabled. Configuration file is generated but no optimization is performed.") return None workflow_output = olive_run(run_config) - if getattr(self.args, "test", None) not in (None, False): + if is_test: mark_test_output_path(self.args.output_path) save_discrepancy_check_results(workflow_output, self.args.output_path) - if not workflow_output.has_output_model(): + print(f"Test report saved at {self.args.output_path}") + elif not workflow_output.has_output_model(): print("No output model produced. 
Please check the log for details.") else: print(f"Model is saved at {self.args.output_path}") @@ -171,12 +362,20 @@ def _parse_extra_options(kv_items): from onnxruntime_genai.models.builder import parse_extra_options - return parse_extra_options(kv_items) + return parse_extra_options(kv_items) # pylint: disable=no-value-for-parameter @staticmethod - def _save_config_file(config: dict): - """Save the config file.""" - config_file_path = Path(config["output_dir"]) / "config.json" + def _save_config_file(config: dict, output_dir: Optional[str] = None, file_name: str = "config.json"): + """Save the config file. + + By default the config is written to ``/config.json``. When + ``output_dir`` is provided, the config is written to ``/`` + instead (used in --test mode to keep the Olive config at the report directory root). + ``file_name`` controls the config file name (e.g. ``olive_config.json`` for a one-step + run so it is never confused with the model's own ``config.json``). + """ + target_dir = output_dir if output_dir is not None else config["output_dir"] + config_file_path = Path(target_dir) / file_name with open(config_file_path, "w") as f: json.dump(config, f, indent=4) print(f"Config file saved at {config_file_path}") @@ -195,14 +394,11 @@ def add_hf_test_model_config(input_model: dict, test_value, output_path: Optiona if test_value in (None, False): return input_model - test_model_output_path = test_value # Use 2 layers to keep the test model fast and lightweight while preserving the original architecture family. 
input_model["test_model_config"] = {"hidden_layers": 2} - if test_model_output_path is True: - if not output_path: - raise ValueError("--test requires an explicit folder when output_path is not available.") - test_model_output_path = str(Path(output_path) / "test_model") - input_model["test_model_path"] = test_model_output_path + if not output_path: + raise ValueError("--test requires --output_path to store the generated reference model.") + input_model["test_model_path"] = str(Path(output_path) / "reference_hf_model") return input_model @@ -217,7 +413,7 @@ def _get_hf_input_model(args: Namespace, model_path: OLIVE_RESOURCE_ANNOTATIONS) "type": "HfModel", "model_path": model_path, "load_kwargs": { - "attn_implementation": "eager", + "attn_implementation": "sdpa", }, } # use getattr to avoid AttributeError in case hf model or adapter_path is not supported @@ -497,12 +693,34 @@ def add_input_model_options( ) model_group.add_argument( "--test", - type=str, - nargs="?", - const=True, + action="store_true", help=( "Use a randomly initialized test model with the same Hugging Face architecture and 2 hidden layers. " - "Optionally provide a folder where the generated test model should be saved and reused." + "The generated reference model is saved under /reference_hf_model and reused." + ), + ) + model_group.add_argument( + "--test_metrics", + type=_parse_test_metrics, + nargs="+", + help=( + "Metrics to evaluate during a --test run: 'mae' enforces the max absolute error between the " + "ONNX and reference model outputs, 'speedup' measures ONNX-vs-PyTorch inference latency, " + "'first_token_20' compares the first generated token (over a 20-token generation) between " + "ONNX Runtime GenAI and transformers, 'tft' reports the time to the first generated token, and " + "'tf5t' reports the time to the first 5 generated tokens. " + "Accepts space- or comma-separated values (e.g. 'mae,speedup' or 'mae speedup'). " + "Defaults to 'mae'. Only used together with --test." 
def has_test_model_weights(output_dir: Union[str, Path]) -> bool:
    """Return True if *output_dir* directly contains persisted model weight files.

    Files matching ``*.safetensors`` or ``pytorch_model*.bin`` count as weights; only the
    top level of the directory is inspected. A config-only test-model directory (created by
    ``save_test_model_config``) has a ``config.json`` and marker file but no weights yet.
    """
    weights_dir = Path(output_dir)
    weight_patterns = ("*.safetensors", "pytorch_model*.bin")
    return any(any(weights_dir.glob(pattern)) for pattern in weight_patterns)


def _load_test_model(
    model_class: type,
    model_config: "PretrainedConfig",
    trust_remote_code: Optional[bool] = None,
    attn_implementation: Optional[str] = None,
):
    """Instantiate a random-initialized HF model from config for test mode.

    ``trust_remote_code`` and ``attn_implementation`` are forwarded to
    ``model_class.from_config`` only when they are not ``None`` and ``from_config`` can
    accept them, i.e. it names the parameter explicitly or takes ``**kwargs``.

    Args:
        model_class: Class exposing a ``from_config`` constructor.
        model_config: Configuration passed as the first argument to ``from_config``.
        trust_remote_code: Optional flag forwarded to ``from_config``.
        attn_implementation: Optional attention implementation name (e.g. ``"sdpa"``)
            forwarded to ``from_config``.

    Returns:
        The model instance returned by ``from_config``.
    """
    parameters = inspect.signature(model_class.from_config).parameters
    accepts_var_keyword = any(
        parameter.kind == inspect.Parameter.VAR_KEYWORD for parameter in parameters.values()
    )
    optional_kwargs = {
        "trust_remote_code": trust_remote_code,
        "attn_implementation": attn_implementation,
    }
    from_config_kwargs = {
        name: value
        for name, value in optional_kwargs.items()
        if value is not None and (accepts_var_keyword or name in parameters)
    }
    model = model_class.from_config(model_config, **from_config_kwargs)
    logger.info("Generating test model class %s", type(model))
    return model


def _save_test_model(
    model: "PreTrainedModel",
    output_dir: str,
    test_model_config: Optional[dict[str, Any]] = None,
    model_name_or_path: Optional[str] = None,
):
    """Persist a generated test model, its tokenizer (best effort) and the test-model marker.

    Args:
        model: Model to save via ``save_pretrained``.
        output_dir: Destination directory; created if it does not exist.
        test_model_config: Test-model settings recorded in the marker file.
        model_name_or_path: Source used to load the tokenizer that is saved next to the
            weights. When falsy, no tokenizer is saved.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    logger.info("Saving generated test model to %s", output_path)
    model.save_pretrained(str(output_path))
    if model_name_or_path:
        # Save the reference tokenizer alongside the weights so the test model directory is
        # self-contained (e.g. for OnnxDiscrepancyCheck and ONNX Runtime GenAI generation).
        try:
            tokenizer = get_tokenizer(model_name_or_path)
            save_tokenizer(tokenizer, str(output_path))
        except Exception as e:  # pylint: disable=broad-except
            # Still best effort, but logged at warning level: a directory without a
            # tokenizer is no longer self-contained, and a debug message is easy to miss.
            logger.warning("Could not save tokenizer for test model from %r: %s", model_name_or_path, e)
    _write_test_model_marker(output_path, test_model_config)


def save_test_model_config(
    model_name_or_path: str, test_model_config: Optional[dict[str, Any]], test_model_path: str
) -> None:
    """Save a modified config.json (without model weights) to *test_model_path*.

    The directory is left untouched when it is already marked as a test-model directory.
    Otherwise it is created, the model config obtained from ``get_model_config`` (with
    ``test_model_config`` applied) is saved into it, and the test-model marker is written.

    Args:
        model_name_or_path: Source model whose config is loaded.
        test_model_config: Test-model settings applied to the config and stored in the marker.
        test_model_path: Directory that receives ``config.json`` and the marker file.
    """
    output_path = Path(test_model_path)
    if is_test_model_dir(output_path):
        logger.debug("Test model config directory already exists at %s.", output_path)
        return
    output_path.mkdir(parents=True, exist_ok=True)
    model_config = get_model_config(model_name_or_path, test_model_config=test_model_config)
    model_config.save_pretrained(str(output_path))
    _write_test_model_marker(output_path, test_model_config)
    logger.info("Saved test model config to %s.", output_path)
+ if has_test_model_weights(test_model_dir): + model = from_pretrained(model_class, test_model_path, "model", **kwargs) + else: + model = _load_test_model( + model_class, + model_config, + kwargs.get("trust_remote_code"), + attn_implementation=kwargs.get("attn_implementation"), + ) + _save_test_model( + model, test_model_path, test_model_config, model_name_or_path=model_name_or_path + ) else: _validate_path(test_model_dir, test_model_path) - model = _load_test_model(model_class, model_config, kwargs.get("trust_remote_code")) + model = _load_test_model( + model_class, + model_config, + kwargs.get("trust_remote_code"), + attn_implementation=kwargs.get("attn_implementation"), + ) if test_model_path: - _save_test_model(model, test_model_path, test_model_config) + _save_test_model( + model, test_model_path, test_model_config, model_name_or_path=model_name_or_path + ) else: model = from_pretrained(model_class, model_name_or_path, "model", **kwargs) logger.debug("Loaded model %s with name_or_path %s", model_class, model_name_or_path) diff --git a/olive/olive_config.json b/olive/olive_config.json index 88a096c2f..ecd8c59c8 100644 --- a/olive/olive_config.json +++ b/olive/olive_config.json @@ -642,6 +642,22 @@ "supported_algorithms": [ "rtn" ], "supported_quantization_encodings": [ ] }, + "SaveTestModelConfig": { + "module_path": "olive.passes.pytorch.save_test_model_config.SaveTestModelConfig", + "supported_providers": [ "*" ], + "supported_accelerators": [ "*" ], + "supported_precisions": [ "*" ], + "supported_algorithms": [ ], + "supported_quantization_encodings": [ ] + }, + "ConvertHfToGGUF": { + "module_path": "olive.passes.pytorch.convert_hf_to_gguf.ConvertHfToGGUF", + "supported_providers": [ "*" ], + "supported_accelerators": [ "*" ], + "supported_precisions": [ "*" ], + "supported_algorithms": [ ], + "supported_quantization_encodings": [ ] + }, "SelectiveMixedPrecision": { "module_path": "olive.passes.pytorch.selective_mixed_precision.SelectiveMixedPrecision", 
"supported_providers": [ "*" ], diff --git a/olive/passes/onnx/discrepancy_check.py b/olive/passes/onnx/discrepancy_check.py index a9bb7e1ac..822ef03da 100644 --- a/olive/passes/onnx/discrepancy_check.py +++ b/olive/passes/onnx/discrepancy_check.py @@ -4,6 +4,7 @@ # -------------------------------------------------------------------------- import json import logging +import subprocess import time from pathlib import Path from typing import Optional @@ -18,12 +19,33 @@ logger = logging.getLogger(__name__) +def _json_sanitize(obj): + """Recursively convert numpy scalars/arrays to native Python types for JSON serialization.""" + import numpy as np + + if isinstance(obj, dict): + return {key: _json_sanitize(value) for key, value in obj.items()} + if isinstance(obj, (list, tuple)): + return [_json_sanitize(item) for item in obj] + if isinstance(obj, np.generic): + return obj.item() + if isinstance(obj, np.ndarray): + return obj.tolist() + return obj + + def _infer_shape(dynamic_shape, known_values=None): + # Use an empty past-KV cache (past_sequence_length=0) so the discrepancy check is a clean + # prefill comparison. The dummy dataloader passes ``past_key_values..key/value`` tensors, + # but HuggingFace ``forward`` does not accept those dotted names as keyword arguments and + # silently drops them, so the reference model would run without a cache while the ONNX model + # would consume a (bogus, all-ones) cache -- producing a large, meaningless discrepancy. + # Keeping the past length at 0 makes both models perform the same prefill over ``input_ids``. 
default_values = { "batch_size": 1, - "past_sequence_length": 2, - "total_sequence_length": 3, - "sequence_length": 1, + "past_sequence_length": 0, + "sequence_length": 8, + "total_sequence_length": 8, } if known_values: default_values.update(known_values) @@ -110,6 +132,80 @@ def _longest_common_token_sequence(seq_a: list[int], seq_b: list[int]) -> int: return length +def _format_seconds(value: Optional[float]) -> str: + """Format an optional latency value (in seconds) for logging.""" + return "n/a" if value is None else f"{value:.4f}s" + + +# --------------------------------------------------------------------------- +# Helper script executed inside the ``llama_env`` virtual environment. +# All llama-cpp-python / gguf imports are intentionally isolated to this +# subprocess so the main Olive process does not require those packages. +# --------------------------------------------------------------------------- +_LLAMA_CPP_HELPER_SCRIPT = '''\ +"""llama.cpp inference helper for OnnxDiscrepancyCheck. + +This script runs inside the llama_env virtual environment via subprocess. +It measures first-token latency using llama-cpp-python on a pre-converted GGUF file. +Results are written as a JSON object to stdout. + +GGUF conversion is done separately via the convert_hf_to_gguf.py CLI from llama.cpp +before this script is invoked. 
+""" +import argparse +import json +import time + + +def run_inference(gguf_path, prompt_tokens, max_new_tokens, first_n): + """Run greedy generation with llama.cpp and return first-token latency metrics.""" + from llama_cpp import Llama + + n_ctx = max(512, len(prompt_tokens) + max_new_tokens + 64) + llm = Llama(model_path=gguf_path, n_ctx=n_ctx, verbose=False) + + generated = [] + ttft = None + ttfn = None + first_token_id = None + + start = time.perf_counter() + for token in llm.generate(prompt_tokens, top_k=1, temp=0.0, reset=True): + count = len(generated) + 1 + if count == 1: + ttft = time.perf_counter() - start + first_token_id = int(token) + if count == first_n and ttfn is None: + ttfn = time.perf_counter() - start + generated.append(int(token)) + if count >= max_new_tokens: + break + + total_time = time.perf_counter() - start + + return { + "first_token_id": first_token_id, + "generated_tokens": generated, + "ttft": ttft, + "ttfn": ttfn, + "total_time": total_time, + } + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="llama.cpp inference helper") + parser.add_argument("--gguf_path", required=True) + parser.add_argument("--prompt_tokens", required=True, help="JSON-encoded list of token IDs") + parser.add_argument("--max_new_tokens", type=int, default=32) + parser.add_argument("--first_n", type=int, default=5) + args = parser.parse_args() + + prompt_tokens = json.loads(args.prompt_tokens) + result = run_inference(args.gguf_path, prompt_tokens, args.max_new_tokens, args.first_n) + print(json.dumps(result)) +''' + + class OnnxDiscrepancyCheck(Pass): """Validates ONNX model outputs against a reference PyTorch model. 
@@ -122,6 +218,8 @@ class OnnxDiscrepancyCheck(Pass): - Inference speedup of ONNX over PyTorch on the target device (or CPU fallback) - Longest common token sequence from the beginning between transformers generate and ONNX Runtime GenAI generate (when enabled) + - Time-to-first-token and time-to-first-N-tokens latencies for both transformers + and ONNX Runtime GenAI generation (when enabled) The pass status is marked as failed if any configured threshold is exceeded. """ @@ -138,16 +236,23 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon type_=Optional[str], default_value=None, description=( - "Directory where discrepancy check results and reference model are saved. " + "Directory where discrepancy check results are saved. " "If not specified, results are written to the pass cache directory." ), ), - "save_reference_model_state_dict": PassConfigParam( - type_=bool, - default_value=False, + "test_metrics": PassConfigParam( + type_=Optional[list[str]], + default_value=None, description=( - "Save the reference PyTorch model weights (state_dict) alongside the results. " - "This allows direct comparison between the reference and optimized models." + "List of test metrics to evaluate. Accepted values are ``'mae'`` (max absolute error " + "between ONNX and reference PyTorch outputs), ``'speedup'`` (ONNX-vs-PyTorch " + "inference latency), ``'first_token_20'`` (first generated token comparison over a " + "20-token generation between ONNX Runtime GenAI and transformers), ``'tft'`` (time to " + "the first generated token) and ``'tf5t'`` (time to the first 5 generated tokens). " + "When set, this field takes precedence over ``timing_iterations`` " + "and ``max_mae``: ``'speedup'`` enables timing, ``'mae'`` enforces the MAE threshold, and " + "the generation metrics run the transformers-vs-GenAI comparison. " + "Example: ``['mae', 'speedup']``. Set by the CLI ``--test_metrics`` option." 
), ), "max_mae": PassConfigParam( @@ -205,6 +310,14 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon default_value=32, description="Maximum number of new tokens to generate for the token sequence comparison.", ), + "first_n_tokens_timed": PassConfigParam( + type_=int, + default_value=5, + description=( + "Number of leading generated tokens used for the time-to-first-N-tokens latency " + "measurement reported for both transformers and ONNX Runtime GenAI." + ), + ), "min_longest_common_tokens": PassConfigParam( type_=Optional[int], default_value=None, @@ -214,46 +327,131 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon "below this threshold, the pass fails." ), ), + "llama_cpp": PassConfigParam( + type_=bool, + default_value=False, + description=( + "When True, convert the reference HuggingFace model to GGUF format using " + "``convert_hf_to_gguf.py`` from llama.cpp and compare inference with llama.cpp. " + "Measures first-token difference between llama.cpp and the reference PyTorch model " + "as well as latency and speedup. All llama-cpp-python operations are executed in " + "the ``llama_env`` virtual environment via subprocess." + ), + ), + "llama_cpp_env_path": PassConfigParam( + type_=Optional[str], + default_value=None, + description=( + "Path to the virtual environment where llama-cpp-python and " + "``convert_hf_to_gguf.py`` are installed. " + "Defaults to 'llama_env' relative to the current working directory when " + "``llama_cpp`` is True. 
Create this environment and obtain the conversion " + "script and its dependencies with: " + "``python -m venv llama_env && llama_env/bin/pip install gguf safetensors " + "transformers sentencepiece protobuf " + "llama-cpp-python --extra-index-url " + "https://abetlen.github.io/llama-cpp-python/whl/cpu && " + "git clone --depth=1 --filter=blob:none --sparse " + "https://github.com/ggerganov/llama.cpp.git /tmp/llama_cpp_repo && " + "git -C /tmp/llama_cpp_repo sparse-checkout set convert_hf_to_gguf.py conversion && " + "cp /tmp/llama_cpp_repo/convert_hf_to_gguf.py llama_env/ && " + "cp -r /tmp/llama_cpp_repo/conversion llama_env/``." + ), + ), } def _run_for_config( self, model: ONNXModelHandler, config: type[BasePassConfig], output_model_path: str ) -> ONNXModelHandler: - import torch + dataloader, io_config = self._prepare_dataloader(model) + ref_model, ref_path = self._load_reference_model(model, config) + + device, execution_provider, torch_device, weight_dtype = self._resolve_execution_device(model) + ref_model = self._cast_reference_model(ref_model, weight_dtype, torch_device) + + report_dir = self._resolve_report_dir(config, output_model_path) + + session = model.prepare_session( + device=device, + execution_providers=[execution_provider] if execution_provider else None, + ) + + results = self._compute_logits_discrepancy(ref_model, session, dataloader, io_config, torch_device) + + effective_timing_iterations, effective_max_mae, generation_metrics = self._resolve_metric_settings(config) + + self._run_speedup_measurement( + ref_model, session, dataloader, io_config, torch_device, config, effective_timing_iterations, results + ) + + self._check_error_thresholds(config, results, effective_max_mae) + + self._run_generation_comparison(model, config, ref_model, ref_path, generation_metrics, results) + + self._run_llama_cpp_comparison(model, config, ref_model, ref_path, report_dir, generation_metrics, results) + self._save_results(model, results, report_dir) + return 
model + + def _prepare_dataloader(self, model: ONNXModelHandler): from olive.common.config_utils import validate_config - from olive.common.utils import format_data from olive.data.template import dummy_data_config_template from olive.model.config.io_config import is_io_config_static io_config = model.io_config - if io_config: - if is_io_config_static(io_config): - input_shapes = io_config.get("input_shapes") - else: - input_shapes = [] - known = {} - for shape in io_config.get("input_shapes"): - new_shape = _infer_shape(shape, known) - input_shapes.append(new_shape) - known.update(dict(zip(shape, new_shape))) - data_config = dummy_data_config_template( - input_shapes, io_config.get("input_names"), io_config.get("input_types") - ) - data_config = validate_config(data_config, DataConfig) - data_config.load_dataset_config.params["max_samples"] = 1 - else: + if not io_config: raise RuntimeError( f"Model IO config is missing for {model.model_path}; cannot generate dummy inputs for discrepancy check." ) + + if is_io_config_static(io_config): + input_shapes = io_config.get("input_shapes") + else: + input_shapes = [] + known = {} + for shape in io_config.get("input_shapes"): + new_shape = _infer_shape(shape, known) + input_shapes.append(new_shape) + known.update(dict(zip(shape, new_shape))) + data_config = dummy_data_config_template( + input_shapes, io_config.get("input_names"), io_config.get("input_types") + ) + data_config = validate_config(data_config, DataConfig) + data_config.load_dataset_config.params["max_samples"] = 1 + # Create dataloader dc = data_config.to_data_container() dataloader = dc.create_dataloader() + return dataloader, io_config + def _load_reference_model(self, model: ONNXModelHandler, config: type[BasePassConfig]): # Load reference PyTorch model from transformers import AutoConfig, AutoModelForCausalLM - ref_cfg = AutoConfig.from_pretrained(config.reference_model_path) + # Resolve the reference model path. 
Use the configured path if it exists as a local + # directory; otherwise fall back to a ``reference_hf_model`` directory saved alongside the + # ONNX output. The reference model is normally kept at ``/reference_hf_model`` + # (written by SaveTestModelConfig / the test-model flow) and persists across engine cache + # hits, so this fallback only triggers if the configured path has been removed. + ref_path = config.reference_model_path + if not Path(ref_path).is_dir(): + hf_ref_dir = (model.model_attributes or {}).get("hf_reference_model_dir", "reference_hf_model") + fallback = Path(model.model_path).parent / hf_ref_dir + if fallback.is_dir(): + logger.info( + "Reference model not found at %r; using cached copy at %r.", + ref_path, + str(fallback), + ) + ref_path = str(fallback) + else: + raise RuntimeError( + f"Reference model directory {ref_path!r} does not exist and no cached copy was " + f"found at {str(fallback)!r}. Re-run the optimization workflow (olive run) to " + "recreate the test model." + ) + + ref_cfg = AutoConfig.from_pretrained(ref_path) architectures = getattr(ref_cfg, "architectures", None) or [] if not any("ForCausalLM" in arch for arch in architectures): raise ValueError( @@ -261,8 +459,20 @@ def _run_for_config( f"Got architectures={architectures}" ) - ref_model = AutoModelForCausalLM.from_pretrained(config.reference_model_path) + # The attention implementation is baked into the reference model's config.json + # (as ``_attn_implementation``) by the SaveTestModelConfig pass, so it is picked up + # automatically here without needing to pass ``attn_implementation`` explicitly. 
+ ref_model = AutoModelForCausalLM.from_pretrained(ref_path, config=ref_cfg) ref_model.eval() + logger.info( + "Loaded reference model from %s with attn_implementation=%s", + ref_path, + getattr(ref_cfg, "_attn_implementation", None), + ) + return ref_model, ref_path + + def _resolve_execution_device(self, model: ONNXModelHandler): + import torch # Determine the floating-point dtype used by the ONNX model weights and # cast the reference PyTorch model to match, so the comparison uses the @@ -287,6 +497,11 @@ def _run_for_config( torch_device = torch.device("cpu") if device == Device.GPU and torch.cuda.is_available(): torch_device = torch.device("cuda") + return device, execution_provider, torch_device, weight_dtype + + def _cast_reference_model(self, ref_model, weight_dtype, torch_device): + import torch + if weight_dtype is not None and torch_device.type == "cpu" and weight_dtype in (torch.float16, torch.bfloat16): logger.info( "OnnxDiscrepancyCheck skipping reference model cast to %s on CPU because the dtype is not supported.", @@ -301,19 +516,19 @@ def _run_for_config( ) else: ref_model = ref_model.to(torch_device) + return ref_model - # Save reference PyTorch model for direct comparison + def _resolve_report_dir(self, config: type[BasePassConfig], output_model_path: str): report_dir = config.report_output_dir or output_model_path report_dir_path = Path(report_dir) if report_dir_path.suffix and not report_dir_path.is_dir(): report_dir = str(report_dir_path.parent) - if config.save_reference_model_state_dict: - self._export_reference_model(ref_model, report_dir) + return report_dir - session = model.prepare_session( - device=device, - execution_providers=[execution_provider] if execution_provider else None, - ) + def _compute_logits_discrepancy(self, ref_model, session, dataloader, io_config, torch_device): + import torch + + from olive.common.utils import format_data # Run inference on both and compare all_max_abs_diff = [] @@ -365,28 +580,65 @@ def 
_run_for_config( f"elements_above_0.01={count_above_0_01}/{total_elements}" ) logger.info(summary) + return results + + def _resolve_metric_settings(self, config: type[BasePassConfig]): + # Resolve effective metric settings: test_metrics takes precedence when set. + # This lets the CLI store a human-readable ["mae", "speedup"] list in the config + # while still supporting the lower-level timing_iterations / max_mae controls for + # advanced users and backward compatibility with older configs. + requested_metrics = set(config.test_metrics) if config.test_metrics is not None else set() + if config.test_metrics is not None: + effective_timing_iterations = 5 if "speedup" in requested_metrics else 0 + effective_max_mae = 0.1 if "mae" in requested_metrics else None + else: + effective_timing_iterations = config.timing_iterations + effective_max_mae = config.max_mae + # Metrics that require running token generation (transformers vs ONNX Runtime GenAI). + generation_metrics = requested_metrics & {"first_token_20", "tft", "tf5t"} + return effective_timing_iterations, effective_max_mae, generation_metrics + + def _run_speedup_measurement( + self, ref_model, session, dataloader, io_config, torch_device, config, effective_timing_iterations, results + ): # Measure inference speedup (ONNX vs PyTorch) on the target device - if config.timing_iterations > 0: - self._measure_speedup( + if effective_timing_iterations > 0: + timing = self._measure_speedup( ref_model, session, dataloader, io_config, torch_device, config.warmup_iterations, - config.timing_iterations, + effective_timing_iterations, ) + if timing is not None: + pytorch_time, onnx_time, speedup = timing + results["pytorch_latency_s"] = pytorch_time + results["onnx_latency_s"] = onnx_time + results["speedup"] = speedup + logger.info( + "OnnxDiscrepancyCheck speedup: pytorch_latency_s=%.4f, onnx_latency_s=%.4f, speedup=%.2f", + pytorch_time, + onnx_time, + speedup, + ) else: logger.info( "OnnxDiscrepancyCheck speedup 
measurement skipped because timing_iterations=%d.", - config.timing_iterations, + effective_timing_iterations, ) + def _check_error_thresholds(self, config: type[BasePassConfig], results, effective_max_mae): + max_abs_error = results["max_abs_error"] + count_above_0_1 = results["elements_above_0_1"] + count_above_0_01 = results["elements_above_0_01"] + # Check thresholds failures = [] - if config.max_mae is not None and max_abs_error > config.max_mae: - failures.append(f"Max absolute error {max_abs_error:.6f} exceeds threshold {config.max_mae:.6f}") + if effective_max_mae is not None and max_abs_error > effective_max_mae: + failures.append(f"Max absolute error {max_abs_error:.6f} exceeds threshold {effective_max_mae:.6f}") if config.max_elements_above_0_1 is not None and count_above_0_1 > config.max_elements_above_0_1: failures.append( f"Elements with diff > 0.1: {count_above_0_1} exceeds threshold {config.max_elements_above_0_1}" @@ -404,35 +656,186 @@ def _run_for_config( else: results["status"] = "passed" - # Generation token sequence comparison (transformers vs ONNX Runtime GenAI) - if config.genai_model_path: - longest_common = self.compare_generation(config, ref_model) - results["longest_common_token_sequence"] = longest_common - results["genai_model_path"] = config.genai_model_path - if config.min_longest_common_tokens is not None and longest_common < config.min_longest_common_tokens: - results["status"] = "failed" - gen_failure = ( - f"Longest common token sequence length {longest_common} is below " - f"threshold {config.min_longest_common_tokens}" + def _run_generation_comparison( + self, model: ONNXModelHandler, config, ref_model, ref_path, generation_metrics, results + ): + # Generation token sequence comparison (transformers vs ONNX Runtime GenAI). + # Runs when an explicit genai_model_path is configured or when any generation-based + # test metric (first_token_20 / tft / tf5t) is requested. 
In the latter case the + # optimized ONNX model directory is used as the GenAI model when it exposes a + # genai_config.json (as produced by the ModelBuilder pass). + genai_model_path = config.genai_model_path + if genai_model_path is None and generation_metrics: + model_dir = Path(model.model_path) + model_dir = model_dir if model_dir.is_dir() else model_dir.parent + if (model_dir / "genai_config.json").is_file(): + genai_model_path = str(model_dir) + logger.info( + "Using optimized ONNX model directory %s as the GenAI model for generation metrics.", + genai_model_path, + ) + else: + logger.warning( + "Generation metrics %s requested but no genai_config.json was found in %s; skipping them.", + sorted(generation_metrics), + model_dir, + ) + + if not genai_model_path: + return + + # first_token_20 generates 20 tokens; tf5t measures the time to the first 5 tokens. + gen_max_new_tokens = 20 if "first_token_20" in generation_metrics else config.generate_max_new_tokens + gen_first_n = 5 if "tf5t" in generation_metrics else config.first_n_tokens_timed + gen_results = self.compare_generation( + config, + ref_model, + ref_model_path=ref_path, + genai_model_path=genai_model_path, + max_new_tokens=gen_max_new_tokens, + first_n=gen_first_n, + ) + longest_common = gen_results["longest_common_token_sequence"] + results.update(gen_results) + results["genai_model_path"] = genai_model_path + + # Surface the explicitly requested named metrics for easy inspection. 
+ if "first_token_20" in generation_metrics: + results["first_token_20"] = { + "transformers_first_token": gen_results.get("transformers_first_token"), + "genai_first_token": gen_results.get("genai_first_token"), + "first_token_matches": gen_results.get("first_token_matches"), + "transformers_second_token": gen_results.get("transformers_second_token"), + "genai_second_token": gen_results.get("genai_second_token"), + "second_token_matches": gen_results.get("second_token_matches"), + "matching_leading_tokens": longest_common, + } + logger.info( + "OnnxDiscrepancyCheck first_token_20: first_token_matches=%s (transformers=%s, genai=%s), " + "second_token_matches=%s (transformers=%s, genai=%s), matching_leading_tokens=%s", + gen_results.get("first_token_matches"), + gen_results.get("transformers_first_token"), + gen_results.get("genai_first_token"), + gen_results.get("second_token_matches"), + gen_results.get("transformers_second_token"), + gen_results.get("genai_second_token"), + longest_common, + ) + if "tft" in generation_metrics: + results["tft"] = { + "transformers_s": gen_results.get("transformers_time_to_first_token_s"), + "genai_s": gen_results.get("genai_time_to_first_token_s"), + } + logger.info( + "OnnxDiscrepancyCheck tft (time to first token): transformers=%s, genai=%s", + _format_seconds(gen_results.get("transformers_time_to_first_token_s")), + _format_seconds(gen_results.get("genai_time_to_first_token_s")), + ) + if "tf5t" in generation_metrics: + results["tf5t"] = { + "transformers_s": gen_results.get("transformers_time_to_first_n_tokens_s"), + "genai_s": gen_results.get("genai_time_to_first_n_tokens_s"), + } + logger.info( + "OnnxDiscrepancyCheck tf5t (time to first 5 tokens): transformers=%s, genai=%s", + _format_seconds(gen_results.get("transformers_time_to_first_n_tokens_s")), + _format_seconds(gen_results.get("genai_time_to_first_n_tokens_s")), + ) + + if config.min_longest_common_tokens is not None and longest_common < 
config.min_longest_common_tokens: + results["status"] = "failed" + gen_failure = ( + f"Longest common token sequence length {longest_common} is below " + f"threshold {config.min_longest_common_tokens}" + ) + results.setdefault("failures", []).append(gen_failure) + logger.error("ONNX model discrepancy check FAILED: %s", gen_failure) + + def _run_llama_cpp_comparison( + self, model: ONNXModelHandler, config, ref_model, ref_path, report_dir, generation_metrics, results + ): + # llama.cpp comparison: convert reference model to GGUF and compare latencies + if not config.llama_cpp: + return + preconverted_gguf_path = None + if model.model_attributes: + preconverted_gguf_path = model.model_attributes.get("reference_gguf_model_path") + try: + # first_token_20 restricts the comparison to a 20-token generation, mirroring the + # transformers vs GenAI path so no more than 20 generated tokens are validated. + gen_max_new_tokens = 20 if "first_token_20" in generation_metrics else config.generate_max_new_tokens + llama_results = self.compare_llama_cpp( + config, + ref_model, + output_dir=report_dir, + pytorch_latency_s=results.get("pytorch_latency_s"), + onnx_latency_s=results.get("onnx_latency_s"), + ref_model_path=ref_path, + preconverted_gguf_path=preconverted_gguf_path, + max_new_tokens=gen_max_new_tokens, + ) + results.update(llama_results) + + # Surface the llama.cpp vs transformers first-token comparison alongside the + # transformers vs GenAI comparison when first_token_20 is requested. 
+ if "first_token_20" in generation_metrics: + first_token_20 = results.setdefault("first_token_20", {}) + transformers_first_token = llama_results.get("llama_cpp_pytorch_first_token_id") + llama_first_token = llama_results.get("llama_cpp_first_token_id") + first_token_20.setdefault("transformers_first_token", transformers_first_token) + first_token_20["llama_cpp_first_token"] = llama_first_token + first_token_20["llama_cpp_first_token_matches"] = llama_results.get( + "llama_cpp_first_token_matches_pytorch" + ) + first_token_20.setdefault( + "transformers_second_token", llama_results.get("llama_cpp_pytorch_second_token_id") + ) + first_token_20["llama_cpp_second_token"] = llama_results.get("llama_cpp_second_token_id") + first_token_20["llama_cpp_second_token_matches"] = llama_results.get( + "llama_cpp_second_token_matches_pytorch" ) - results.setdefault("failures", []).append(gen_failure) - logger.error("ONNX model discrepancy check FAILED: %s", gen_failure) + first_token_20["llama_cpp_matching_leading_tokens"] = llama_results.get( + "llama_cpp_longest_common_token_sequence" + ) + logger.info( + "OnnxDiscrepancyCheck first_token_20 (llama.cpp): first_token_matches=%s " + "(transformers=%s, llama_cpp=%s), second_token_matches=%s (transformers=%s, llama_cpp=%s), " + "matching_leading_tokens=%s", + llama_results.get("llama_cpp_first_token_matches_pytorch"), + transformers_first_token, + llama_first_token, + llama_results.get("llama_cpp_second_token_matches_pytorch"), + llama_results.get("llama_cpp_pytorch_second_token_id"), + llama_results.get("llama_cpp_second_token_id"), + llama_results.get("llama_cpp_longest_common_token_sequence"), + ) + except Exception as exc: + logger.exception("OnnxDiscrepancyCheck llama.cpp comparison failed.") + results["status"] = "failed" + results.setdefault("failures", []).append(f"llama.cpp comparison failed: {exc}") + def _save_results(self, model: ONNXModelHandler, results, report_dir): # Save results to disk + results = 
_json_sanitize(results) report_path = Path(report_dir) / "discrepancy_check_results.json" report_path.parent.mkdir(parents=True, exist_ok=True) report_path.write_text(json.dumps(results, indent=2)) + logger.info("Saved discrepancy check results to %s", report_path) # Store results in model attributes so the CLI can persist them in the output directory model_attributes = dict(model.model_attributes) if model.model_attributes else {} model_attributes["discrepancy_check_results"] = results model.model_attributes = model_attributes - return model def _measure_speedup( self, ref_model, session, dataloader, io_config, torch_device, warmup_iterations, timing_iterations - ): - """Measure inference speedup of ONNX over PyTorch on the target device.""" + ) -> tuple[float, float, float] | None: + """Measure inference latencies and speedup of ONNX over PyTorch on the target device. + + Returns a tuple ``(pytorch_time, onnx_time, speedup)`` of the average PyTorch and ONNX + per-iteration latencies (in seconds) and the ONNX-over-PyTorch speedup, or ``None`` when + measurement is skipped. + """ if timing_iterations <= 0: logger.info( "OnnxDiscrepancyCheck speedup measurement skipped because timing_iterations=%d.", @@ -494,65 +897,380 @@ def _measure_speedup( torch_device, ) - return speedup - - def compare_generation(self, config: type[BasePassConfig], ref_model) -> int: - """Run generation on both transformers and GenAI, return longest common token sequence length.""" + return pytorch_time, onnx_time, speedup + + def compare_generation( + self, + config: type[BasePassConfig], + ref_model, + *, + ref_model_path: str, + genai_model_path: Optional[str] = None, + max_new_tokens: Optional[int] = None, + first_n: Optional[int] = None, + ) -> dict: + """Run generation on both transformers and GenAI and compare them. 
+ + Returns a dict with the longest common token sequence length, the first-generated-token + match between transformers and ONNX Runtime GenAI, and the time-to-first-token and + time-to-first-N-tokens latencies (in seconds) for both, where N is ``first_n`` + (defaults to ``config.first_n_tokens_timed``). + + ``genai_model_path``, ``max_new_tokens`` and ``first_n`` override the corresponding + config values when provided, which lets the caller request specific metrics such as + ``first_token_20`` (20-token generation) or ``tf5t`` (first 5 tokens). + """ try: import onnxruntime_genai as og except ImportError as exc: raise ImportError("Please install `onnxruntime-genai` to enable generation comparison.") from exc - from transformers import AutoTokenizer + from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList - tokenizer = AutoTokenizer.from_pretrained(config.reference_model_path) + genai_model_path = genai_model_path if genai_model_path is not None else config.genai_model_path + tokenizer = AutoTokenizer.from_pretrained(ref_model_path) + + max_new_tokens = config.generate_max_new_tokens if max_new_tokens is None else max_new_tokens + first_n_config = config.first_n_tokens_timed if first_n is None else first_n + first_n = max(1, min(first_n_config, max_new_tokens)) if max_new_tokens > 0 else 0 # Transformers generation input_ids = tokenizer(config.generate_prompt, return_tensors="pt").input_ids - input_ids = input_ids.to(ref_model.device) import torch + input_ids = input_ids.to(ref_model.device) + use_cuda_sync = ref_model.device.type == "cuda" + + prompt_token_count = input_ids.shape[-1] + transformers_latency = {"start": None, "ttft": None, "ttfn": None} + + class _TransformersLatencyStopCriteria(StoppingCriteria): + def __call__(self, generated_ids, scores, **kwargs) -> bool: + generated_token_count = generated_ids.shape[-1] - prompt_token_count + if generated_token_count >= 1 and transformers_latency["ttft"] is None: + 
transformers_latency["ttft"] = time.perf_counter() - transformers_latency["start"] + if generated_token_count >= first_n and transformers_latency["ttfn"] is None: + transformers_latency["ttfn"] = time.perf_counter() - transformers_latency["start"] + return False + with torch.no_grad(): + if use_cuda_sync: + torch.cuda.synchronize() + start = time.perf_counter() + transformers_latency["start"] = start transformers_output = ref_model.generate( input_ids, - max_new_tokens=config.generate_max_new_tokens, + max_new_tokens=max_new_tokens, do_sample=False, + stopping_criteria=StoppingCriteriaList([_TransformersLatencyStopCriteria()]), + ) + if use_cuda_sync: + torch.cuda.synchronize() + transformers_elapsed = time.perf_counter() - start + if max_new_tokens > 0: + transformers_ttft = ( + transformers_latency["ttft"] if transformers_latency["ttft"] is not None else transformers_elapsed + ) + transformers_ttfn = ( + transformers_latency["ttfn"] if transformers_latency["ttfn"] is not None else transformers_elapsed ) + else: + transformers_ttft = None + transformers_ttfn = None transformers_tokens = transformers_output[0].cpu().tolist() - # ONNX Runtime GenAI generation - genai_model = og.Model(config.genai_model_path) - genai_tokenizer = og.Tokenizer(genai_model) - genai_input_ids = genai_tokenizer.encode(config.generate_prompt) + # ONNX Runtime GenAI generation. Feed GenAI the exact same prompt token ids produced by the + # transformers tokenizer (including any special/BOS tokens) rather than re-encoding with the + # GenAI tokenizer. ``og.Tokenizer.encode`` does not add special tokens by default, so + # re-encoding would drop the BOS token that transformers adds, giving the two models different + # inputs and a spurious first-token mismatch even when the models are numerically identical. 
+ genai_model = og.Model(genai_model_path) + genai_input_ids = input_ids[0].cpu().tolist() params = og.GeneratorParams(genai_model) - params.set_search_options(max_length=len(genai_input_ids) + config.generate_max_new_tokens, do_sample=False) + params.set_search_options(max_length=len(genai_input_ids) + max_new_tokens, do_sample=False) generator = og.Generator(genai_model, params) generator.append_tokens([genai_input_ids]) genai_tokens = list(genai_input_ids) + genai_prompt_token_count = len(genai_input_ids) + genai_ttft = None + genai_ttfn = None + num_generated = 0 + start = time.perf_counter() while not generator.is_done(): generator.generate_next_token() genai_tokens.append(generator.get_next_tokens()[0]) + num_generated += 1 + if num_generated == 1: + genai_ttft = time.perf_counter() - start + if num_generated == first_n: + genai_ttfn = time.perf_counter() - start del generator - longest_common = _longest_common_token_sequence(transformers_tokens, genai_tokens) + # Longest common leading token sequence between transformers and ONNX Runtime GenAI, measured + # over the generated tokens only (the prompt is shared and identical since GenAI is fed the same + # token ids). This bounds the count by ``max_new_tokens`` so, e.g., first_token_20 never + # validates more than 20 generated tokens. + transformers_generated_tokens = transformers_tokens[prompt_token_count:] + genai_generated_tokens = genai_tokens[genai_prompt_token_count:] + longest_common = _longest_common_token_sequence(transformers_generated_tokens, genai_generated_tokens) + + # First generated token comparison (transformers vs ONNX Runtime GenAI). 
+ transformers_first_token = ( + transformers_tokens[prompt_token_count] if len(transformers_tokens) > prompt_token_count else None + ) + genai_first_token = ( + genai_tokens[genai_prompt_token_count] if len(genai_tokens) > genai_prompt_token_count else None + ) + first_token_matches = transformers_first_token is not None and transformers_first_token == genai_first_token + + # Second generated token comparison (transformers vs ONNX Runtime GenAI). + transformers_second_token = ( + transformers_tokens[prompt_token_count + 1] if len(transformers_tokens) > prompt_token_count + 1 else None + ) + genai_second_token = ( + genai_tokens[genai_prompt_token_count + 1] if len(genai_tokens) > genai_prompt_token_count + 1 else None + ) + second_token_matches = transformers_second_token is not None and transformers_second_token == genai_second_token + + gen_results = { + "longest_common_token_sequence": longest_common, + "first_n_tokens_timed": first_n, + "transformers_first_token": transformers_first_token, + "genai_first_token": genai_first_token, + "first_token_matches": first_token_matches, + "transformers_second_token": transformers_second_token, + "genai_second_token": genai_second_token, + "second_token_matches": second_token_matches, + "transformers_time_to_first_token_s": transformers_ttft, + "transformers_time_to_first_n_tokens_s": transformers_ttfn, + "genai_time_to_first_token_s": genai_ttft, + "genai_time_to_first_n_tokens_s": genai_ttfn, + } gen_summary = ( f"OnnxDiscrepancyCheck generation comparison: " f"transformers_len={len(transformers_tokens)}, genai_len={len(genai_tokens)}, " - f"longest_common_token_sequence={longest_common}" + f"longest_common_token_sequence={longest_common}, " + f"first_token_matches={first_token_matches}, " + f"transformers_ttft={_format_seconds(transformers_ttft)}, " + f"transformers_time_to_first_{first_n}_tokens={_format_seconds(transformers_ttfn)}, " + f"genai_ttft={_format_seconds(genai_ttft)}, " + 
f"genai_time_to_first_{first_n}_tokens={_format_seconds(genai_ttfn)}" ) logger.info(gen_summary) - return longest_common + return gen_results + + @staticmethod + def _get_llama_env_python(env_path: str) -> str: + """Return the Python interpreter path inside the given virtual environment. + + Checks both the POSIX (``bin/python``) and Windows (``Scripts/python.exe``) + layouts so the method works cross-platform. + """ + env = Path(env_path) + for candidate in (env / "bin" / "python", env / "Scripts" / "python.exe"): + if candidate.exists(): + return str(candidate) + raise RuntimeError( + f"Could not find a Python interpreter in the llama_env at '{env_path}'. " + "Create the environment with: " + "python -m venv llama_env && llama_env/bin/pip install gguf safetensors " + "llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu" + ) - def _export_reference_model(self, ref_model, output_model_path: str): - """Save the reference PyTorch model weights for direct comparison.""" + @staticmethod + def _get_convert_script(env_path: str) -> str: + r"""Return the path to the ``convert_hf_to_gguf.py`` conversion script. + + The script and the accompanying ``conversion/`` package must be placed at the root + of the virtual environment directory (i.e. ``{env_path}/convert_hf_to_gguf.py`` and + ``{env_path}/conversion/``). 
Obtain them via a sparse clone:: + + git clone --depth=1 --filter=blob:none --sparse \ + https://github.com/ggerganov/llama.cpp.git /tmp/llama_cpp_repo + git -C /tmp/llama_cpp_repo sparse-checkout set convert_hf_to_gguf.py conversion + cp /tmp/llama_cpp_repo/convert_hf_to_gguf.py {env_path}/ + cp -r /tmp/llama_cpp_repo/conversion {env_path}/ + """ + env = Path(env_path) + script = env / "convert_hf_to_gguf.py" + conversion_pkg = env / "conversion" + setup_cmd = ( + f"git clone --depth=1 --filter=blob:none --sparse " + f"https://github.com/ggerganov/llama.cpp.git /tmp/llama_cpp_repo && " + f"git -C /tmp/llama_cpp_repo sparse-checkout set convert_hf_to_gguf.py conversion && " + f"cp /tmp/llama_cpp_repo/convert_hf_to_gguf.py {env_path}/ && " + f"cp -r /tmp/llama_cpp_repo/conversion {env_path}/" + ) + if not script.exists(): + raise RuntimeError( + f"Could not find convert_hf_to_gguf.py in '{env_path}'. " + f"Clone it from the llama.cpp repository: {setup_cmd}" + ) + if not conversion_pkg.exists(): + raise RuntimeError( + f"Could not find the 'conversion' package in '{env_path}'. " + "convert_hf_to_gguf.py requires the 'conversion/' directory alongside it. " + f"Clone it from the llama.cpp repository: {setup_cmd}" + ) + return str(script) + + def compare_llama_cpp( + self, + config: type[BasePassConfig], + ref_model, + output_dir: str, + pytorch_latency_s: Optional[float] = None, + onnx_latency_s: Optional[float] = None, + *, + ref_model_path: str, + preconverted_gguf_path: Optional[str] = None, + max_new_tokens: Optional[int] = None, + ) -> dict: + """Convert the reference model to GGUF and compare inference with llama.cpp. + + All llama-cpp-python operations are executed inside the ``llama_env`` virtual + environment via subprocess, so the main Olive process does not need + llama-cpp-python installed. + + The method: + + 1. Saves the reference model and tokenizer to ``output_dir/hf_model`` using + ``save_pretrained`` (standard HuggingFace format). + 2. 
Calls ``convert_hf_to_gguf.py`` from llama.cpp via the command line to + convert the saved directory to a GGUF F32 file at ``output_dir/model.gguf``. + 3. Runs ``_LLAMA_CPP_HELPER_SCRIPT`` inside ``llama_env`` to measure + first-token latency with llama-cpp-python on the converted GGUF file. + 4. Returns a metrics dict with the llama.cpp results and speedup ratios + relative to PyTorch and ONNX when those latencies are provided. + """ import torch + from transformers import AutoTokenizer + + # Resolve the llama_env Python interpreter and conversion script + env_path = config.llama_cpp_env_path or "llama_env" + python_path = self._get_llama_env_python(env_path) + + # Tokenize the generation prompt using the main-env tokenizer + tokenizer = AutoTokenizer.from_pretrained(ref_model_path) + encoded = tokenizer(config.generate_prompt, return_tensors="pt") + prompt_token_ids: list[int] = encoded["input_ids"][0].tolist() + + max_new_tokens = config.generate_max_new_tokens if max_new_tokens is None else max_new_tokens + first_n = max(1, min(config.first_n_tokens_timed, max_new_tokens)) if max_new_tokens > 0 else 1 - output_dir = Path(output_model_path) - output_dir.mkdir(parents=True, exist_ok=True) + # Run generation with transformers to get the reference first token and the leading + # token sequence used for the longest-common-token comparison against llama.cpp. 
+ input_ids = torch.tensor([prompt_token_ids]).to(ref_model.device) + with torch.no_grad(): + gen_out = ref_model.generate(input_ids, max_new_tokens=max(1, max_new_tokens), do_sample=False) + pytorch_tokens: list[int] = gen_out[0].cpu().tolist() + prompt_token_count = len(prompt_token_ids) + pytorch_first_token_id = ( + pytorch_tokens[prompt_token_count] if len(pytorch_tokens) > prompt_token_count else None + ) + pytorch_second_token_id = ( + pytorch_tokens[prompt_token_count + 1] if len(pytorch_tokens) > prompt_token_count + 1 else None + ) + + output_dir_path = Path(output_dir) + output_dir_path.mkdir(parents=True, exist_ok=True) + model_dir = str(output_dir_path / "hf_model") + gguf_path = str(output_dir_path / "model.gguf") + script_path = str(output_dir_path / "llama_cpp_helper.py") + + if preconverted_gguf_path and Path(preconverted_gguf_path).exists(): + gguf_path = preconverted_gguf_path + logger.info("Using pre-converted GGUF from %s", gguf_path) + else: + convert_script = self._get_convert_script(env_path) + # Save model and tokenizer in standard HuggingFace format. + ref_model.save_pretrained(model_dir, safe_serialization=True) + tokenizer.save_pretrained(model_dir) + logger.info("Saved reference HuggingFace model and tokenizer to %s", model_dir) + + # Step 1: Convert to GGUF using the official convert_hf_to_gguf.py CLI. + subprocess.run( + [python_path, convert_script, model_dir, "--outfile", gguf_path, "--outtype", "f32"], + capture_output=True, + text=True, + check=True, + ) + logger.info("Converted HuggingFace model to GGUF at %s", gguf_path) + + # Step 2: Run inference inside llama_env using the pre-converted GGUF file. 
+ (output_dir_path / "llama_cpp_helper.py").write_text(_LLAMA_CPP_HELPER_SCRIPT) + + proc = subprocess.run( + [ + python_path, + script_path, + "--gguf_path", + gguf_path, + "--prompt_tokens", + json.dumps(prompt_token_ids), + "--max_new_tokens", + str(max_new_tokens), + "--first_n", + str(first_n), + ], + capture_output=True, + text=True, + check=True, + ) + + llama_out: dict = json.loads(proc.stdout) + + llama_first_token_id: Optional[int] = llama_out.get("first_token_id") + llama_generated_tokens: list[int] = llama_out.get("generated_tokens") or [] + llama_second_token_id: Optional[int] = llama_generated_tokens[1] if len(llama_generated_tokens) > 1 else None + llama_ttft: Optional[float] = llama_out.get("ttft") + llama_ttfn: Optional[float] = llama_out.get("ttfn") + llama_total: Optional[float] = llama_out.get("total_time") + + # Longest common leading token sequence between transformers and llama.cpp, measured over + # the generated tokens only (the prompt is shared and identical). This bounds the count by + # ``max_new_tokens`` so, e.g., first_token_20 never validates more than 20 generated tokens. 
+ pytorch_generated_tokens = pytorch_tokens[prompt_token_count:] + llama_longest_common = _longest_common_token_sequence(pytorch_generated_tokens, llama_generated_tokens) + + # Speedup: compare llama.cpp TTFT with single-pass PyTorch / ONNX latency + llama_speedup_vs_pytorch: Optional[float] = ( + pytorch_latency_s / llama_ttft if (pytorch_latency_s is not None and llama_ttft) else None + ) + llama_speedup_vs_onnx: Optional[float] = ( + onnx_latency_s / llama_ttft if (onnx_latency_s is not None and llama_ttft) else None + ) + + results = { + "llama_cpp_pytorch_first_token_id": pytorch_first_token_id, + "llama_cpp_first_token_id": llama_first_token_id, + "llama_cpp_first_token_matches_pytorch": llama_first_token_id == pytorch_first_token_id, + "llama_cpp_pytorch_second_token_id": pytorch_second_token_id, + "llama_cpp_second_token_id": llama_second_token_id, + "llama_cpp_second_token_matches_pytorch": ( + pytorch_second_token_id is not None and llama_second_token_id == pytorch_second_token_id + ), + "llama_cpp_longest_common_token_sequence": llama_longest_common, + "llama_cpp_ttft_s": llama_ttft, + "llama_cpp_ttfn_s": llama_ttfn, + "llama_cpp_total_time_s": llama_total, + "llama_cpp_speedup_vs_pytorch": llama_speedup_vs_pytorch, + "llama_cpp_speedup_vs_onnx": llama_speedup_vs_onnx, + } + + logger.info( + "OnnxDiscrepancyCheck llama.cpp comparison: first_token_matches_pytorch=%s, " + "matching_leading_tokens=%s, ttft=%s, ttfn=%s, total=%s, speedup_vs_pytorch=%s, speedup_vs_onnx=%s", + results["llama_cpp_first_token_matches_pytorch"], + llama_longest_common, + _format_seconds(llama_ttft), + _format_seconds(llama_ttfn), + _format_seconds(llama_total), + f"{llama_speedup_vs_pytorch:.2f}x" if llama_speedup_vs_pytorch is not None else "n/a", + f"{llama_speedup_vs_onnx:.2f}x" if llama_speedup_vs_onnx is not None else "n/a", + ) - ref_pt_path = output_dir / "reference_model.pt" - torch.save(ref_model.state_dict(), str(ref_pt_path)) - logger.info("Reference PyTorch model 
saved to %s", ref_pt_path) + return results diff --git a/olive/passes/onnx/model_builder.py b/olive/passes/onnx/model_builder.py index e4aa0fdbc..d862fe214 100644 --- a/olive/passes/onnx/model_builder.py +++ b/olive/passes/onnx/model_builder.py @@ -18,7 +18,7 @@ from huggingface_hub.constants import HF_HUB_CACHE from packaging import version -from olive.common.hf.utils import is_test_model_dir +from olive.common.hf.utils import has_test_model_weights, is_test_model_dir from olive.constants import Precision from olive.hardware.accelerator import AcceleratorSpec, Device from olive.hardware.constants import ExecutionProvider @@ -249,9 +249,14 @@ def _run_for_config( "ModelBuilder requires test_model_path to be set when test_model_config is provided. " "Please specify the path where the test model should be saved." ) - if not is_test_model_dir(model.test_model_path): + # Materialize the reference weights when the test-model directory is missing or only + # contains a config (as created by SaveTestModelConfig). This guarantees the ONNX + # model is built from the exact same saved weights that OnnxDiscrepancyCheck later + # loads as the reference; otherwise the model builder would initialize its own + # weights and the discrepancy check would compare against a different model. 
+ if not is_test_model_dir(model.test_model_path) or not has_test_model_weights(model.test_model_path): model.load_model(cache_model=False) - model_path = model.test_model_path + model_path = str(Path(model.test_model_path).resolve()) # provide the model path as input path, model builder uses input_path for quantized models input_path = model_path if model.adapter_path: diff --git a/olive/passes/pytorch/convert_hf_to_gguf.py b/olive/passes/pytorch/convert_hf_to_gguf.py new file mode 100644 index 000000000..9654a7c57 --- /dev/null +++ b/olive/passes/pytorch/convert_hf_to_gguf.py @@ -0,0 +1,91 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import logging +import subprocess +import sys +from pathlib import Path + +from olive.hardware.accelerator import AcceleratorSpec +from olive.model import HfModelHandler +from olive.passes import Pass +from olive.passes.pass_config import BasePassConfig, PassConfigParam + +logger = logging.getLogger(__name__) + + +class ConvertHfToGGUF(Pass): + """Convert the test HuggingFace model directory to a GGUF file.""" + + @classmethod + def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassConfigParam]: + return { + "llama_cpp_env_path": PassConfigParam( + type_=str, + default_value="llama_env", + description="Path to the llama.cpp virtual environment containing convert_hf_to_gguf.py.", + ), + "reference_model_path": PassConfigParam( + type_=str, + default_value=None, + description="Fallback model path to convert when test_model_path is not set.", + ), + "gguf_file_name": PassConfigParam( + type_=str, + default_value="model.gguf", + description="GGUF output filename.", + ), + } + + @staticmethod + def _get_python_executable(env_path: Path) -> str: + if sys.platform.startswith("win"): + return str(env_path / 
"Scripts" / "python.exe") + return str(env_path / "bin" / "python") + + def _run_for_config( + self, model: HfModelHandler, config: type[BasePassConfig], output_model_path: str + ) -> HfModelHandler: + source = model.test_model_path or config.reference_model_path + if not source: + logger.info("ConvertHfToGGUF skipped: no source model directory was provided.") + return model + + source_path = Path(source) + if not source_path.is_dir(): + logger.info("ConvertHfToGGUF skipped: source model directory does not exist: %s", source_path) + return model + + gguf_path = source_path / config.gguf_file_name + if gguf_path.exists(): + logger.info("ConvertHfToGGUF skipped: GGUF already exists at %s", gguf_path) + model_attributes = dict(model.model_attributes) if model.model_attributes else {} + model_attributes["reference_gguf_model_path"] = str(gguf_path) + model.model_attributes = model_attributes + return model + + env_path = Path(config.llama_cpp_env_path).resolve() + convert_script = env_path / "convert_hf_to_gguf.py" + conversion_pkg = env_path / "conversion" + python_path = self._get_python_executable(env_path) + + if not Path(python_path).exists(): + raise RuntimeError(f"Could not find llama_env python executable: {python_path}") + if not convert_script.exists(): + raise RuntimeError(f"Could not find convert_hf_to_gguf.py at: {convert_script}") + if not conversion_pkg.exists(): + raise RuntimeError(f"Could not find conversion package at: {conversion_pkg}") + + subprocess.run( + [python_path, str(convert_script), str(source_path), "--outfile", str(gguf_path), "--outtype", "f32"], + capture_output=True, + text=True, + check=True, + ) + logger.info("Converted test model to GGUF at %s", gguf_path) + + model_attributes = dict(model.model_attributes) if model.model_attributes else {} + model_attributes["reference_gguf_model_path"] = str(gguf_path) + model.model_attributes = model_attributes + return model diff --git a/olive/passes/pytorch/save_test_model_config.py 
b/olive/passes/pytorch/save_test_model_config.py new file mode 100644 index 000000000..f22d78388 --- /dev/null +++ b/olive/passes/pytorch/save_test_model_config.py @@ -0,0 +1,89 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import logging +from typing import Optional + +from olive.hardware.accelerator import AcceleratorSpec +from olive.model import HfModelHandler +from olive.passes import Pass +from olive.passes.pass_config import BasePassConfig, PassConfigParam + +logger = logging.getLogger(__name__) + + +class SaveTestModelConfig(Pass): + """Saves a random-initialised HuggingFace model to the test_model_path directory. + + When ``test_model_path`` and ``test_model_config`` are set on the input + ``HfModelHandler``, this pass creates the target directory, writes + ``config.json`` (with the modified number of hidden layers), the Olive + test-model marker file, *and* the random model weights (safetensors). + + The pass is a no-op when neither ``test_model_path`` nor + ``test_model_config`` is set on the model, and it is idempotent — running + it a second time on a directory that already contains both the marker file + and model weights is safe. + + The input model is returned unchanged. + """ + + @classmethod + def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassConfigParam]: + return { + "attn_impl": PassConfigParam( + type_=Optional[str], + default_value="sdpa", + description=( + "Attention implementation baked into the saved test model's ``config.json`` " + "(written as ``_attn_implementation``). Downstream passes such as " + "``OnnxDiscrepancyCheck`` that load this reference model will use it. " + "Common values are ``'eager'``, ``'sdpa'``, and ``'flash_attention_2'``. " + "Defaults to ``'sdpa'``. 
When ``None`` the transformers default is used." + ), + ), + } + + def _run_for_config( + self, model: HfModelHandler, config: type[BasePassConfig], output_model_path: str + ) -> HfModelHandler: + import json + from pathlib import Path + + from olive.common.hf.utils import is_test_model_dir + + test_model_path = model.test_model_path + test_model_config = model.test_model_config + if not (test_model_path and test_model_config): + logger.debug( + "SaveTestModelConfig: test_model_path=%r, test_model_config=%r — nothing to save.", + test_model_path, + test_model_config, + ) + return model + + test_model_dir = Path(test_model_path) + _has_weights = is_test_model_dir(test_model_dir) and ( + any(test_model_dir.glob("*.safetensors")) or any(test_model_dir.glob("pytorch_model*.bin")) + ) + if not _has_weights: + logger.info("Saving test random model to %s", test_model_path) + # load_model calls load_model_from_task which creates a random-initialised model + # from the reduced config and persists it (weights + config.json + marker) to + # test_model_path on the first call. + model.load_model(cache_model=False) + else: + logger.debug("Test model already saved at %s — skipping model save.", test_model_path) + + # Bake the attention implementation into the saved config.json so downstream passes + # (e.g. OnnxDiscrepancyCheck) that load this reference model use the same setting. 
+ if config.attn_impl: + config_json_path = test_model_dir / "config.json" + if config_json_path.is_file(): + config_data = json.loads(config_json_path.read_text()) + if config_data.get("_attn_implementation") != config.attn_impl: + config_data["_attn_implementation"] = config.attn_impl + config_json_path.write_text(json.dumps(config_data, indent=2)) + logger.info("Set _attn_implementation=%s in %s", config.attn_impl, config_json_path) + return model diff --git a/test/cli/test_base.py b/test/cli/test_base.py index bb34cef3f..428bf7409 100644 --- a/test/cli/test_base.py +++ b/test/cli/test_base.py @@ -58,7 +58,7 @@ }, "load_kwargs": { "trust_remote_code": False, - "attn_implementation": "eager", + "attn_implementation": "sdpa", }, }, ), @@ -76,7 +76,7 @@ "model_path": "my_model/my_model", "load_kwargs": { "trust_remote_code": True, - "attn_implementation": "eager", + "attn_implementation": "sdpa", }, }, ), @@ -94,7 +94,7 @@ "model_path": "hf_model", "load_kwargs": { "trust_remote_code": False, - "attn_implementation": "eager", + "attn_implementation": "sdpa", }, }, ), @@ -141,7 +141,7 @@ "model_path": "hf", "load_kwargs": { "trust_remote_code": False, - "attn_implementation": "eager", + "attn_implementation": "sdpa", }, }, ), @@ -237,13 +237,14 @@ def test_get_input_model_config_hf_test_model(_): task="text-generation", model_script=None, script_dir=None, - test="saved_test_model", + test=True, + output_path="out_dir", ) config = get_input_model_config(args) assert config["test_model_config"] == {"hidden_layers": 2} - assert config["test_model_path"] == "saved_test_model" + assert config["test_model_path"] == str(Path("out_dir") / "reference_hf_model") @patch("huggingface_hub.repo_exists", return_value=True) @@ -257,7 +258,7 @@ def test_get_input_model_config_hf_test_model_requires_path_without_output_path( test=True, ) - with pytest.raises(ValueError, match=r"--test requires an explicit folder when output_path is not available\."): + with pytest.raises(ValueError, 
match=r"--test requires --output_path to store the generated reference model\."): get_input_model_config(args) @@ -338,3 +339,218 @@ def test_get_input_model_config_no_crash_without_onnx_file_name(tmp_path): # model_path should remain unchanged since no onnx_file_name to guide rewriting assert config["config"]["model_path"] == stale_model_path + + +def _discrepancy_run_config(): + return { + "input_model": {"type": "HfModel", "test_model_path": "ref_model"}, + "output_dir": "out_dir", + } + + +def test_add_discrepancy_check_pass_default_enables_mae_only(): + from olive.cli.base import add_discrepancy_check_pass + + run_config = add_discrepancy_check_pass(_discrepancy_run_config()) + + passes = run_config["passes"] + # SaveTestModelConfig must be the first pass + first_key = next(iter(passes)) + assert passes[first_key]["type"] == "SaveTestModelConfig" + + pass_config = passes["discrepancy_check"] + assert pass_config["type"] == "OnnxDiscrepancyCheck" + assert pass_config["reference_model_path"] == str(Path("ref_model").resolve()) + # default: mae only -> test_metrics stores the human-readable selection + assert pass_config["test_metrics"] == ["mae"] + + +def test_add_discrepancy_check_pass_speedup_only_disables_mae(): + from olive.cli.base import add_discrepancy_check_pass + + run_config = add_discrepancy_check_pass(_discrepancy_run_config(), metrics=["speedup"]) + + passes = run_config["passes"] + first_key = next(iter(passes)) + assert passes[first_key]["type"] == "SaveTestModelConfig" + + pass_config = passes["discrepancy_check"] + assert pass_config["test_metrics"] == ["speedup"] + + +def test_add_discrepancy_check_pass_mae_only_disables_speedup(): + from olive.cli.base import add_discrepancy_check_pass + + run_config = add_discrepancy_check_pass(_discrepancy_run_config(), metrics=["mae"]) + + passes = run_config["passes"] + first_key = next(iter(passes)) + assert passes[first_key]["type"] == "SaveTestModelConfig" + + pass_config = passes["discrepancy_check"] + 
assert pass_config["test_metrics"] == ["mae"] + + +def test_warn_unused_test_metrics_logs_when_test_disabled(): + from olive.cli.base import warn_unused_test_metrics + + with patch("olive.cli.base.logger") as mock_logger: + warn_unused_test_metrics(test=None, metrics=["speedup"]) + + mock_logger.warning.assert_called_once() + assert "--test_metrics is ignored" in mock_logger.warning.call_args[0][0] + + +def test_warn_unused_test_metrics_silent_when_test_enabled(): + from olive.cli.base import warn_unused_test_metrics + + with patch("olive.cli.base.logger") as mock_logger: + warn_unused_test_metrics(test=True, metrics=["speedup"]) + + mock_logger.warning.assert_not_called() + + +def test_warn_unused_test_metrics_logs_llama_path_when_test_disabled(): + from olive.cli.base import warn_unused_test_metrics + + with patch("olive.cli.base.logger") as mock_logger: + warn_unused_test_metrics(test=None, metrics=None, llama_path="/path/to/llama_env") + + mock_logger.warning.assert_called_once() + assert "--test_llama_path is ignored" in mock_logger.warning.call_args[0][0] + + +def test_add_discrepancy_check_pass_llama_env_path_sets_config(): + from olive.cli.base import add_discrepancy_check_pass + + run_config = add_discrepancy_check_pass(_discrepancy_run_config(), llama_env_path="/path/to/llama_env") + + passes = run_config["passes"] + first_key = next(iter(passes)) + assert passes[first_key]["type"] == "SaveTestModelConfig" + assert passes["convert_hf_to_gguf"]["type"] == "ConvertHfToGGUF" + assert passes["convert_hf_to_gguf"]["llama_cpp_env_path"] == "/path/to/llama_env" + + pass_config = passes["discrepancy_check"] + assert pass_config["llama_cpp"] is True + assert pass_config["llama_cpp_env_path"] == "/path/to/llama_env" + + +def test_add_discrepancy_check_pass_no_llama_env_path_omits_llama_config(): + from olive.cli.base import add_discrepancy_check_pass + + run_config = add_discrepancy_check_pass(_discrepancy_run_config()) + + passes = run_config["passes"] + first_key 
= next(iter(passes)) + assert passes[first_key]["type"] == "SaveTestModelConfig" + + pass_config = passes["discrepancy_check"] + assert "convert_hf_to_gguf" not in passes + assert "llama_cpp" not in pass_config + assert "llama_cpp_env_path" not in pass_config + + +def test_add_discrepancy_check_pass_updates_existing_pass(): + """When OnnxDiscrepancyCheck already exists in the config, its runtime fields are updated.""" + from olive.cli.base import add_discrepancy_check_pass + + # Simulate a config generated by `olive optimize --dry_run --test` - the pass already exists + # with stale settings (old output dir, only mae was requested at generate-time). + config = _discrepancy_run_config() + config["passes"] = { + "discrepancy_check": { + "type": "OnnxDiscrepancyCheck", + "reference_model_path": "/old/abs/path", + "report_output_dir": "/old/out_dir", + "test_metrics": ["mae"], + } + } + config["input_model"]["test_model_path"] = "new_ref_model" + config["output_dir"] = "new_out_dir" + + result = add_discrepancy_check_pass(config, metrics=["mae", "speedup"], llama_env_path="/path/to/llama_env") + + passes = result["passes"] + # SaveTestModelConfig must be injected at the beginning + first_key = next(iter(passes)) + assert passes[first_key]["type"] == "SaveTestModelConfig" + assert passes["convert_hf_to_gguf"]["type"] == "ConvertHfToGGUF" + assert passes["convert_hf_to_gguf"]["llama_cpp_env_path"] == "/path/to/llama_env" + assert passes["convert_hf_to_gguf"]["reference_model_path"] == str(Path("new_ref_model").resolve()) + + pass_config = passes["discrepancy_check"] + # Reference model path and output dir must be updated to the current values. + assert pass_config["reference_model_path"] == str(Path("new_ref_model").resolve()) + assert pass_config["report_output_dir"] == "new_out_dir" + # test_metrics must reflect the newly requested metrics. 
+ assert pass_config["test_metrics"] == ["mae", "speedup"] + + +def test_add_discrepancy_check_pass_updates_existing_pass_speedup_only(): + """Updating an existing pass with speedup-only metrics updates test_metrics.""" + from olive.cli.base import add_discrepancy_check_pass + + config = _discrepancy_run_config() + config["passes"] = { + "dc": { + "type": "onnxdiscrepancycheck", # case-insensitive type match + "reference_model_path": "/old/path", + "test_metrics": ["mae"], + } + } + + result = add_discrepancy_check_pass(config, metrics=["speedup"]) + + passes = result["passes"] + # SaveTestModelConfig must be injected at the beginning + first_key = next(iter(passes)) + assert passes[first_key]["type"] == "SaveTestModelConfig" + + pass_config = passes["dc"] + assert pass_config["test_metrics"] == ["speedup"] + + from olive.cli.base import _parse_test_metrics + + assert _parse_test_metrics("mae,speedup") == ["mae", "speedup"] + + +def test_parse_test_metrics_single(): + from olive.cli.base import _parse_test_metrics + + assert _parse_test_metrics("mae") == ["mae"] + + +def test_parse_test_metrics_accepts_generation_metrics(): + from olive.cli.base import _parse_test_metrics + + assert _parse_test_metrics("first_token_20,tft,tf5t") == ["first_token_20", "tft", "tf5t"] + + +def test_parse_test_metrics_invalid_raises(): + import argparse + + from olive.cli.base import _parse_test_metrics + + with pytest.raises(argparse.ArgumentTypeError, match="invalid choice"): + _parse_test_metrics("unknown") + + +def test_flatten_test_metrics_nested_lists(): + from olive.cli.base import _flatten_test_metrics + + # Simulates: --test_metrics mae,speedup → [["mae", "speedup"]] + assert _flatten_test_metrics([["mae", "speedup"]]) == ["mae", "speedup"] + + +def test_flatten_test_metrics_space_separated_tokens(): + from olive.cli.base import _flatten_test_metrics + + # Simulates: --test_metrics mae speedup → [["mae"], ["speedup"]] + assert _flatten_test_metrics([["mae"], ["speedup"]]) == 
["mae", "speedup"] + + +def test_flatten_test_metrics_none_returns_none(): + from olive.cli.base import _flatten_test_metrics + + assert _flatten_test_metrics(None) is None diff --git a/test/cli/test_cli.py b/test/cli/test_cli.py index 59817d830..6b7a09849 100644 --- a/test/cli/test_cli.py +++ b/test/cli/test_cli.py @@ -141,7 +141,7 @@ def test_workflow_run_command_with_overrides(mock_repo_exists, mock_run, tmp_pat "input_model": { "type": "HfModel", "model_path": "hf-internal-testing/tiny-random-LlamaForCausalLM", - "load_kwargs": {"attn_implementation": "eager", "trust_remote_code": False}, + "load_kwargs": {"attn_implementation": "sdpa", "trust_remote_code": False}, }, "engine": {}, "output_dir": str(Path("new_output_path").resolve()), @@ -163,7 +163,7 @@ def test_workflow_run_command_with_test_override(mock_run, tmp_path): "input_model": { "type": "HfModel", "model_path": "hf-internal-testing/tiny-random-LlamaForCausalLM", - "load_kwargs": {"attn_implementation": "eager", "trust_remote_code": False}, + "load_kwargs": {"attn_implementation": "sdpa", "trust_remote_code": False}, }, "output_dir": str(tmp_path / "output"), } @@ -173,25 +173,28 @@ def test_workflow_run_command_with_test_override(mock_run, tmp_path): cli_main(command_args) - test_model_path = str(tmp_path / "output" / "test_model") + test_model_path = str(tmp_path / "output" / "reference_hf_model") output_dir = str(tmp_path / "output") mock_run.assert_called_once_with( { "input_model": { "type": "HfModel", "model_path": "hf-internal-testing/tiny-random-LlamaForCausalLM", - "load_kwargs": {"attn_implementation": "eager", "trust_remote_code": False}, + "load_kwargs": {"attn_implementation": "sdpa", "trust_remote_code": False}, "test_model_config": {"hidden_layers": 2}, "test_model_path": test_model_path, }, "output_dir": output_dir, "passes": { + "save_test_model_config": {"type": "SaveTestModelConfig"}, "discrepancy_check": { "type": "OnnxDiscrepancyCheck", "reference_model_path": test_model_path, - 
"max_mae": 0.1, "report_output_dir": output_dir, - } + "test_metrics": ["mae"], + "max_mae": 0.1, + "timing_iterations": 0, + }, }, }, list_required_packages=False, @@ -211,7 +214,7 @@ def test_workflow_run_command_with_test_rejects_non_test_output_dir(tmp_path): "input_model": { "type": "HfModel", "model_path": "hf-internal-testing/tiny-random-LlamaForCausalLM", - "load_kwargs": {"attn_implementation": "eager", "trust_remote_code": False}, + "load_kwargs": {"attn_implementation": "sdpa", "trust_remote_code": False}, }, "output_dir": str(output_dir), } @@ -301,13 +304,11 @@ def test_finetune_command(_, mock_run, tmp_path): @patch("huggingface_hub.repo_exists", return_value=True) def test_optimize_command_test_model_config(_, tmp_path): output_dir = tmp_path / "output_dir" - test_model_dir = tmp_path / "saved_test_model" command_args = [ "optimize", "-m", "dummy-model-id", "--test", - str(test_model_dir), "--dry_run", "-o", str(output_dir), @@ -317,7 +318,7 @@ def test_optimize_command_test_model_config(_, tmp_path): config = json.loads((output_dir / "config.json").read_text()) assert config["input_model"]["test_model_config"] == {"hidden_layers": 2} - assert config["input_model"]["test_model_path"] == str(test_model_dir) + assert config["input_model"]["test_model_path"] == str(output_dir / "reference_hf_model") assert json.loads((output_dir / TEST_OUTPUT_MARKER_FILE).read_text())["type"] == "olive_hf_test_output" diff --git a/test/cli/test_cli_test_model_smoke.py b/test/cli/test_cli_test_model_smoke.py index 90c10ddff..9b9e7911e 100644 --- a/test/cli/test_cli_test_model_smoke.py +++ b/test/cli/test_cli_test_model_smoke.py @@ -26,6 +26,7 @@ "local/tiny-random-llama-b", "mistralai/Mistral-7B-Instruct-v0.3", "microsoft/Phi-3-mini-4k-instruct", + "Qwen/Qwen3-8B", ) MAX_ARTIFACT_SIZE_BYTES = 1024 * 1024 @@ -45,7 +46,7 @@ def _save_local_tiny_llama(model_path: Path): { "vocab_size": 32, "hidden_size": 128, - "intermediate_size": 256, + "intermediate_size": 128, 
"num_hidden_layers": 2, "num_attention_heads": 8, "num_key_value_heads": 8, @@ -70,8 +71,53 @@ def _save_local_tiny_llama(model_path: Path): ).save_pretrained(model_path) +def _save_local_tiny_qwen3(model_path: Path): + from transformers import PreTrainedTokenizerFast, Qwen3Config, Qwen3ForCausalLM + + model = Qwen3ForCausalLM( + Qwen3Config.from_dict( + { + "vocab_size": 32, + "hidden_size": 64, + "intermediate_size": 128, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "num_key_value_heads": 4, + "head_dim": 16, + "max_position_embeddings": 64, + "tie_word_embeddings": False, + } + ) + ) + model.save_pretrained(model_path) + + tokenizer = Tokenizer( + WordLevel( + vocab={"": 0, "": 1, "": 2, "hello": 3, "world": 4}, + unk_token="", + ) + ) + tokenizer.pre_tokenizer = Whitespace() + PreTrainedTokenizerFast( + tokenizer_object=tokenizer, + bos_token="", + eos_token="", + pad_token="", + ).save_pretrained(model_path) + + +def _save_local_tiny_model(model_id: str, model_path: Path): + if model_id.startswith("Qwen/"): + _save_local_tiny_qwen3(model_path) + else: + _save_local_tiny_llama(model_path) + + def _set_offline_gptq_data_config(config_path: Path): config = json.loads(config_path.read_text()) + # The tiny smoke-test fixtures use small hidden sizes, so the default GPTQ group_size of 128 + # is too large (in_features must be divisible by group_size). Use a small group_size. 
+ config["passes"]["gptq"]["group_size"] = 32 config["passes"]["gptq"]["data_config"] = { "name": "test_gptq_dummy_data", "type": "DummyDataContainer", @@ -100,10 +146,10 @@ def _run_documented_test_model_smoke_flow(tmp_path: Path, model_id: str): model_name = model_id.replace("/", "--") model_path = tmp_path / "models" / model_name config_output_dir = tmp_path / f"{model_name}-test" - test_model_dir = tmp_path / f"{model_name}-test-model" run_output_dir = tmp_path / f"{model_name}-test-run" + test_model_dir = run_output_dir / "reference_hf_model" - _save_local_tiny_llama(model_path) + _save_local_tiny_model(model_id, model_path) # optimize -m arnir0/Tiny-LLM --device cpu --provider CPUExecutionProvider --precision int4 --output_path dump --dry_run _run_cli_main( [ @@ -125,14 +171,13 @@ def _run_documented_test_model_smoke_flow(tmp_path: Path, model_id: str): config_path = config_output_dir / "config.json" assert config_path.exists() _set_offline_gptq_data_config(config_path) - # run --config dump/config.json --test dump/test --output_path dump/run + # run --config dump/config.json --test --output_path dump/run _run_cli_main( [ "run", "--config", str(config_path), "--test", - str(test_model_dir), "--output_path", str(run_output_dir), ] @@ -160,8 +205,13 @@ def _assert_smoke_flows(self, tmp_path: Path): "config.json", "generation_config.json", "model.safetensors", + "tokenizer.json", + "tokenizer_config.json", TEST_MODEL_MARKER_FILE, } + # Some transformers versions additionally emit special_tokens_map.json when saving the + # tokenizer; treat it as optional so the assertion is version independent. 
+ optional_test_model_files = {"special_tokens_map.json"} expected_run_output_files = { "config.json", "genai_config.json", @@ -176,7 +226,9 @@ def _assert_smoke_flows(self, tmp_path: Path): with self.subTest(model_id=model_id): config_path, test_model_dir, run_output_dir = _run_documented_test_model_smoke_flow(tmp_path, model_id) assert config_path.exists() - assert self._list_relative_files(test_model_dir) == expected_test_model_files + assert ( + self._list_relative_files(test_model_dir) - optional_test_model_files == expected_test_model_files + ) run_output_files = self._list_relative_files(run_output_dir) assert expected_run_output_files.issubset(run_output_files) self._assert_file_size_below_limit(test_model_dir / "model.safetensors") @@ -211,14 +263,13 @@ def _assert_discrepancy(self, tmp_path: Path): self.fail(f"Unknown exporter: {exporter!r}") @staticmethod - def _run_discrepancy_with_test(config_path: Path, test_model_dir: Path, run_output_dir: Path): + def _run_discrepancy_with_test(config_path: Path, run_output_dir: Path): _run_cli_main( [ "run", "--config", str(config_path), "--test", - str(test_model_dir), "--output_path", str(run_output_dir), ] @@ -228,10 +279,9 @@ def _assert_discrepancy_model_builder(self, tmp_path: Path, model_id: str): model_name = model_id.replace("/", "--") model_path = tmp_path / "models" / f"{model_name}-disc" config_output_dir = tmp_path / f"{model_name}-disc-cfg" - test_model_dir = tmp_path / f"{model_name}-disc-test-model" run_output_dir = tmp_path / f"{model_name}-disc-run" - _save_local_tiny_llama(model_path) + _save_local_tiny_model(model_id, model_path) _run_cli_main( [ "optimize", @@ -254,13 +304,13 @@ def _assert_discrepancy_model_builder(self, tmp_path: Path, model_id: str): _set_offline_gptq_data_config(config_path) # Run with --test; OnnxDiscrepancyCheck is auto-injected and reports discrepancy metrics - self._run_discrepancy_with_test(config_path, test_model_dir, run_output_dir) + 
self._run_discrepancy_with_test(config_path, run_output_dir) def _assert_discrepancy_mobius(self, tmp_path: Path, model_id: str): model_name = model_id.replace("/", "--") model_path = tmp_path / "models" / f"{model_name}-mobius-disc" - test_model_dir = tmp_path / f"{model_name}-mobius-disc-test-model" run_output_dir = tmp_path / f"{model_name}-mobius-disc-run" + test_model_dir = run_output_dir / "reference_hf_model" _save_local_tiny_llama(model_path) @@ -292,12 +342,11 @@ def _assert_discrepancy_mobius(self, tmp_path: Path, model_id: str): config_path = tmp_path / f"{model_name}-mobius-disc-config.json" config_path.write_text(json.dumps(run_config, indent=2)) - self._run_discrepancy_with_test(config_path, test_model_dir, run_output_dir) + self._run_discrepancy_with_test(config_path, run_output_dir) def _assert_discrepancy_torch_export(self, tmp_path: Path, model_id: str): model_name = model_id.replace("/", "--") model_path = tmp_path / "models" / f"{model_name}-torch-disc" - test_model_dir = tmp_path / f"{model_name}-torch-disc-test-model" run_output_dir = tmp_path / f"{model_name}-torch-disc-run" _save_local_tiny_llama(model_path) @@ -325,7 +374,7 @@ def _assert_discrepancy_torch_export(self, tmp_path: Path, model_id: str): } config_path = tmp_path / f"{model_name}-torch-disc-config.json" config_path.write_text(json.dumps(run_config, indent=2)) - self._run_discrepancy_with_test(config_path, test_model_dir, run_output_dir) + self._run_discrepancy_with_test(config_path, run_output_dir) def _assert_file_size_below_limit(self, path: Path): assert path.exists() diff --git a/test/common/test_hf.py b/test/common/test_hf.py index 9f1ad736e..4204606e0 100644 --- a/test/common/test_hf.py +++ b/test/common/test_hf.py @@ -54,6 +54,34 @@ def test_load_model_from_task_test_model_config(model_config, hidden_layers_attr assert getattr(mock_model_class.from_config.call_args.args[0], hidden_layers_attr) == 2 +def 
test_load_model_from_task_test_model_config_saves_tokenizer(tmp_path): + """The reference tokenizer should be saved into the test model directory.""" + model_config = BertConfig(num_hidden_layers=12) # pylint: disable=unexpected-keyword-arg + created_model = MagicMock() + test_model_path = tmp_path / "saved_test_model" + mock_tokenizer = MagicMock() + + with ( + patch("transformers.pipelines.check_task") as mock_check_task, + patch("olive.common.hf.utils.from_pretrained", return_value=model_config), + patch("olive.common.hf.utils.get_tokenizer", return_value=mock_tokenizer) as mock_get_tokenizer, + patch("olive.common.hf.utils.save_tokenizer") as mock_save_tokenizer, + ): + mock_model_class = MagicMock() + mock_model_class.from_config.return_value = created_model + mock_check_task.return_value = ("text-classification", {"pt": (mock_model_class,)}, None) + + load_model_from_task( + "text-classification", + "dummy-model", + test_model_config={"num_hidden_layers": 2}, + test_model_path=str(test_model_path), + ) + + mock_get_tokenizer.assert_called_once_with("dummy-model") + mock_save_tokenizer.assert_called_once_with(mock_tokenizer, str(test_model_path)) + + def test_load_model_from_task_test_model_config_fails_without_fallback(): model_config = BertConfig(num_hidden_layers=12) # pylint: disable=unexpected-keyword-arg @@ -106,6 +134,8 @@ def test_load_model_from_task_test_model_config_reuses_saved_model(tmp_path): test_model_path.mkdir() (test_model_path / "config.json").write_text("{}") (test_model_path / TEST_MODEL_MARKER_FILE).write_text(json.dumps({"type": "olive_hf_test_model"})) + # Add a dummy weight shard so the weights-present branch is exercised. 
+ (test_model_path / "model.safetensors").write_bytes(b"dummy") loaded_model = MagicMock(spec=torch.nn.Module) with ( @@ -129,6 +159,37 @@ def test_load_model_from_task_test_model_config_reuses_saved_model(tmp_path): assert mock_from_pretrained.call_args_list[1].args[1] == str(test_model_path) +def test_load_model_from_task_test_model_config_completes_config_only_dir(tmp_path): + """A config-only test model dir (created during --dry_run) should be completed with weights.""" + model_config = BertConfig(num_hidden_layers=12) # pylint: disable=unexpected-keyword-arg + test_model_path = tmp_path / "config_only_test_model" + test_model_path.mkdir() + # Simulate a config-only dir created by save_test_model_config during --dry_run: + # has config.json + marker but no weight shards. + (test_model_path / "config.json").write_text("{}") + (test_model_path / TEST_MODEL_MARKER_FILE).write_text(json.dumps({"type": "olive_hf_test_model"})) + created_model = MagicMock() + + with ( + patch("transformers.pipelines.check_task") as mock_check_task, + patch("olive.common.hf.utils.from_pretrained", return_value=model_config), + ): + mock_model_class = MagicMock() + mock_model_class.from_config.return_value = created_model + mock_check_task.return_value = ("text-classification", {"pt": (mock_model_class,)}, None) + + model = load_model_from_task( + "text-classification", + "dummy-model", + test_model_config={"num_hidden_layers": 2}, + test_model_path=str(test_model_path), + ) + + assert model is created_model + mock_model_class.from_config.assert_called_once() + created_model.save_pretrained.assert_called_once_with(str(test_model_path)) + + def test_load_model_from_task_test_model_config_rejects_non_test_model_dir(tmp_path): model_config = BertConfig(num_hidden_layers=12) # pylint: disable=unexpected-keyword-arg test_model_path = tmp_path / "saved_test_model" diff --git a/test/passes/onnx/test_discrepancy_check.py b/test/passes/onnx/test_discrepancy_check.py index 8bb53a966..d19e437b7 
100644 --- a/test/passes/onnx/test_discrepancy_check.py +++ b/test/passes/onnx/test_discrepancy_check.py @@ -7,6 +7,8 @@ import sys from unittest.mock import MagicMock, patch +import pytest + from olive.passes.onnx.discrepancy_check import _longest_common_token_sequence @@ -60,6 +62,7 @@ def test_compare_generation_returns_common_prefix_length(self): config.genai_model_path = "mock_genai_model" config.generate_prompt = "Hello world" config.generate_max_new_tokens = 10 + config.first_n_tokens_timed = 5 # Mock transformers tokenizer and model mock_tokenizer = MagicMock() @@ -103,11 +106,24 @@ def get_next_tokens_side_effect(): patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer), ): pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck) - result = pass_instance.compare_generation(config, mock_ref_model) + result = pass_instance.compare_generation( + config, mock_ref_model, ref_model_path=config.reference_model_path + ) mock_generator.append_tokens.assert_called_once_with([[1, 2, 3]]) - # Common prefix: [1, 2, 3, 10, 11] = 5 tokens before divergence - assert result == 5 + # Generated-only common prefix: transformers [10, 11, 12, 13] vs genai [10, 11, 99, 99] + # matches on [10, 11] = 2 tokens before divergence (shared prompt is excluded). + assert result["longest_common_token_sequence"] == 2 + # Latency metrics are exposed for both transformers and ONNX Runtime GenAI. 
+ assert result["first_n_tokens_timed"] == 5 + for key in ( + "transformers_time_to_first_token_s", + "transformers_time_to_first_n_tokens_s", + ): + assert key in result + assert isinstance(result[key], float) + for key in ("genai_time_to_first_token_s", "genai_time_to_first_n_tokens_s"): + assert key in result def test_compare_generation_fully_matching(self): """Test when both outputs are identical.""" @@ -120,6 +136,7 @@ def test_compare_generation_fully_matching(self): config.genai_model_path = "mock_genai_model" config.generate_prompt = "Test" config.generate_max_new_tokens = 5 + config.first_n_tokens_timed = 5 mock_tokenizer = MagicMock() mock_tokenizer.return_value = MagicMock(input_ids=torch.tensor([[10, 20]])) @@ -158,11 +175,203 @@ def get_next_tokens_side_effect(): patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer), ): pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck) - result = pass_instance.compare_generation(config, mock_ref_model) + result = pass_instance.compare_generation( + config, mock_ref_model, ref_model_path=config.reference_model_path + ) mock_generator.append_tokens.assert_called_once_with([[10, 20]]) - # All 5 tokens match - assert result == 5 + # All 3 generated tokens match (shared prompt is excluded) + assert result["longest_common_token_sequence"] == 3 + assert result["first_n_tokens_timed"] == 5 + for key in ( + "transformers_time_to_first_token_s", + "transformers_time_to_first_n_tokens_s", + ): + assert key in result + assert isinstance(result[key], float) + assert "genai_time_to_first_token_s" in result + assert isinstance(result["genai_time_to_first_token_s"], float) + assert "genai_time_to_first_n_tokens_s" in result + assert result["genai_time_to_first_n_tokens_s"] is None or isinstance( + result["genai_time_to_first_n_tokens_s"], float + ) + + def test_compare_generation_with_zero_max_new_tokens(self): + """Test that latency metrics are skipped when max_new_tokens is zero.""" + 
import torch + + from olive.passes.onnx.discrepancy_check import OnnxDiscrepancyCheck + + config = MagicMock() + config.reference_model_path = "mock_model" + config.genai_model_path = "mock_genai_model" + config.generate_prompt = "Test" + config.generate_max_new_tokens = 0 + config.first_n_tokens_timed = 5 + + mock_tokenizer = MagicMock() + mock_tokenizer.return_value = MagicMock(input_ids=torch.tensor([[10, 20]])) + + mock_ref_model = MagicMock() + mock_ref_model.device = torch.device("cpu") + mock_ref_model.generate.return_value = torch.tensor([[10, 20]]) + + mock_og = MagicMock() + mock_og.Model.return_value = MagicMock() + mock_genai_tokenizer = MagicMock() + mock_og.Tokenizer.return_value = mock_genai_tokenizer + mock_genai_tokenizer.encode.return_value = [10, 20] + mock_og.GeneratorParams.return_value = MagicMock() + + mock_generator = MagicMock() + mock_generator.is_done.return_value = True + mock_og.Generator.return_value = mock_generator + + with ( + patch.dict(sys.modules, {"onnxruntime_genai": mock_og}), + patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer), + ): + pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck) + result = pass_instance.compare_generation( + config, mock_ref_model, ref_model_path=config.reference_model_path + ) + + assert mock_ref_model.generate.call_count == 1 + assert mock_ref_model.generate.call_args.kwargs["max_new_tokens"] == 0 + assert result["first_n_tokens_timed"] == 0 + assert result["transformers_time_to_first_token_s"] is None + assert result["transformers_time_to_first_n_tokens_s"] is None + + def test_compare_generation_reports_first_token_match(self): + """first_token_matches is True when both first generated tokens are identical.""" + import torch + + from olive.passes.onnx.discrepancy_check import OnnxDiscrepancyCheck + + config = MagicMock() + config.reference_model_path = "mock_model" + config.genai_model_path = None + config.generate_prompt = "Hello world" + 
config.generate_max_new_tokens = 10 + config.first_n_tokens_timed = 5 + + mock_tokenizer = MagicMock() + mock_tokenizer.return_value = MagicMock(input_ids=torch.tensor([[1, 2, 3]])) + + mock_ref_model = MagicMock() + mock_ref_model.device = torch.device("cpu") + # First generated token (after the 3-token prompt) is 10. + mock_ref_model.generate.return_value = torch.tensor([[1, 2, 3, 10, 11, 12]]) + + mock_og = MagicMock() + mock_og.Model.return_value = MagicMock() + mock_genai_tokenizer = MagicMock() + mock_og.Tokenizer.return_value = mock_genai_tokenizer + mock_genai_tokenizer.encode.return_value = [1, 2, 3] + mock_og.GeneratorParams.return_value = MagicMock() + + mock_generator = MagicMock() + # GenAI first generated token is also 10 -> match. + genai_new_tokens = [10, 99, 99] + call_count = [0] + + def is_done_side_effect(): + return call_count[0] >= len(genai_new_tokens) + + def get_next_tokens_side_effect(): + token = genai_new_tokens[call_count[0]] + call_count[0] += 1 + return [token] + + mock_generator.is_done = is_done_side_effect + mock_generator.get_next_tokens = get_next_tokens_side_effect + mock_og.Generator.return_value = mock_generator + + with ( + patch.dict(sys.modules, {"onnxruntime_genai": mock_og}), + patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer), + ): + pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck) + result = pass_instance.compare_generation( + config, + mock_ref_model, + ref_model_path=config.reference_model_path, + genai_model_path="explicit_genai_dir", + max_new_tokens=20, + first_n=5, + ) + + # The explicit genai_model_path override is used for og.Model. + mock_og.Model.assert_called_once_with("explicit_genai_dir") + # The max_new_tokens override is forwarded to transformers.generate. 
+ assert mock_ref_model.generate.call_args.kwargs["max_new_tokens"] == 20 + assert result["transformers_first_token"] == 10 + assert result["genai_first_token"] == 10 + assert result["first_token_matches"] is True + # transformers generated [10, 11, 12] and genai [10, 99, 99] -> second tokens differ. + assert result["transformers_second_token"] == 11 + assert result["genai_second_token"] == 99 + assert result["second_token_matches"] is False + + def test_compare_generation_reports_first_token_mismatch(self): + """first_token_matches is False when the first generated tokens differ.""" + import torch + + from olive.passes.onnx.discrepancy_check import OnnxDiscrepancyCheck + + config = MagicMock() + config.reference_model_path = "mock_model" + config.genai_model_path = "mock_genai_model" + config.generate_prompt = "Hello" + config.generate_max_new_tokens = 10 + config.first_n_tokens_timed = 5 + + mock_tokenizer = MagicMock() + mock_tokenizer.return_value = MagicMock(input_ids=torch.tensor([[1, 2]])) + + mock_ref_model = MagicMock() + mock_ref_model.device = torch.device("cpu") + mock_ref_model.generate.return_value = torch.tensor([[1, 2, 30, 31]]) + + mock_og = MagicMock() + mock_og.Model.return_value = MagicMock() + mock_genai_tokenizer = MagicMock() + mock_og.Tokenizer.return_value = mock_genai_tokenizer + mock_genai_tokenizer.encode.return_value = [1, 2] + mock_og.GeneratorParams.return_value = MagicMock() + + mock_generator = MagicMock() + genai_new_tokens = [40, 41] + call_count = [0] + + def is_done_side_effect(): + return call_count[0] >= len(genai_new_tokens) + + def get_next_tokens_side_effect(): + token = genai_new_tokens[call_count[0]] + call_count[0] += 1 + return [token] + + mock_generator.is_done = is_done_side_effect + mock_generator.get_next_tokens = get_next_tokens_side_effect + mock_og.Generator.return_value = mock_generator + + with ( + patch.dict(sys.modules, {"onnxruntime_genai": mock_og}), + patch("transformers.AutoTokenizer.from_pretrained", 
return_value=mock_tokenizer), + ): + pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck) + result = pass_instance.compare_generation( + config, mock_ref_model, ref_model_path=config.reference_model_path + ) + + assert result["transformers_first_token"] == 30 + assert result["genai_first_token"] == 40 + assert result["first_token_matches"] is False + # transformers generated [30, 31] and genai [40, 41] -> second tokens differ too. + assert result["transformers_second_token"] == 31 + assert result["genai_second_token"] == 41 + assert result["second_token_matches"] is False class TestWeightDtypeInference: @@ -313,3 +522,270 @@ def test_measure_speedup_skips_when_timing_iterations_is_zero(self): assert result is None ref_model.assert_not_called() session.run.assert_not_called() + + def test_measure_speedup_returns_latencies_and_speedup(self): + import torch + + from olive.passes.onnx.discrepancy_check import OnnxDiscrepancyCheck + + pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck) + ref_model = MagicMock() + session = MagicMock() + input_data = {"input_ids": torch.tensor([[1, 2, 3]], dtype=torch.int64)} + dataloader = [(input_data, None)] + + with ( + patch("olive.common.utils.format_data", return_value={"input_ids": [1, 2, 3]}), + patch("olive.passes.onnx.discrepancy_check.time.perf_counter", side_effect=[10.0, 14.0, 20.0, 22.0]), + ): + result = pass_instance._measure_speedup( + ref_model=ref_model, + session=session, + dataloader=dataloader, + io_config=MagicMock(), + torch_device=torch.device("cpu"), + warmup_iterations=1, + timing_iterations=2, + ) + + assert result == (2.0, 1.0, 2.0) + assert ref_model.call_count == 3 + assert session.run.call_count == 3 + + +class TestCompareLlamaCpp: + """Unit tests for OnnxDiscrepancyCheck.compare_llama_cpp.""" + + def _make_config(self): + config = MagicMock() + config.reference_model_path = "mock_model" + config.generate_prompt = "Hello world" + config.generate_max_new_tokens = 10 + 
config.first_n_tokens_timed = 5 + config.llama_cpp_env_path = "/mock/llama_env" + return config + + def _make_hf_config(self): + hf_cfg = MagicMock() + hf_cfg.max_position_embeddings = 64 + hf_cfg.hidden_size = 128 + hf_cfg.num_hidden_layers = 2 + hf_cfg.intermediate_size = 256 + hf_cfg.num_attention_heads = 8 + hf_cfg.num_key_value_heads = 8 + hf_cfg.rms_norm_eps = 1e-5 + hf_cfg.vocab_size = 32 + return hf_cfg + + def test_get_llama_env_python_posix(self, tmp_path): + """Test that the POSIX Python path is returned when it exists.""" + from olive.passes.onnx.discrepancy_check import OnnxDiscrepancyCheck + + (tmp_path / "bin").mkdir() + python = tmp_path / "bin" / "python" + python.touch() + + result = OnnxDiscrepancyCheck._get_llama_env_python(str(tmp_path)) + assert result == str(python) + + def test_get_llama_env_python_missing_raises(self, tmp_path): + """Test that a RuntimeError is raised when no interpreter is found.""" + from olive.passes.onnx.discrepancy_check import OnnxDiscrepancyCheck + + with pytest.raises(RuntimeError, match="llama_env"): + OnnxDiscrepancyCheck._get_llama_env_python(str(tmp_path)) + + def test_compare_llama_cpp_returns_expected_metrics(self, tmp_path): + """Test that compare_llama_cpp returns all expected keys and correct values.""" + import json + + import torch + + from olive.passes.onnx.discrepancy_check import OnnxDiscrepancyCheck + + config = self._make_config() + + mock_ref_model = MagicMock() + mock_ref_model.device = torch.device("cpu") + # First token from transformers: 42 + mock_ref_model.generate.return_value = torch.tensor([[1, 2, 3, 42]]) + mock_ref_model.state_dict.return_value = {} + + llama_output = { + "first_token_id": 42, + "generated_tokens": [42, 43, 44, 45, 46], + "ttft": 0.05, + "ttfn": 0.25, + "total_time": 0.50, + } + + mock_proc = MagicMock() + mock_proc.stdout = json.dumps(llama_output) + + mock_tokenizer = MagicMock() + mock_tokenizer.return_value = MagicMock( + input_ids=torch.tensor([[1, 2, 3]]), + 
__getitem__=lambda self, key: torch.tensor([[1, 2, 3]]) if key == "input_ids" else None, + ) + mock_tokenizer.return_value.__getitem__ = lambda self, k: ( + torch.tensor([[1, 2, 3]]) if k == "input_ids" else None + ) + # tokenizer(prompt) returns a dict with "input_ids" as a list + encoded = MagicMock() + encoded.__getitem__ = MagicMock(side_effect=lambda k: torch.tensor([[1, 2, 3]]) if k == "input_ids" else None) + mock_tokenizer.return_value = encoded + mock_tokenizer.get_vocab = MagicMock(return_value={}) + + with ( + patch.object(OnnxDiscrepancyCheck, "_get_llama_env_python", return_value="/mock/llama_env/bin/python"), + patch.object( + OnnxDiscrepancyCheck, "_get_convert_script", return_value="/mock/llama_env/convert_hf_to_gguf.py" + ), + patch("subprocess.run", return_value=mock_proc), + patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer), + patch("transformers.AutoConfig.from_pretrained", return_value=self._make_hf_config()), + patch("numpy.savez"), + ): + pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck) + result = pass_instance.compare_llama_cpp( + config, + mock_ref_model, + output_dir=str(tmp_path), + pytorch_latency_s=0.10, + onnx_latency_s=0.05, + ref_model_path=config.reference_model_path, + ) + + expected_keys = { + "llama_cpp_first_token_id", + "llama_cpp_pytorch_first_token_id", + "llama_cpp_first_token_matches_pytorch", + "llama_cpp_second_token_id", + "llama_cpp_pytorch_second_token_id", + "llama_cpp_second_token_matches_pytorch", + "llama_cpp_longest_common_token_sequence", + "llama_cpp_ttft_s", + "llama_cpp_ttfn_s", + "llama_cpp_total_time_s", + "llama_cpp_speedup_vs_pytorch", + "llama_cpp_speedup_vs_onnx", + } + assert expected_keys <= set(result.keys()) + + assert result["llama_cpp_first_token_id"] == 42 + # transformers generated only one token ([42]), so there is no reference second token. 
+ assert result["llama_cpp_pytorch_second_token_id"] is None + assert result["llama_cpp_second_token_id"] == 43 + assert result["llama_cpp_second_token_matches_pytorch"] is False + # Generated-only comparison: transformers generated [42] vs llama.cpp [42, 43, ...] = 1 match. + assert result["llama_cpp_longest_common_token_sequence"] == 1 + assert result["llama_cpp_ttft_s"] == pytest.approx(0.05) + assert result["llama_cpp_ttfn_s"] == pytest.approx(0.25) + assert result["llama_cpp_total_time_s"] == pytest.approx(0.50) + # speedup = pytorch_latency / llama_ttft = 0.10 / 0.05 = 2.0 + assert result["llama_cpp_speedup_vs_pytorch"] == pytest.approx(2.0) + # speedup = onnx_latency / llama_ttft = 0.05 / 0.05 = 1.0 + assert result["llama_cpp_speedup_vs_onnx"] == pytest.approx(1.0) + + def test_compare_llama_cpp_no_latency_baselines(self, tmp_path): + """Speedup fields are None when pytorch/onnx latencies are not provided.""" + import json + + import torch + + from olive.passes.onnx.discrepancy_check import OnnxDiscrepancyCheck + + config = self._make_config() + + mock_ref_model = MagicMock() + mock_ref_model.device = torch.device("cpu") + mock_ref_model.generate.return_value = torch.tensor([[1, 2, 3, 7]]) + mock_ref_model.state_dict.return_value = {} + + llama_output = { + "first_token_id": 7, + "generated_tokens": [7, 8], + "ttft": 0.10, + "ttfn": None, + "total_time": 0.20, + } + + mock_proc = MagicMock() + mock_proc.stdout = json.dumps(llama_output) + + encoded = MagicMock() + encoded.__getitem__ = MagicMock(side_effect=lambda k: torch.tensor([[1, 2, 3]]) if k == "input_ids" else None) + mock_tokenizer = MagicMock() + mock_tokenizer.return_value = encoded + mock_tokenizer.get_vocab = MagicMock(return_value={}) + + with ( + patch.object(OnnxDiscrepancyCheck, "_get_llama_env_python", return_value="/mock/llama_env/bin/python"), + patch.object( + OnnxDiscrepancyCheck, "_get_convert_script", return_value="/mock/llama_env/convert_hf_to_gguf.py" + ), + patch("subprocess.run", 
return_value=mock_proc), + patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer), + patch("transformers.AutoConfig.from_pretrained", return_value=self._make_hf_config()), + patch("numpy.savez"), + ): + pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck) + result = pass_instance.compare_llama_cpp( + config, mock_ref_model, output_dir=str(tmp_path), ref_model_path=config.reference_model_path + ) + + assert result["llama_cpp_speedup_vs_pytorch"] is None + assert result["llama_cpp_speedup_vs_onnx"] is None + assert result["llama_cpp_first_token_id"] == 7 + assert result["llama_cpp_first_token_matches_pytorch"] is True + + def test_compare_llama_cpp_uses_preconverted_gguf(self, tmp_path): + import json + + import torch + + from olive.passes.onnx.discrepancy_check import OnnxDiscrepancyCheck + + config = self._make_config() + gguf_path = tmp_path / "prebuilt.gguf" + gguf_path.write_text("ok") + + mock_ref_model = MagicMock() + mock_ref_model.device = torch.device("cpu") + mock_ref_model.generate.return_value = torch.tensor([[1, 2, 3, 7]]) + + llama_output = { + "first_token_id": 7, + "generated_tokens": [7, 8], + "ttft": 0.10, + "ttfn": None, + "total_time": 0.20, + } + + mock_proc = MagicMock() + mock_proc.stdout = json.dumps(llama_output) + + encoded = MagicMock() + encoded.__getitem__ = MagicMock(side_effect=lambda k: torch.tensor([[1, 2, 3]]) if k == "input_ids" else None) + mock_tokenizer = MagicMock() + mock_tokenizer.return_value = encoded + mock_tokenizer.get_vocab = MagicMock(return_value={}) + + with ( + patch.object(OnnxDiscrepancyCheck, "_get_llama_env_python", return_value="/mock/llama_env/bin/python"), + patch.object(OnnxDiscrepancyCheck, "_get_convert_script") as mock_convert_script, + patch("subprocess.run", return_value=mock_proc) as mock_subprocess_run, + patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer), + ): + pass_instance = OnnxDiscrepancyCheck.__new__(OnnxDiscrepancyCheck) + 
result = pass_instance.compare_llama_cpp( + config, + mock_ref_model, + output_dir=str(tmp_path), + ref_model_path=config.reference_model_path, + preconverted_gguf_path=str(gguf_path), + ) + + assert result["llama_cpp_first_token_id"] == 7 + mock_convert_script.assert_not_called() + assert mock_subprocess_run.call_count == 1 diff --git a/test/passes/onnx/test_model_builder.py b/test/passes/onnx/test_model_builder.py index 0f71535db..634cc762a 100644 --- a/test/passes/onnx/test_model_builder.py +++ b/test/passes/onnx/test_model_builder.py @@ -187,6 +187,74 @@ def fake_create_model(*_, **kwargs): assert fake_builder.create_model.call_args.kwargs["input_path"] == str(test_model_path) +def test_model_builder_materializes_weights_for_config_only_test_dir(tmp_path): + """A config-only test-model dir (no weights) must still trigger weight materialization. + + Otherwise the model builder would initialize its own weights and the ONNX model would not + match the reference model that OnnxDiscrepancyCheck later loads from the same directory. + """ + from olive.common.hf.utils import TEST_MODEL_MARKER_FILE + + test_model_path = tmp_path / "reference_hf_model" + output_folder = tmp_path / "output_model" + + # Pre-create a config-only Olive test-model directory: marker + config.json, but no weights. 
+ test_model_path.mkdir(parents=True, exist_ok=True) + (test_model_path / "config.json").write_text("{}") + (test_model_path / TEST_MODEL_MARKER_FILE).write_text( + json.dumps({"type": "olive_hf_test_model", "test_model_config": {}}) + ) + + mock_cfg = MagicMock() + mock_cfg.to_dict.return_value = {} + with patch.object(HfModelHandler, "get_hf_model_config", return_value=mock_cfg): + input_model = HfModelHandler( + model_path=TINY_RANDOM_LLAMA_MODEL_ID, + test_model_config={"hidden_layers": 2}, + test_model_path=str(test_model_path), + ) + + def materialize_weights(*args, **kwargs): + (test_model_path / "model.safetensors").write_text("weights") + return MagicMock() + + def fake_create_model(*_, **kwargs): + output_dir = Path(kwargs["output_dir"]) + (output_dir / kwargs["filename"]).write_text("dummy onnx file") + (output_dir / "genai_config.json").write_text("{}") + + fake_builder = types.ModuleType("onnxruntime_genai.models.builder") + fake_builder.create_model = MagicMock(side_effect=fake_create_model) + fake_models = types.ModuleType("onnxruntime_genai.models") + fake_models.builder = fake_builder + fake_ort_genai = types.ModuleType("onnxruntime_genai") + fake_ort_genai.models = fake_models + fake_ort_genai.__version__ = "0.0.0" + + p = create_pass_from_dict(ModelBuilder, {"precision": "fp32"}, disable_search=True) + + with ( + patch.object(ModelBuilder, "maybe_patch_quant"), + patch.dict( + sys.modules, + { + "onnxruntime_genai": fake_ort_genai, + "onnxruntime_genai.models": fake_models, + "onnxruntime_genai.models.builder": fake_builder, + }, + ), + patch.object(input_model, "load_model", side_effect=materialize_weights) as mock_load_model, + patch.object(input_model, "save_metadata", return_value=[]), + ): + output_model = p.run(input_model, output_folder) + + assert isinstance(output_model, ONNXModelHandler) + # Weights were missing, so load_model must be called to persist them into the shared dir. 
+ assert mock_load_model.call_count == 1 + # The ONNX model is built from the shared test-model directory (same weights as the reference). + assert fake_builder.create_model.call_args.kwargs["input_path"] == str(test_model_path.resolve()) + + def test_model_builder_apply_annotations_on_single_file_fallback(tmp_path, monkeypatch): def fake_create_model( model_name, input_path, output_dir, precision, execution_provider, cache_dir, filename, **kwargs diff --git a/test/passes/pytorch/test_convert_hf_to_gguf.py b/test/passes/pytorch/test_convert_hf_to_gguf.py new file mode 100644 index 000000000..e8ed5d8eb --- /dev/null +++ b/test/passes/pytorch/test_convert_hf_to_gguf.py @@ -0,0 +1,66 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +# pylint: disable=protected-access + +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import patch + +from olive.passes.pytorch.convert_hf_to_gguf import ConvertHfToGGUF + + +def test_convert_hf_to_gguf_skips_when_missing_source(tmp_path): + pass_instance = ConvertHfToGGUF.__new__(ConvertHfToGGUF) + model = SimpleNamespace(test_model_path=str(tmp_path / "missing"), model_attributes=None) + config = SimpleNamespace( + llama_cpp_env_path=str(tmp_path / "llama_env"), + reference_model_path=str(tmp_path / "missing"), + gguf_file_name="model.gguf", + ) + + result = pass_instance._run_for_config(model, config, str(tmp_path / "out")) + assert result is model + + +def test_convert_hf_to_gguf_uses_existing_gguf(tmp_path): + source = tmp_path / "test_model" + source.mkdir(parents=True, exist_ok=True) + gguf_path = source / "model.gguf" + gguf_path.write_text("ok") + + pass_instance = ConvertHfToGGUF.__new__(ConvertHfToGGUF) + model = SimpleNamespace(test_model_path=str(source), model_attributes={}) + config = 
SimpleNamespace( + llama_cpp_env_path=str(tmp_path / "llama_env"), + reference_model_path=str(source), + gguf_file_name="model.gguf", + ) + + result = pass_instance._run_for_config(model, config, str(tmp_path / "out")) + assert result.model_attributes["reference_gguf_model_path"] == str(gguf_path) + + +def test_convert_hf_to_gguf_runs_conversion(tmp_path): + source = tmp_path / "test_model" + source.mkdir(parents=True, exist_ok=True) + env = tmp_path / "llama_env" + (env / "bin").mkdir(parents=True, exist_ok=True) + (env / "bin" / "python").write_text("") + (env / "convert_hf_to_gguf.py").write_text("") + (env / "conversion").mkdir(parents=True, exist_ok=True) + + pass_instance = ConvertHfToGGUF.__new__(ConvertHfToGGUF) + model = SimpleNamespace(test_model_path=str(source), model_attributes={}) + config = SimpleNamespace( + llama_cpp_env_path=str(env), + reference_model_path=str(source), + gguf_file_name="model.gguf", + ) + + with patch("olive.passes.pytorch.convert_hf_to_gguf.subprocess.run") as mock_run: + result = pass_instance._run_for_config(model, config, str(tmp_path / "out")) + + assert mock_run.call_count == 1 + assert Path(result.model_attributes["reference_gguf_model_path"]).name == "model.gguf"