diff --git a/speed-bench/README.md b/speed-bench/README.md index 32075fe18..2d0844c32 100644 --- a/speed-bench/README.md +++ b/speed-bench/README.md @@ -17,6 +17,7 @@ Run `ds4-bench` as: Provide PR including your numbers if your hardware was not already tested. Call the benchmark csv file something like `m3_max.csv` or alike, so that it is clear what hardware was used for the benchmark. +Record the machine, backend, model, and run parameters in `benchmarks.json`. To generate an SVG graph from a CSV file: @@ -26,3 +27,26 @@ python3 speed-bench/plot_speed.py speed-bench/m3_max.csv --title "M3 Max t/s" The script uses only the Python standard library. By default it writes a file next to the CSV using the `_ts.svg` suffix, such as `speed-bench/m3_max_ts.svg`. + + +## Benchmark Summary + +Generated from the CSV files in this directory by `python3 speed-bench/update_summary.py`. + +`@ 32k ctx` means the row where `ctx_tokens` is `32768`. + +### DeepSeek V4 Flash q2 + +| Hardware | Best gen (t/s) | Gen @ 32k ctx (t/s) | Avg gen (t/s) | Best prefill (t/s) | Prefill @ 32k ctx (t/s) | Avg prefill (t/s) | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| Apple M4 Max | 26.8 | 24.5 | 24.6 | 344 | 248 | 250 | +| Apple M2 Ultra | 23.2 | 21.9 | 21.9 | 411 | 326 | 325 | +| NVIDIA DGX Spark / GB10 | 14.2 | 13.0 | 13.1 | 403 | 346 | 343 | + +### DeepSeek V4 PRO q2 + +| Hardware | Best gen (t/s) | Gen @ 32k ctx (t/s) | Avg gen (t/s) | Best prefill (t/s) | Prefill @ 32k ctx (t/s) | Avg prefill (t/s) | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| Apple M3 Ultra | 12.4 | 9.56 | 9.90 | 183 | 139 | 149 | + + diff --git a/speed-bench/benchmarks.json b/speed-bench/benchmarks.json new file mode 100644 index 000000000..5f93bc2a1 --- /dev/null +++ b/speed-bench/benchmarks.json @@ -0,0 +1,57 @@ +{ + "schema_version": 1, + "benchmarks": [ + { + "csv": "gb10.csv", + "hardware": "NVIDIA DGX Spark / GB10", + "backend": "CUDA", + "model": "DeepSeek V4 Flash", + "quant": "q2", + "model_label": "DeepSeek V4 Flash q2", + "prompt_file": "speed-bench/promessi_sposi.txt", + "ctx_start": 2048, + "ctx_max": 65536, + "step_incr": 2048, + "gen_tokens": 128 + }, + { + "csv": "m2_ultra.csv", + "hardware": "Apple M2 Ultra", + "backend": "Metal", + "model": "DeepSeek V4 Flash", + "quant": "q2", + "model_label": "DeepSeek V4 Flash q2", + "prompt_file": "speed-bench/promessi_sposi.txt", + "ctx_start": 2048, + "ctx_max": 65536, + "step_incr": 2048, + "gen_tokens": 128 + }, + { + "csv": "m4_max.csv", + "hardware": "Apple M4 Max", + "backend": "Metal", + "model": "DeepSeek V4 Flash", + "quant": "q2", + "model_label": "DeepSeek V4 Flash q2", + "prompt_file": "speed-bench/promessi_sposi.txt", + "ctx_start": 2048, + "ctx_max": 65536, + "step_incr": 2048, + "gen_tokens": 128 + }, + { + "csv": "pro_model_m3_ultra.csv", + "hardware": "Apple M3 Ultra", + "backend": "Metal", + "model": "DeepSeek V4 PRO", + "quant": "q2", + "model_label": "DeepSeek V4 PRO q2", + "prompt_file": "speed-bench/promessi_sposi.txt", + "ctx_start": 2048, + "ctx_max": 32768, + "step_incr": 2048, + "gen_tokens": 128 + } + ] +} diff --git a/speed-bench/update_summary.py b/speed-bench/update_summary.py new file mode 100644 index 000000000..6639e5ab1 --- /dev/null +++ b/speed-bench/update_summary.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 +"""Update the generated benchmark summary in speed-bench/README.md.""" + +import csv +import json +from dataclasses import dataclass +from pathlib import Path + + +BEGIN_MARKER = "" +END_MARKER = "" +README = Path(__file__).with_name("README.md") +BENCH_DIR = Path(__file__).resolve().parent +METADATA = BENCH_DIR / "benchmarks.json" +REQUIRED_COLUMNS = {"ctx_tokens", "prefill_tps", "gen_tps"} +TARGET_CTX = 32768 + + +@dataclass(frozen=True) +class BenchmarkMetadata: + csv: str + hardware: str + backend: str + model: str + quant: str + model_label: str + prompt_file: str + ctx_start: int + ctx_max: int + step_incr: int + gen_tokens: int + + +@dataclass +class BenchSummary: + hardware: str + model: str + best_gen: float + gen_at_target_ctx: float | None + avg_gen: float + best_prefill: float + prefill_at_target_ctx: float | None + avg_prefill: float + + +def read_metadata() -> dict[str, BenchmarkMetadata]: + try: + data = json.loads(METADATA.read_text(encoding="utf-8")) + except FileNotFoundError: + raise SystemExit(f"{METADATA}: metadata file is required") from None + except json.JSONDecodeError as exc: + raise SystemExit(f"{METADATA}: invalid JSON: {exc}") from None + + try: + benchmarks = data["benchmarks"] + except (KeyError, TypeError): + raise SystemExit(f"{METADATA}: expected a benchmarks list") from None + if not isinstance(benchmarks, list): + raise SystemExit(f"{METADATA}: expected a benchmarks list") + + by_csv: dict[str, BenchmarkMetadata] = {} + for item in benchmarks: + try: + metadata = BenchmarkMetadata(**item) + except TypeError as exc: + raise SystemExit(f"{METADATA}: invalid benchmark metadata: {exc}") from None + if metadata.csv in by_csv: + raise SystemExit(f"{METADATA}: duplicate benchmark metadata for {metadata.csv}") + by_csv[metadata.csv] = metadata + return by_csv + + +def fmt_tps(value: float | None) -> str: + if value is None: + return "n/a" + if abs(value) >= 100: + return f"{value:.0f}" + if abs(value) >= 10: + return f"{value:.1f}" + return f"{value:.2f}" + + +def read_summary(path: Path, metadata: BenchmarkMetadata) -> BenchSummary: + rows = [] + with path.open("r", encoding="utf-8-sig", newline="") as fp: + reader = csv.DictReader(fp) + missing = REQUIRED_COLUMNS.difference(reader.fieldnames or ()) + if missing: + missing_list = ", ".join(sorted(missing)) + raise SystemExit(f"{path}: missing CSV column(s): {missing_list}") + + for row in reader: + rows.append( + { + "ctx_tokens": int(row["ctx_tokens"]), + "prefill_tps": float(row["prefill_tps"]), + "gen_tps": float(row["gen_tps"]), + } + ) + + if not rows: + raise SystemExit(f"{path}: no benchmark rows") + + target_row = next((row for row in rows if row["ctx_tokens"] == TARGET_CTX), None) + return BenchSummary( + hardware=metadata.hardware, + model=metadata.model_label, + best_gen=max(row["gen_tps"] for row in rows), + gen_at_target_ctx=target_row["gen_tps"] if target_row else None, + avg_gen=sum(row["gen_tps"] for row in rows) / len(rows), + best_prefill=max(row["prefill_tps"] for row in rows), + prefill_at_target_ctx=target_row["prefill_tps"] if target_row else None, + avg_prefill=sum(row["prefill_tps"] for row in rows) / len(rows), + ) + + +def render_summary(summaries: list[BenchSummary]) -> str: + by_model = {} + for summary in summaries: + by_model.setdefault(summary.model, []).append(summary) + model_groups = sorted( + by_model.items(), + key=lambda item: max(summary.best_gen for summary in item[1]), + reverse=True, + ) + lines = [ + BEGIN_MARKER, + "## Benchmark Summary", + "", + "Generated from the CSV files in this directory by `python3 speed-bench/update_summary.py`.", + "", + f"`@ 32k ctx` means the row where `ctx_tokens` is `{TARGET_CTX}`.", + "", + ] + for model, model_summaries in model_groups: + lines.extend( + [ + f"### {model}", + "", + "| Hardware | Best gen (t/s) | Gen @ 32k ctx (t/s) | Avg gen (t/s) | Best prefill (t/s) | Prefill @ 32k ctx (t/s) | Avg prefill (t/s) |", + "| --- | ---: | ---: | ---: | ---: | ---: | ---: |", + ] + ) + for summary in sorted(model_summaries, key=lambda item: item.best_gen, reverse=True): + lines.append( + "| " + + " | ".join( + [ + summary.hardware, + fmt_tps(summary.best_gen), + fmt_tps(summary.gen_at_target_ctx), + fmt_tps(summary.avg_gen), + fmt_tps(summary.best_prefill), + fmt_tps(summary.prefill_at_target_ctx), + fmt_tps(summary.avg_prefill), + ] + ) + + " |" + ) + lines.append("") + lines.extend([END_MARKER, ""]) + return "\n".join(lines) + + +def replace_generated_section(readme: str, generated: str) -> str: + begin = readme.find(BEGIN_MARKER) + end = readme.find(END_MARKER) + if begin == -1 and end == -1: + return readme.rstrip() + "\n\n" + generated + if begin == -1 or end == -1 or end < begin: + raise SystemExit("README.md has mismatched generated summary markers") + end += len(END_MARKER) + return readme[:begin].rstrip() + "\n\n" + generated.rstrip() + readme[end:].rstrip() + "\n" + + +def main() -> None: + csv_paths = sorted(BENCH_DIR.glob("*.csv")) + if not csv_paths: + raise SystemExit(f"{BENCH_DIR}: no CSV files found") + metadata = read_metadata() + missing = [path.name for path in csv_paths if path.name not in metadata] + if missing: + missing_list = ", ".join(missing) + raise SystemExit(f"{METADATA}: missing metadata for CSV file(s): {missing_list}") + summaries = [read_summary(path, metadata[path.name]) for path in csv_paths] + generated = render_summary(summaries) + README.write_text(replace_generated_section(README.read_text(encoding="utf-8"), generated), encoding="utf-8") + + +if __name__ == "__main__": + main()