Question about metal gemm #3380

Lazarus-931 · 2026-04-06T15:20:55Z

Lazarus-931
Apr 6, 2026

I've written a fused attention kernel targeting M2 and I'm benchmarking against MLX's
scaled_dot_product_attention. After several rounds of optimization I'm still ~2x slower and I'm trying to understand what architectural choices explain the gap.

The Kernel

//
//  attn_2048_128.metal
//
//  Fused attention for seq=2048, d=128
//  2×2 SIMD layout: 2 row groups × 2 col groups
//  Q persistent; K/V share one buffer; BlockMMA for tiled GEMM

#include <metal_stdlib>
#include "params.h"
#include "tools/tile.h"
#include "../../ops/block_mma.h"
#include "types.h"


using namespace metal;


// Fused attention kernel with every size fixed at compile time (SEQ = 2048,
// D = 128). One threadgroup processes the 16 query rows starting at
// gid.y * 16 and walks the keys/values in KEY_TILES tiles of 128 keys,
// carrying a running row max (rmax), a running row sum (rsum) and an output
// accumulator (output) from tile to tile.
//
// Buffers, all indexed below as [row * D + col]:
//   Q, K, V  - inputs, read as half
//   O        - output, written as half
//   param    - not referenced anywhere in this body
// Only gid.y is used; gid.x is ignored.
// The strided copy loops (i += 128), the 4 * 8 scratch array and the
// simd_id / 2, simd_id % 2 decomposition are written for 128 threads
// (4 simdgroups) per threadgroup.
kernel void attn_2048_128(
    device const half* Q   [[buffer(0)]],
    device const half* K   [[buffer(1)]],
    device const half* V   [[buffer(2)]],
    device half* O         [[buffer(3)]],
    constant Params& param [[buffer(4)]],
    uint2 gid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_id [[simdgroup_index_in_threadgroup]],
    uint lane_id [[thread_index_in_simdgroup]])
{
    constexpr uint D = 128;
    constexpr uint SEQ = 2048;
    constexpr uint KEY_TILES = SEQ / 128;
    // 11.3137 ~= sqrt(128) = sqrt(D) and 1.4426950408889634 = log2(e): folding
    // log2(e) into the scale lets the softmax below use exp2 instead of exp.
    constexpr float SCALE = (1.0f / 11.3137f) * 1.4426950408889634f;
    // Row strides (in halfs) of the threadgroup arrays. Both are wider than the
    // 128 columns actually stored; presumably padding against bank conflicts -
    // TODO confirm.
    constexpr uint KV_STRIDE = 134;
    constexpr uint Q_FULL_STRIDE = 136;

    // Qs:  16 rows of Q (pre-scaled); reused further down as staging for V.
    // KVs: a 16 x 128 slice of K^T, then the 16 x 128 softmax scores, and
    //      finally the normalised output before it is written to O.
    // simd_scratch: 8 entries per simdgroup for the partner max/sum exchange.
    threadgroup half Qs[16 * Q_FULL_STRIDE];
    threadgroup half KVs[16 * KV_STRIDE];
    threadgroup half simd_scratch[4 * 8];

    uint tileRow = gid.y * 16;        // first query row owned by this threadgroup
    uint sr = (simd_id / 2) * 8;      // row offset of this simdgroup: 0 or 8
    uint sc = (simd_id % 2) * 64;     // column offset of this simdgroup: 0 or 64
    uint partner = simd_id ^ 1;       // simdgroup with the same sr and the other sc
    uint col_group = simd_id % 2;

    using namespace Metalix::tools;
    // Presumably the row (0..7) of the 8-row fragment held by this lane; it is
    // used below as the index into this simdgroup's 8 scratch slots - TODO
    // confirm against Frag::get_coord.
    short my_row = Frag::get_coord(lane_id).y;

    // Persistent output accumulator
    BlockMMA<64> output;
    output.clear();
    float rmax = -INFINITY;
    float rsum = 0.0f;

    // Load Q once
    // All threads cooperatively copy the 16 x D block of Q into Qs, multiplied
    // by SCALE so that the scores come out of the matrix multiply already scaled.
    for (uint i = lid; i < 16 * D; i += 128) {
        uint r = i / D, c = i % D;
        Qs[r * Q_FULL_STRIDE + c] = half(Q[(tileRow + r) * D + c]) * SCALE;
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // One iteration per tile of 128 keys/values.
    for (uint t = 0; t < KEY_TILES; t++) {

        // ── QK^T ──
        BlockMMA<64> scores;
        scores.clear();

        // The D dimension is consumed in chunks of 16.
        for (uint kb = 0; kb < D; kb += 16) {
            // Stage K transposed: KVs row r is feature kb + r, KVs column c is
            // key t * 128 + c.
            for (uint i = lid; i < 16 * 128; i += 128) {
                uint r = i % 16, c = i / 16;
                KVs[r * KV_STRIDE + c] = K[(t * 128 + c) * D + kb + r];
            }
            threadgroup_barrier(mem_flags::mem_threadgroup);

            // Operands: Q rows sr.., columns kb..kb+15, and K^T columns sc...
            // Presumably accumulates their product into scores - TODO confirm
            // against BlockMMA::mma.
            scores.mma<16>(Qs + sr * Q_FULL_STRIDE + kb, Q_FULL_STRIDE,
                           KVs + sc, KV_STRIDE);

            // Keeps the next chunk's writes to KVs from racing with reads above.
            threadgroup_barrier(mem_flags::mem_threadgroup);
        }

        // ── Online softmax ──
        Tile<1, 8> S;
        S.set_coord(lane_id);
        S.from_simd(scores.acc);

        // Each simdgroup only holds 64 of the tile's 128 score columns, so the
        // row max is combined with the partner simdgroup through simd_scratch.
        // The first barrier publishes the writes; the second keeps the scratch
        // slots from being overwritten (by the sum exchange below) before the
        // partner has read them.
        // NOTE(review): tile_max and simd_scratch are half while rmax/rsum are
        // float - confirm the precision is sufficient.
        half tile_max[1];
        S.row_max(tile_max);
        simd_scratch[simd_id * 8 + my_row] = tile_max[0];
        threadgroup_barrier(mem_flags::mem_threadgroup);
        tile_max[0] = max(tile_max[0], simd_scratch[partner * 8 + my_row]);
        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Update the running max and rescale the running sum to it. On the
        // first tile old_max is -INFINITY, so rescale is exp2(-inf).
        float old_max = rmax;
        rmax = max(rmax, float(tile_max[0]));
        float rescale = metal::fast::exp2(old_max - rmax);
        rsum *= rescale;

        // Rescale previous output
        // Round trip: accumulator -> Tile, scale rows, Tile -> accumulator.
        Tile<1, 8> Out;
        Out.set_coord(lane_id);
        Out.from_simd(output.acc);
        float rescale_arr[1] = {rescale};
        Out.row_scale(rescale_arr);
        for (int i = 0; i < 8; i++) Out.at(0, i).to_simd(output.acc[i]);

        // NOTE(review): the helper is handed tile_max (this tile's max), not
        // the running rmax that rsum/output were just rescaled against. If it
        // computes exp2(s - tile_max), tiles whose max is below rmax would be
        // over-weighted - confirm what row_softmax_exp2 does.
        S.row_softmax_exp2(tile_max);
        // Row sums are combined with the partner the same way as the row max.
        half tile_sum[1];
        S.row_sum(tile_sum);
        simd_scratch[simd_id * 8 + my_row] = tile_sum[0];
        threadgroup_barrier(mem_flags::mem_threadgroup);
        tile_sum[0] += simd_scratch[partner * 8 + my_row];
        threadgroup_barrier(mem_flags::mem_threadgroup);
        rsum += tile_sum[0];

        // ── PV: scores × V ──
        // Store scores to KVs for PV multiply
        // Each simdgroup writes its 8 sub-tiles at rows sr.., columns
        // sc + i * 8.., so KVs ends up holding all 16 x 128 scores.
        for (int i = 0; i < 8; i++)
            S.at(0, i).store(KVs + sr * KV_STRIDE + sc + i * 8, KV_STRIDE);
        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Load V blocks into Qs (temp), multiply with scores from KVs
        // Each column group gets its own window inside the Qs rows, starting
        // at column 0 or column 68. This overwrites the Q data held in Qs.
        uint v_col_off = col_group * 68;
        threadgroup half* my_Vs = Qs + v_col_off;

        // The 128 keys of the tile are consumed in chunks of 16.
        for (uint vk = 0; vk < 128; vk += 16) {
            // The two simdgroups that share a col_group (same simd_id % 2) get
            // indices 0..63 between them and together copy 16 rows x 64
            // columns of V (rows t * 128 + vk.., columns sc..).
            uint pair_lid = (simd_id / 2) * 32 + lane_id;
            for (uint i = pair_lid; i < 16 * 64; i += 64) {
                uint r = i / 64, c = i % 64;
                my_Vs[r * Q_FULL_STRIDE + c] = V[(t * 128 + vk + r) * D + sc + c];
            }
            threadgroup_barrier(mem_flags::mem_threadgroup);

            // Operands: score rows sr.., columns vk..vk+15, and the staged V
            // chunk. Presumably accumulates their product into output - TODO
            // confirm against BlockMMA::mma.
            output.mma<16>(KVs + sr * KV_STRIDE + vk, KV_STRIDE,
                           my_Vs, Q_FULL_STRIDE);

            // Keeps the next chunk's writes to Qs from racing with reads above.
            threadgroup_barrier(mem_flags::mem_threadgroup);
        }

        // Reload Q for next tile (V trashed Qs)
        // Skipped after the last tile, where Q is no longer needed.
        if (t + 1 < KEY_TILES) {
            for (uint i = lid; i < 16 * D; i += 128) {
                uint r = i / D, c = i % D;
                Qs[r * Q_FULL_STRIDE + c] = half(Q[(tileRow + r) * D + c]) * SCALE;
            }
            threadgroup_barrier(mem_flags::mem_threadgroup);
        }
    }

    // ── Normalize + coalesced write ──
    Tile<1, 8> O_final;
    O_final.set_coord(lane_id);
    O_final.from_simd(output.acc);

    // Divide the accumulated output by the running softmax denominator.
    float inv_sum[1] = {1.0f / rsum};
    O_final.row_scale(inv_sum);

    // Stage the result in KVs (rows sr.., columns sc + j * 8..) so the copy to
    // O below has consecutive threads writing consecutive addresses.
    for (int j = 0; j < 8; j++)
        O_final.at(0, j).store(KVs + sr * KV_STRIDE + sc + j * 8, KV_STRIDE);
    threadgroup_barrier(mem_flags::mem_threadgroup);

    for (uint i = lid; i < 16 * D; i += 128) {
        uint r = i / D, c = i % D;
        O[(tileRow + r) * D + c] = KVs[r * KV_STRIDE + c];
    }
}

Benchmarking setup

Hardware: Apple M2 (8-core GPU, 100 GB/s bandwidth)
Problem: seq=2048, d=128, single head, fp16
Dispatch: threadgroups = (1, seq/16, 1), threads_per_tg = (128, 1, 1)
Timing: GPU timestamps via cmd->GPUEndTime() - cmd->GPUStartTime(), median of 10 runs after 3 warmup
Correctness: verified against CPU fp32 reference (max_abs_err < 0.001, l2_rel_err ≈ 0.0003)

Times

Mine: ~3230 us / 670 GFLOPS
MLX: ~1550 us / 1400 GFLOPS
Ratio: ~2.1x

Key design choices:

Q is loaded once before the main loop and stays in threadgroup memory
K and V staging reuses existing buffers (MLX-style) — K^T is loaded into KVs for QK^T, scores overwrite it after softmax, then V is loaded into Qs for PV
Since V trashes Qs, Q is reloaded once per key tile (16×128 halfs)
Softmax requires cross-SIMD reduction (the 2×2 split means each simdgroup only has 64 of 128 score columns) via threadgroup scratch + barriers
Output is staged through shared memory for coalesced device writes
MMA is wrapped in a templated BlockMMA<BN> struct

This is a standard attention design: tile Q into 16-row blocks, stream K/V in 128-column tiles, online softmax with running max/sum, accumulate O in registers.

Note: my kernel is specialized for seq=2048, d=128 with compile-time constants (loop bounds, strides, tile counts are all constexpr). MLX's SDPA is fully general. Despite this advantage, I'm still 2.1x slower — which suggests the gap is
architectural, not from runtime overhead.

Any thoughts or advice would be much appreciated!

Lazarus-931 · 2026-04-06T15:28:07Z

Lazarus-931
Apr 6, 2026
Author

For reference, I was testing mlx's sdpa like this:

# Benchmark mx.fast.scaled_dot_product_attention on a single fp16 head.
#
# Usage: script.py [seq] [d] [iters]   (defaults: 2048, 128, 20)
# Prints "<median wall time in microseconds>,<GFLOPS>" on one line.
import sys
import time

import mlx.core as mx


def _cli_int(position, fallback):
    """Return the integer command-line argument at `position`, or `fallback` if absent."""
    return int(sys.argv[position]) if len(sys.argv) > position else fallback


seq = _cli_int(1, 2048)
d = _cli_int(2, 128)
iters = _cli_int(3, 20)

# Random (batch=1, heads=1, seq, d) inputs, materialised before any timing.
shape = (1, 1, seq, d)
Q = mx.random.normal(shape).astype(mx.float16)
K = mx.random.normal(shape).astype(mx.float16)
V = mx.random.normal(shape).astype(mx.float16)
mx.eval(Q, K, V)

scale = 1.0 / d ** 0.5

# Five untimed warm-up runs.
warmup_left = 5
while warmup_left > 0:
    out = mx.fast.scaled_dot_product_attention(Q, K, V, scale=scale)
    mx.eval(out)
    warmup_left -= 1

# Timed runs: wall-clock time around graph construction plus mx.eval, in microseconds.
samples_us = []
for _ in range(iters):
    started = time.perf_counter()
    out = mx.fast.scaled_dot_product_attention(Q, K, V, scale=scale)
    mx.eval(out)
    elapsed_s = time.perf_counter() - started
    samples_us.append(elapsed_s * 1e6)

# Middle element of the sorted samples (the upper of the two when the count is even).
t = sorted(samples_us)[len(samples_us) // 2]
# 4*seq*seq*d for the two matmuls plus 5*seq*seq for the softmax.
flops = seq * seq * (4 * d + 5)
print(f"{t:.0f},{flops / (t * 1e3):.1f}")

0 replies

aicayzer · 2026-06-06T18:07:46Z

aicayzer
Jun 6, 2026

A few likely culprits worth investigating — based on a quick read of your kernel vs mlx's steel_attention (in mlx/backend/metal/kernels/steel/attn/).

1. Your 2×2 simdgroup split is probably the biggest single cost. Each row of scores is owned by 2 simdgroups, so the softmax row-reductions need a partner-SIMD exchange via simd_scratch every key tile — that's 4 barriers (2 for the max exchange, 2 for the sum exchange) + a half-precision dance per tile, × 16 tiles. mlx parameterises the layout as WM×WN (template params on the kernel) — typically WM=4, so each score row lives entirely in one simdgroup and row_max / row_sum become intra-simdgroup reductions with zero barriers.

2. Q reload per tile. Because V trashes Qs, you reload Q (16×128 halfs ≈ 4 KB) on every key tile. That's ~64 KB of redundant device→threadgroup traffic per output tile. mlx reserves a separate slot for V tiles so Q stays resident through the whole sequence.

3. fp16 accumulators for softmax stats. Your simd_scratch is half. mlx defaults AccumType = float specifically for max / sum (see the template signature in steel_attention.h). fp16 max/sum over 2048 keys is lossy enough to need extra correction work elsewhere — and even when it doesn't, the conversion ops cost more than fp32 would.

4. Score writeback for PV. You write scores → KVs → read them back for PV. That's an extra barrier + threadgroup bandwidth per tile. mlx keeps the simdgroup matrix accumulators live through softmax into PV via the mma ops directly.

5. Threadgroup size 128. Caps occupancy — mlx uses WM*WN*32 (typically 128–256+) which gives the scheduler more wavefronts to hide memory latency.

My guess is (1) + (3) together explain most of the 2×. Your constexpr specialisation already gets you full unrolling, so that's not the differentiator vs mlx's templates.

Source for reference: mlx/backend/metal/kernels/steel/attn/ and the attention<T, BQ, BK, BD, WM, WN, MaskType, AccumType> template signature in steel_attention.h line ~60.

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Question about metal gemm #3380

Uh oh!

{{title}}

Uh oh!

Replies: 2 comments

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Select a reply

Uh oh!

Uh oh!

Question about metal gemm #3380

Uh oh!

Lazarus-931 Apr 6, 2026

Benchmarking setup

Times

Replies: 2 comments

Uh oh!

Lazarus-931 Apr 6, 2026 Author

Uh oh!

aicayzer Jun 6, 2026

Lazarus-931
Apr 6, 2026

Lazarus-931
Apr 6, 2026
Author

aicayzer
Jun 6, 2026