Skip to content
2 changes: 1 addition & 1 deletion libs/mlx-swift-lm
Submodule mlx-swift-lm updated 46 files
+10 −6 Libraries/MLXEmbedders/Models/Gemma3.swift
+4 −2 Libraries/MLXEmbedders/Models/NomicBert.swift
+29 −8 Libraries/MLXLLM/Models/GatedDelta.swift
+55 −7 Libraries/MLXLLM/Models/Gemma4Text.swift
+26 −13 Libraries/MLXLLM/Models/LFM2MoE.swift
+1 −1 Libraries/MLXLLM/Models/SSM.swift
+146 −4 Libraries/MLXLMCommon/BatchKVCache.swift
+255 −0 Libraries/MLXLMCommon/CompilableKVCache.swift
+76 −0 Libraries/MLXLMCommon/CompiledDecode.swift
+83 −0 Libraries/MLXLMCommon/DynamicSlice.swift
+10 −4 Libraries/MLXLMCommon/Evaluate.swift
+107 −12 Libraries/MLXLMCommon/KVCache.swift
+39 −0 Libraries/MLXLMCommon/MLXHardwareInfo.swift
+8 −0 Libraries/MLXLMCommon/Registries/ModelTypeRegistry.swift
+31 −73 Libraries/MLXLMCommon/RoPEUtils.swift
+173 −41 Libraries/MLXLMCommon/SwitchLayers.swift
+18 −0 Libraries/MLXLMCommon/Tool/Parsers/JSONToolCallParser.swift
+10 −2 Libraries/MLXLMCommon/Tool/ToolCallFormat.swift
+57 −6 Libraries/MLXLMCommon/Tool/ToolCallProcessor.swift
+23 −3 Libraries/MLXLMCommon/UserInput.swift
+6 −1 Libraries/MLXVLM/MediaProcessing.swift
+13 −1 Libraries/MLXVLM/Models/FastVLM.swift
+31 −12 Libraries/MLXVLM/Models/Gemma3.swift
+1,231 −2,340 Libraries/MLXVLM/Models/Gemma4.swift
+29 −2 Libraries/MLXVLM/Models/Idefics3.swift
+17 −1 Libraries/MLXVLM/Models/LFM2VL.swift
+18 −1 Libraries/MLXVLM/Models/Mistral3.swift
+18 −2 Libraries/MLXVLM/Models/Pixtral.swift
+20 −5 Libraries/MLXVLM/Models/Qwen25VL.swift
+29 −14 Libraries/MLXVLM/Models/Qwen2VL.swift
+17 −127 Libraries/MLXVLM/Models/Qwen35.swift
+9 −2 Libraries/MLXVLM/Models/SmolVLM2.swift
+20 −6 Libraries/MLXVLM/README.md
+3 −0 Package.swift
+68 −0 Tests/MLXLMServerTests/ToolCallParserIntegrationTests.swift
+313 −0 Tests/MLXLMTests/BatchRotatingKVCacheFastPathTests.swift
+196 −0 Tests/MLXLMTests/CompilableKVCacheTests.swift
+82 −0 Tests/MLXLMTests/GatedDeltaTests.swift
+134 −0 Tests/MLXLMTests/LFM2MoERoutingTests.swift
+53 −0 Tests/MLXLMTests/NomicBertTests.swift
+105 −0 Tests/MLXLMTests/Qwen35SanitizeTests.swift
+73 −0 Tests/MLXLMTests/Qwen35VLMGatedDeltaTests.swift
+39 −0 Tests/MLXLMTests/QwenImageBudgetTests.swift
+83 −0 Tests/MLXLMTests/SmolVLM2TilingTests.swift
+111 −0 Tests/MLXLMTests/SwitchGLUTests.swift
+51 −3 Tests/MLXLMTests/UserInputTests.swift

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ extension BatchScheduler {
topK: Int? = nil,
seed: UInt64? = nil,
requestId: String? = nil,
cacheScope: String = ""
cacheScope: String = "",
allowFastPath: Bool = true
) async -> AsyncStream<GenerationEvent> {
let id = requestId ?? "req-\(UUID().uuidString.prefix(12))"
let (stream, continuation) = AsyncStream<GenerationEvent>.makeStream()
Expand Down Expand Up @@ -89,6 +90,34 @@ extension BatchScheduler {
continuation.finish()
return stream
}
// B=1 greedy fast path eligibility — MUST be decided BEFORE inserting our
// bridge (it checks `activeBridges.isEmpty` for exclusivity). When taken,
// this bypasses the batched engine + planner for a single greedy request;
// otherwise the unchanged batched-engine path below runs.
let useFastPath = b1FastPathEligible(
temperature: temperature,
topP: topP,
topK: topK,
seed: seed,
promptTokenCount: promptTokens.count,
maxTokens: maxTokens,
cacheScope: cacheScope,
allowFastPath: allowFastPath
)
Comment on lines +100 to +106

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve context-window checks on the fast path

When the fast path is enabled, eligibility is decided without the prompt length or maxContextLength, and this branch returns before the planner's maxTokensPerBatch/context-window rejection runs. A greedy request whose prompt is longer than the model context but still fits the memory token budget can therefore be sent directly to ModelContainer.generate instead of producing the deterministic context error the engine path emits, risking runtime failures or malformed output for over-context prompts.

Useful? React with 👍 / 👎.

// Concurrency gate: never run the batched engine concurrently with an
// in-flight B=1 fast-path task. The fast path assumes it is the sole GPU /
// KV consumer (single-row), so a request that is NOT itself taking the
// fast path while one is active is rejected with a retryable signal
// (`token_budget_exhausted` ⇒ 503 upstream) so it reroutes / retries
// rather than overlapping. Inert when the fast path is OFF (the default):
// `fastPathTasks` is then always empty, so the engine path is unchanged.
if !useFastPath && !fastPathTasks.isEmpty {
noteAdmissionReject()
continuation.yield(.error(
"token_budget_exhausted: a single-request fast path is active; retry shortly"))
continuation.finish()
return stream
}
let bridge = BridgeState(
requestId: id,
promptTokens: promptTokens.count,
Expand All @@ -97,6 +126,50 @@ extension BatchScheduler {
)
activeBridges[id] = bridge

if useFastPath {
// Reserve KV bytes (cold; no restore, no planner, no engine enqueue)
// so the global KV budget and concurrent admissions still account for
// this request exactly as the engine path would.
let kvOutcome = await reserveKVForRequest(
requestId: id,
requestTokens: requestBudget,
reservationTokens: requestBudget,
restorePlanned: false
Comment on lines +133 to +137

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Reserve fp16 KV for the fast path

When the B=1 path is enabled on a KV-quantized model, this branch reserves through reserveKVForRequest, which charges kvBytesPerToken for the batched/quantized cache. The request then runs through ModelContainer.generate, the same non-batched path that reserveVisionRequest explicitly accounts with fp16KVBytesPerToken; as a result the fast-path request can hold substantially more KV than it reserved and let later admissions pass the 90% cap, causing OOMs under concurrent traffic. Use the fp16 byte reservation or disable this path when KV quantization is active.

Useful? React with 👍 / 👎.

)
guard kvOutcome != .failed else {
await dropBridge(requestId: id)
noteAdmissionReject()
continuation.yield(.error(
"token_budget_exhausted: insufficient global KV cache headroom"))
continuation.finish()
return stream
}
// Re-check the captured engine is still current and the container is
// live after the reserve await — a reload/unload may have run. Use
// releaseRequestResources (not bare dropBridge) so the reservation
// made above is not leaked if a cancel dropped the bridge meanwhile.
guard engineStillCurrent(submitEpoch, engine), let container = self.modelContainer else {
await releaseRequestResources(id)
continuation.yield(.error("model reloaded during submit; please retry"))
continuation.finish()
return stream
}
runGreedyFastPath(
requestId: id,
container: container,
promptTokens: promptTokens,
maxTokens: maxTokens,
continuation: continuation
)
let scheduler = self
continuation.onTermination = { @Sendable termination in
if case .cancelled = termination {
Task { await scheduler.cancel(requestId: id) }
}
}
return stream

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Keep fast-path requests in the concurrency gate

This returns before planner.admit, so the planner never counts the direct ModelContainer.generate request as active. While that request is decoding, a later non-fast-path submit can still be queued and scheduled because BatchQueuePlanner sees active.count == 0 even when maxConcurrentRequests == 1, which breaks the branch's “single exclusive” assumption and can overlap engine work with the fast path. Either register the fast-path request with the planner or block new engine admissions until it finishes.

Useful? React with 👍 / 👎.

}

if let planner = self.planner {
await refreshPlannerPolicy(activeTokenBudget: tokenBudgetMax)
let result = await planner.admit(
Expand Down Expand Up @@ -222,6 +295,19 @@ extension BatchScheduler {
// Pin the load epoch with the captured engine (see submitTokenized).
let submitEpoch = generationEpoch

// Concurrency gate (see submitTokenized): this ChatCompletionRequest path
// always uses the batched engine, which must not overlap an in-flight B=1
// fast-path task. Reject with a retryable signal while one is active.
// Inert when the fast path is OFF (`fastPathTasks` always empty), so the
// engine path is unchanged by default.
if !fastPathTasks.isEmpty {
noteAdmissionReject()
continuation.yield(.error(
"token_budget_exhausted: a single-request fast path is active; retry shortly"))
continuation.finish()
return stream
}

// Pre-tokenize so chat-template errors surface as `.error` events;
// engine's internal `buildPrompt` silently falls back to role:content.
let messages: [[String: any Sendable]] = request.messages.map { msg in
Expand Down Expand Up @@ -405,6 +491,14 @@ extension BatchScheduler {
}

public func cancel(requestId: String) async {
// B=1 fast-path request: it runs off-engine, so the engine abort below
// can't reach it. Cancel its task; the task observes the cancellation,
// runs its own finish bookkeeping (KV release + bridge removal + terminal
// events) and clears its handle. (If it already finished and self-removed,
// this is a no-op and we fall through to the harmless engine/local path.)
if cancelFastPathTask(requestId) {
return
}
if let engine = self.engine {
// Engine delivers a terminal RequestOutput synchronously; the
// streaming Task handles `recordFinish` + KV release.
Expand Down Expand Up @@ -436,6 +530,11 @@ extension BatchScheduler {
}

public func cancelAll() async {
// Cancel any off-engine B=1 fast-path tasks first; each self-removes and
// releases its KV/bridge. The bridge-id KV release + removeAll below then
// covers the engine-path bridges (and is an idempotent no-op for any
// fast-path bridge a racing task already tore down).
cancelAllFastPathTasks()
if let engine = self.engine {
_ = engine.core.abortAllRequests()
}
Expand Down
27 changes: 27 additions & 0 deletions provider-swift/Sources/ProviderCore/Inference/BatchScheduler.swift
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,22 @@ public actor BatchScheduler {
/// (vs. "request cancelled" for client-initiated aborts).
var timedOutBridges: Set<String> = []

/// In-flight B=1 greedy fast-path tasks, keyed by request id. The fast path
/// (see `BatchScheduler+B1FastPath.swift`) bypasses the batched engine for a
/// single exclusive greedy request and runs `ModelContainer.generate`
/// directly, so it is NOT registered with `engine.core` — `cancel` /
/// `cancelAll` / `stopCurrentEngine` must cancel these tasks here (the
/// engine abort path can't reach them). Each task removes its own entry on
/// completion via `clearFastPathTask`.
var fastPathTasks: [String: Task<Void, Never>] = [:]

/// Test-only override for the B=1 fast-path enablement gate. `nil` (the
/// production default) defers to the env flags via `b1GreedyFastPathEnabled()`.
/// Set via `_setForceB1FastPathForTest` so a benchmark can A/B the fast path
/// against the batched engine in one process without relying on mutating the
/// (often cached) `ProcessInfo` environment mid-run. @testable-only.
internal var _forceB1FastPathForTest: Bool? = nil

// MARK: - Telemetry state (read by `backendCapacity`)

var observedDecodeTpsEwma: Double = 0
Expand Down Expand Up @@ -398,6 +414,17 @@ public actor BatchScheduler {
// submits fail the guard and reject/retry against the next model instead.
var stoppingEngine = self.engine
self.engine = nil
// Tear down any in-flight B=1 fast-path tasks: they run off-engine via
// `ModelContainer.generate`, so the engine abort below can't reach them.
// We must FENCE (not just cancel) them here — before this teardown nil's
// `modelContainer` and runs `MLX.Memory.clearCache()` below — because a
// task still inside its `generate` loop holds and runs GPU work against
// the model + its KV. `waitForFastPathTasks` cancels each task and awaits
// its unwind (cancellation observation + finish bookkeeping: KV release,
// bridge removal, terminal events). The await suspends this actor so those
// callbacks make progress; no new fast path can start because `engine` is
// already nil above (every submit path short-circuits on a nil engine).
await waitForFastPathTasks()
pendingTimeoutTask?.cancel()
pendingTimeoutTask = nil
// Stop the backend-liveness watchdog; a recovery restart re-arms it via
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,11 @@ public struct MultiModelBatchSchedulerEngine: MLXServerEngine, Sendable {
topP: request.topP,
topK: request.topK,
requestId: requestId,
cacheScope: cacheScope
cacheScope: cacheScope,
// Keep tool-bearing requests off the greedy text-only B=1 fast path:
// it cannot reproduce the engine's raw-text tool-call contract. No
// tools ⇒ fast path may apply (subject to the scheduler's gates).
allowFastPath: toolHandler == nil
)

return AsyncThrowingStream { continuation in
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,15 +105,33 @@ public enum VLMRequestInference {
request.maxTokens ?? defaultMaxTokens
}

/// Conservative per-image soft-token allotment for the KV-token estimate.
/// Gemma-4 pools every image to a FIXED `vision_soft_tokens_per_image` (256)
/// regardless of resolution; other VLMs run higher. 1024 (4× Gemma) is a
/// generous model-agnostic upper bound that is still bounded by the model's
/// context window via the clamp in `projectedKVTokens`.
/// Conservative per-image (and per-video-frame) soft-token allotment for the
/// KV-token estimate. Gemma-4 pools every image/frame to a FIXED soft-token
/// block (`image_seq_length`, default 280) regardless of resolution and wraps
/// it with 2 `boi`/`eoi` delimiter tokens; other VLMs run higher. 1024 is a
/// generous model-agnostic upper bound that over-covers that whole
/// `boi + soft_tokens + eoi` per-frame span, and is still bounded by the
/// model's context window via the clamp in `projectedKVTokens`.
static let visionTokensPerImage = 1024
/// A video samples multiple frames, each contributing image-like soft tokens.
/// Charge a larger fixed allotment per video; still clamped to the context.
static let visionTokensPerVideo = 4096
/// Max frames Gemma-4 samples from a single video. Mirrors `maxFrames: 32`
/// in the Gemma4 video processor (`Gemma4Processor.prepare`), which samples
/// up to 32 frames spread uniformly across the clip and expands EACH into its
/// own image-like `boi + soft_token*image_seq_length + eoi` block. A video's
/// KV footprint therefore scales with the sampled frame count — it is NOT a
/// flat per-video allotment.
static let maxVideoFramesSampled = 32
/// A video reserves KV for EVERY sampled frame: the processor emits one
/// image-like soft-token block per sampled frame (up to
/// `maxVideoFramesSampled`), so the worst case is `maxVideoFramesSampled ×
/// visionTokensPerImage`. Because `visionTokensPerImage` already over-covers
/// a single frame's soft tokens AND its `boi`/`eoi` delimiters, this product
/// bounds the full `32 × (soft tokens + delimiters)` span the prefill
/// actually writes into KV. The previous flat 4096 covered only ~4 frames and
/// badly under-reserved a full clip (Gemma-4's real worst case is
/// 32 × (280 + 2) = 9024 soft tokens). Still clamped to the model's context
/// window via the clamp in `projectedKVTokens`, so over-reservation never
/// projects past a request the context could actually hold.
static let visionTokensPerVideo = maxVideoFramesSampled * visionTokensPerImage
/// Conservative chars→tokens divisor for the text prompt estimate. Real
/// tokenizers average ~4 chars/token; dividing by 3 OVER-estimates the token
/// count (the safe direction for a reservation).
Expand Down
Loading
Loading