Layr-Labs · Gajesh2007 · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026
diff --git a/libs/mlx-swift b/libs/mlx-swift
diff --git a/libs/mlx-swift-lm b/libs/mlx-swift-lm
diff --git a/provider-swift/Sources/ProviderCore/Inference/BatchScheduler+B1FastPath.swift b/provider-swift/Sources/ProviderCore/Inference/BatchScheduler+B1FastPath.swift
diff --git a/provider-swift/Sources/ProviderCore/Inference/BatchScheduler+Submit.swift b/provider-swift/Sources/ProviderCore/Inference/BatchScheduler+Submit.swift
@@ -49,7 +49,8 @@ extension BatchScheduler {
         topK: Int? = nil,
         seed: UInt64? = nil,
         requestId: String? = nil,
-        cacheScope: String = ""
+        cacheScope: String = "",
+        allowFastPath: Bool = true
     ) async -> AsyncStream<GenerationEvent> {
         let id = requestId ?? "req-\(UUID().uuidString.prefix(12))"
         let (stream, continuation) = AsyncStream<GenerationEvent>.makeStream()
@@ -89,6 +90,34 @@ extension BatchScheduler {
             continuation.finish()
             return stream
         }
+        // B=1 greedy fast path eligibility — MUST be decided BEFORE inserting our
+        // bridge (it checks `activeBridges.isEmpty` for exclusivity). When taken,
+        // this bypasses the batched engine + planner for a single greedy request;
+        // otherwise the unchanged batched-engine path below runs.
+        let useFastPath = b1FastPathEligible(
+            temperature: temperature,
+            topP: topP,
+            topK: topK,
+            seed: seed,
+            promptTokenCount: promptTokens.count,
+            maxTokens: maxTokens,
+            cacheScope: cacheScope,
+            allowFastPath: allowFastPath
+        )
+        // Concurrency gate: never run the batched engine concurrently with an
+        // in-flight B=1 fast-path task. The fast path assumes it is the sole GPU /
+        // KV consumer (single-row), so a request that is NOT itself taking the
+        // fast path while one is active is rejected with a retryable signal
+        // (`token_budget_exhausted` ⇒ 503 upstream) so it reroutes / retries
+        // rather than overlapping. Inert when the fast path is OFF (the default):
+        // `fastPathTasks` is then always empty, so the engine path is unchanged.
+        if !useFastPath && !fastPathTasks.isEmpty {
+            noteAdmissionReject()
+            continuation.yield(.error(
+                "token_budget_exhausted: a single-request fast path is active; retry shortly"))
+            continuation.finish()
+            return stream
+        }
         let bridge = BridgeState(
             requestId: id,
             promptTokens: promptTokens.count,
@@ -97,6 +126,50 @@ extension BatchScheduler {
         )
         activeBridges[id] = bridge
 
+        if useFastPath {
+            // Reserve KV bytes (cold; no restore, no planner, no engine enqueue)
+            // so the global KV budget and concurrent admissions still account for
+            // this request exactly as the engine path would.
+            let kvOutcome = await reserveKVForRequest(
+                requestId: id,
+                requestTokens: requestBudget,
+                reservationTokens: requestBudget,
+                restorePlanned: false
+            )
+            guard kvOutcome != .failed else {
+                await dropBridge(requestId: id)
+                noteAdmissionReject()
+                continuation.yield(.error(
+                    "token_budget_exhausted: insufficient global KV cache headroom"))
+                continuation.finish()
+                return stream
+            }
+            // Re-check the captured engine is still current and the container is
+            // live after the reserve await — a reload/unload may have run. Use
+            // releaseRequestResources (not bare dropBridge) so the reservation
+            // made above is not leaked if a cancel dropped the bridge meanwhile.
+            guard engineStillCurrent(submitEpoch, engine), let container = self.modelContainer else {
+                await releaseRequestResources(id)
+                continuation.yield(.error("model reloaded during submit; please retry"))
+                continuation.finish()
+                return stream
+            }
+            runGreedyFastPath(
+                requestId: id,
+                container: container,
+                promptTokens: promptTokens,
+                maxTokens: maxTokens,
+                continuation: continuation
+            )
+            let scheduler = self
+            continuation.onTermination = { @Sendable termination in
+                if case .cancelled = termination {
+                    Task { await scheduler.cancel(requestId: id) }
+                }
+            }
+            return stream
+        }
+
         if let planner = self.planner {
             await refreshPlannerPolicy(activeTokenBudget: tokenBudgetMax)
             let result = await planner.admit(
@@ -222,6 +295,19 @@ extension BatchScheduler {
         // Pin the load epoch with the captured engine (see submitTokenized).
         let submitEpoch = generationEpoch
 
+        // Concurrency gate (see submitTokenized): this ChatCompletionRequest path
+        // always uses the batched engine, which must not overlap an in-flight B=1
+        // fast-path task. Reject with a retryable signal while one is active.
+        // Inert when the fast path is OFF (`fastPathTasks` always empty), so the
+        // engine path is unchanged by default.
+        if !fastPathTasks.isEmpty {
+            noteAdmissionReject()
+            continuation.yield(.error(
+                "token_budget_exhausted: a single-request fast path is active; retry shortly"))
+            continuation.finish()
+            return stream
+        }
+
         // Pre-tokenize so chat-template errors surface as `.error` events;
         // engine's internal `buildPrompt` silently falls back to role:content.
         let messages: [[String: any Sendable]] = request.messages.map { msg in
@@ -405,6 +491,14 @@ extension BatchScheduler {
     }
 
     public func cancel(requestId: String) async {
+        // B=1 fast-path request: it runs off-engine, so the engine abort below
+        // can't reach it. Cancel its task; the task observes the cancellation,
+        // runs its own finish bookkeeping (KV release + bridge removal + terminal
+        // events) and clears its handle. (If it already finished and self-removed,
+        // this is a no-op and we fall through to the harmless engine/local path.)
+        if cancelFastPathTask(requestId) {
+            return
+        }
         if let engine = self.engine {
             // Engine delivers a terminal RequestOutput synchronously; the
             // streaming Task handles `recordFinish` + KV release.
@@ -436,6 +530,11 @@ extension BatchScheduler {
     }
 
     public func cancelAll() async {
+        // Cancel any off-engine B=1 fast-path tasks first; each self-removes and
+        // releases its KV/bridge. The bridge-id KV release + removeAll below then
+        // covers the engine-path bridges (and is an idempotent no-op for any
+        // fast-path bridge a racing task already tore down).
+        cancelAllFastPathTasks()
         if let engine = self.engine {
             _ = engine.core.abortAllRequests()
         }

diff --git a/provider-swift/Sources/ProviderCore/Inference/BatchScheduler.swift b/provider-swift/Sources/ProviderCore/Inference/BatchScheduler.swift
@@ -261,6 +261,22 @@ public actor BatchScheduler {
     /// (vs. "request cancelled" for client-initiated aborts).
     var timedOutBridges: Set<String> = []
 
+    /// In-flight B=1 greedy fast-path tasks, keyed by request id. The fast path
+    /// (see `BatchScheduler+B1FastPath.swift`) bypasses the batched engine for a
+    /// single exclusive greedy request and runs `ModelContainer.generate`
+    /// directly, so it is NOT registered with `engine.core` — `cancel` /
+    /// `cancelAll` / `stopCurrentEngine` must cancel these tasks here (the
+    /// engine abort path can't reach them). Each task removes its own entry on
+    /// completion via `clearFastPathTask`.
+    var fastPathTasks: [String: Task<Void, Never>] = [:]
+
+    /// Test-only override for the B=1 fast-path enablement gate. `nil` (the
+    /// production default) defers to the env flags via `b1GreedyFastPathEnabled()`.
+    /// Set via `_setForceB1FastPathForTest` so a benchmark can A/B the fast path
+    /// against the batched engine in one process without relying on mutating the
+    /// (often cached) `ProcessInfo` environment mid-run. @testable-only.
+    internal var _forceB1FastPathForTest: Bool? = nil
+
     // MARK: - Telemetry state (read by `backendCapacity`)
 
     var observedDecodeTpsEwma: Double = 0
@@ -398,6 +414,17 @@ public actor BatchScheduler {
         // submits fail the guard and reject/retry against the next model instead.
         var stoppingEngine = self.engine
         self.engine = nil
+        // Tear down any in-flight B=1 fast-path tasks: they run off-engine via
+        // `ModelContainer.generate`, so the engine abort below can't reach them.
+        // We must FENCE (not just cancel) them here — before this teardown nil's
+        // `modelContainer` and runs `MLX.Memory.clearCache()` below — because a
+        // task still inside its `generate` loop holds and runs GPU work against
+        // the model + its KV. `waitForFastPathTasks` cancels each task and awaits
+        // its unwind (cancellation observation + finish bookkeeping: KV release,
+        // bridge removal, terminal events). The await suspends this actor so those
+        // callbacks make progress; no new fast path can start because `engine` is
+        // already nil above (every submit path short-circuits on a nil engine).
+        await waitForFastPathTasks()
         pendingTimeoutTask?.cancel()
         pendingTimeoutTask = nil
         // Stop the backend-liveness watchdog; a recovery restart re-arms it via

diff --git a/provider-swift/Sources/ProviderCore/Inference/MultiModelBatchSchedulerEngine.swift b/provider-swift/Sources/ProviderCore/Inference/MultiModelBatchSchedulerEngine.swift
@@ -350,7 +350,11 @@ public struct MultiModelBatchSchedulerEngine: MLXServerEngine, Sendable {
             topP: request.topP,
             topK: request.topK,
             requestId: requestId,
-            cacheScope: cacheScope
+            cacheScope: cacheScope,
+            // Keep tool-bearing requests off the greedy text-only B=1 fast path:
+            // it cannot reproduce the engine's raw-text tool-call contract. No
+            // tools ⇒ fast path may apply (subject to the scheduler's gates).
+            allowFastPath: toolHandler == nil
         )
 
         return AsyncThrowingStream { continuation in

diff --git a/provider-swift/Sources/ProviderCore/Inference/VLMRequestInference.swift b/provider-swift/Sources/ProviderCore/Inference/VLMRequestInference.swift
@@ -105,15 +105,33 @@ public enum VLMRequestInference {
         request.maxTokens ?? defaultMaxTokens
     }
 
-    /// Conservative per-image soft-token allotment for the KV-token estimate.
-    /// Gemma-4 pools every image to a FIXED `vision_soft_tokens_per_image` (256)
-    /// regardless of resolution; other VLMs run higher. 1024 (4× Gemma) is a
-    /// generous model-agnostic upper bound that is still bounded by the model's
-    /// context window via the clamp in `projectedKVTokens`.
+    /// Conservative per-image (and per-video-frame) soft-token allotment for the
+    /// KV-token estimate. Gemma-4 pools every image/frame to a FIXED soft-token
+    /// block (`image_seq_length`, default 280) regardless of resolution and wraps
+    /// it with 2 `boi`/`eoi` delimiter tokens; other VLMs run higher. 1024 is a
+    /// generous model-agnostic upper bound that over-covers that whole
+    /// `boi + soft_tokens + eoi` per-frame span, and is still bounded by the
+    /// model's context window via the clamp in `projectedKVTokens`.
     static let visionTokensPerImage = 1024
-    /// A video samples multiple frames, each contributing image-like soft tokens.
-    /// Charge a larger fixed allotment per video; still clamped to the context.
-    static let visionTokensPerVideo = 4096
+    /// Max frames Gemma-4 samples from a single video. Mirrors `maxFrames: 32`
+    /// in the Gemma4 video processor (`Gemma4Processor.prepare`), which samples
+    /// up to 32 frames spread uniformly across the clip and expands EACH into its
+    /// own image-like `boi + soft_token*image_seq_length + eoi` block. A video's
+    /// KV footprint therefore scales with the sampled frame count — it is NOT a
+    /// flat per-video allotment.
+    static let maxVideoFramesSampled = 32
+    /// A video reserves KV for EVERY sampled frame: the processor emits one
+    /// image-like soft-token block per sampled frame (up to
+    /// `maxVideoFramesSampled`), so the worst case is `maxVideoFramesSampled ×
+    /// visionTokensPerImage`. Because `visionTokensPerImage` already over-covers
+    /// a single frame's soft tokens AND its `boi`/`eoi` delimiters, this product
+    /// bounds the full `32 × (soft tokens + delimiters)` span the prefill
+    /// actually writes into KV. The previous flat 4096 covered only ~4 frames and
+    /// badly under-reserved a full clip (Gemma-4's real worst case is
+    /// 32 × (280 + 2) = 9024 soft tokens). Still clamped to the model's context
+    /// window via the clamp in `projectedKVTokens`, so over-reservation never
+    /// projects past a request the context could actually hold.
+    static let visionTokensPerVideo = maxVideoFramesSampled * visionTokensPerImage
     /// Conservative chars→tokens divisor for the text prompt estimate. Real
     /// tokenizers average ~4 chars/token; dividing by 3 OVER-estimates the token
     /// count (the safe direction for a reservation).
+3 −1		Source/MLX/Device.swift
+3 −1		Source/MLX/MLXArray.swift
+3 −1		Source/MLX/Stream.swift
+23 −5		Source/MLX/Transforms+Compile.swift
+61 −0		Tests/MLXTests/TransformTests.swift
+10 −6		Libraries/MLXEmbedders/Models/Gemma3.swift
+4 −2		Libraries/MLXEmbedders/Models/NomicBert.swift
+29 −8		Libraries/MLXLLM/Models/GatedDelta.swift
+55 −7		Libraries/MLXLLM/Models/Gemma4Text.swift
+26 −13		Libraries/MLXLLM/Models/LFM2MoE.swift
+1 −1		Libraries/MLXLLM/Models/SSM.swift
+146 −4		Libraries/MLXLMCommon/BatchKVCache.swift
+255 −0		Libraries/MLXLMCommon/CompilableKVCache.swift
+76 −0		Libraries/MLXLMCommon/CompiledDecode.swift
+83 −0		Libraries/MLXLMCommon/DynamicSlice.swift
+10 −4		Libraries/MLXLMCommon/Evaluate.swift
+107 −12		Libraries/MLXLMCommon/KVCache.swift
+39 −0		Libraries/MLXLMCommon/MLXHardwareInfo.swift
+8 −0		Libraries/MLXLMCommon/Registries/ModelTypeRegistry.swift
+31 −73		Libraries/MLXLMCommon/RoPEUtils.swift
+173 −41		Libraries/MLXLMCommon/SwitchLayers.swift
+18 −0		Libraries/MLXLMCommon/Tool/Parsers/JSONToolCallParser.swift
+10 −2		Libraries/MLXLMCommon/Tool/ToolCallFormat.swift
+57 −6		Libraries/MLXLMCommon/Tool/ToolCallProcessor.swift
+23 −3		Libraries/MLXLMCommon/UserInput.swift
+6 −1		Libraries/MLXVLM/MediaProcessing.swift
+13 −1		Libraries/MLXVLM/Models/FastVLM.swift
+31 −12		Libraries/MLXVLM/Models/Gemma3.swift
+1,231 −2,340		Libraries/MLXVLM/Models/Gemma4.swift
+29 −2		Libraries/MLXVLM/Models/Idefics3.swift
+17 −1		Libraries/MLXVLM/Models/LFM2VL.swift
+18 −1		Libraries/MLXVLM/Models/Mistral3.swift
+18 −2		Libraries/MLXVLM/Models/Pixtral.swift
+20 −5		Libraries/MLXVLM/Models/Qwen25VL.swift
+29 −14		Libraries/MLXVLM/Models/Qwen2VL.swift
+17 −127		Libraries/MLXVLM/Models/Qwen35.swift
+9 −2		Libraries/MLXVLM/Models/SmolVLM2.swift
+20 −6		Libraries/MLXVLM/README.md
+3 −0		Package.swift
+68 −0		Tests/MLXLMServerTests/ToolCallParserIntegrationTests.swift
+313 −0		Tests/MLXLMTests/BatchRotatingKVCacheFastPathTests.swift
+196 −0		Tests/MLXLMTests/CompilableKVCacheTests.swift
+82 −0		Tests/MLXLMTests/GatedDeltaTests.swift
+134 −0		Tests/MLXLMTests/LFM2MoERoutingTests.swift
+53 −0		Tests/MLXLMTests/NomicBertTests.swift
+105 −0		Tests/MLXLMTests/Qwen35SanitizeTests.swift
+73 −0		Tests/MLXLMTests/Qwen35VLMGatedDeltaTests.swift
+39 −0		Tests/MLXLMTests/QwenImageBudgetTests.swift
+83 −0		Tests/MLXLMTests/SmolVLM2TilingTests.swift
+111 −0		Tests/MLXLMTests/SwitchGLUTests.swift
+51 −3		Tests/MLXLMTests/UserInputTests.swift