diff --git a/.env.example b/.env.example
index 53eeaad..f5ca3e2 100644
--- a/.env.example
+++ b/.env.example
@@ -48,27 +48,19 @@ OPEN_WEBUI_DEFAULT_MODEL=local-chat
 # LLAMACPP_FLASH_ATTN=auto
 # Feature flag: when set to 1, llama.cpp starts with quantized KV cache flags.
 # LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION=0
-# KV cache types used when quantization is enabled. Valid values depend on the
-# llama.cpp build. With the default repo-built image (llamacpp/Dockerfile,
-# AmesianX TurboQuant fork) the TurboQuant types are available alongside the
-# mainline ones. Pick one of:
-#   tbq3_0    — TurboQuant 3-bit + Walsh-Hadamard rotation. Near-neutral quality.
-#   tbq4_0    — TurboQuant 4-bit + WHT. Closest to fp16 quality of the tbq set.
-#   tbqp3_0   — TurboQuant_prod (2-bit Lloyd-Max + 1-bit QJL). Max compression,
-#               marginal quality dip — the paper's "near-2.5 bpw" variant.
-#               Use this to fit 128k context + 19 GB weights in 32 GB VRAM.
-#   tbqp4_0   — TurboQuant_prod (3-bit Lloyd-Max + 1-bit QJL). Safer than tbqp3_0.
-#   tbq3_1 / tbq4_1 / tbqp3_1 / tbqp4_1  — head_dim=128 optimized variants.
-#   tbq3_2 / tbq4_2 / tbqp3_2 / tbqp4_2  — head_dim=64 optimized variants.
-#   q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 — mainline types, always available.
-# Any tbq* / tbqp* type requires Flash Attention; the entrypoint forces
-# --flash-attn on when either K or V is a tbq* type (see
-# scripts/llamacpp/run-llama-server.sh).
-# LLAMACPP_KV_CACHE_TYPE_K=q4_0
-# LLAMACPP_KV_CACHE_TYPE_V=q4_0
-# Image selection. Default is the repo-built TurboQuant build. Set to
-# ghcr.io/ggml-org/llama.cpp:server-cuda to fall back to upstream (loses turbo*).
-# LLAMACPP_IMAGE=ordo-ai-stack/llamacpp-turboquant:latest
+# KV cache types used when quantization is enabled (mainline llama.cpp types):
+#   q8_0  — 8-bit, highest quality of the quantized set. Repo default for the
+#           hybrid A3B (only ~11/41 layers are full-attention, so KV stays ~3 GB
+#           at 512k ctx even at q8_0).
+#   q4_0, q4_1, q5_0, q5_1, iq4_nl — smaller / more aggressive.
+# llama.cpp requires Flash Attention for a quantized V cache: leave
+# LLAMACPP_FLASH_ATTN at auto or set it to on (do not set it to off).
+# LLAMACPP_KV_CACHE_TYPE_K=q8_0
+# LLAMACPP_KV_CACHE_TYPE_V=q8_0
+# Image selection. llama.cpp is pinned by DIGEST in docker-compose.yml
+# (ggml-org mainline build 9765 / 73618f27a — loads Qwen3.6 MTP + qwen35moe
+# natively). Override here ONLY to test a different build, e.g.:
+# LLAMACPP_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda
 # Hard ceiling on tokens per single request. Defense-in-depth against
 # runaway-reasoning loops where --reasoning-budget fails to close the
 # <think> block (model never emits </think> in a confused large-context
@@ -78,9 +70,9 @@ OPEN_WEBUI_DEFAULT_MODEL=local-chat
 # Cap on tokens the model is allowed to spend inside <think>...</think> per
 # response. Hoisted out of LLAMACPP_EXTRA_ARGS so it can be monitored and
 # tuned from one place. Llama.cpp's grammar engine force-closes the block
-# when this is hit. Note: in the current b1-0a8062d build this isn't fully
-# reliable on Qwen3-family models, which is why N_PREDICT above exists as a
-# hard backstop.
+# when this is hit. Note: this isn't fully reliable on Qwen3-family models
+# (the model may not emit a clean </think>), which is why N_PREDICT above
+# exists as a hard backstop.
 # LLAMACPP_REASONING_BUDGET=32768
 # Optional raw llama-server args appended last. Useful for one-off experiments without editing compose.
 # Note: if you previously had `--reasoning-budget N` in here, remove it —
diff --git a/docker-compose.yml b/docker-compose.yml
index aa9a10c..3611690 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -40,13 +40,12 @@ services:
       - backend
 
   llamacpp:
-    # Default points at the repo-built TurboQuant image (llamacpp/Dockerfile).
-    # Escape hatch: set LLAMACPP_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda in
-    # .env to fall back to upstream (loses turbo2 / turbo3 KV types).
-    image: ${LLAMACPP_IMAGE:-ordo-ai-stack/llamacpp-turboquant:latest}
-    build:
-      context: ./llamacpp
-      dockerfile: Dockerfile
+    # Pinned mainline llama.cpp (ggml-org) by DIGEST for reproducibility —
+    # build 9765 (73618f27a), the first build that loads Qwen3.6 MTP + qwen35moe
+    # GGUFs natively (upstream PR #22673). Bump the digest deliberately, here
+    # and on llamacpp-embed below, which repeats it (stack_monitor tracks the
+    # build). Override LLAMACPP_IMAGE in .env only to test a different build.
+    image: ${LLAMACPP_IMAGE:-ghcr.io/ggml-org/llama.cpp@sha256:44cd08334f85c1fcb363e3798302f2986cdb78cdfc8c1cf56bdad44041595a32}
     restart: unless-stopped
     platform: linux/amd64
     entrypoint: ["/bin/sh", "/llamacpp-scripts/run-llama-server.sh"]
@@ -97,8 +96,9 @@ services:
       - backend
 
   # Use CUDA image so linux/amd64 is available (plain :server manifest can resolve to arm64 on Docker Desktop).
+  # Pinned by digest to the same mainline build as llamacpp (build 9765 / 73618f27a); bump both digests together.
   llamacpp-embed:
-    image: ${LLAMACPP_EMBED_IMAGE:-ghcr.io/ggml-org/llama.cpp:server-cuda}
+    image: ${LLAMACPP_EMBED_IMAGE:-ghcr.io/ggml-org/llama.cpp@sha256:44cd08334f85c1fcb363e3798302f2986cdb78cdfc8c1cf56bdad44041595a32}
     restart: unless-stopped
     platform: linux/amd64
     # The upstream :server-cuda is a rolling tag that has flipped its
diff --git a/llamacpp/Dockerfile b/llamacpp/Dockerfile
deleted file mode 100644
index fd7d148..0000000
--- a/llamacpp/Dockerfile
+++ /dev/null
@@ -1,119 +0,0 @@
-# syntax=docker/dockerfile:1.6
-# Custom llama.cpp server built from the AmesianX TurboQuant fork.
-#
-# Ships turbo2 (2.5 bpw) and turbo3 (3.5 bpw) KV-cache quantization types
-# on top of mainline llama.cpp. See docs/configuration.md for the operator
-# reference and the plan note at the bottom of this file for rationale.
-#
-# Pinned commit: AmesianX/TurboQuant @ 0a8062ded5a4a3e050102874bf21a3e5303a95cf
-#                                      (HEAD of main as of 2026-04-22)
-# To bump: `git ls-remote https://github.com/AmesianX/TurboQuant.git HEAD`,
-# update TURBOQUANT_SHA below, rebuild. Re-run the perplexity sanity check
-# after any bump — the fork can rebase onto mainline without warning.
-#
-# Target GPU: RTX 5090 (Blackwell, compute capability 12.0 = sm_120).
-# Building for sm_120 explicitly avoids a PTX JIT stall on the first call.
-
-# CUDA 12.8+ is required for Blackwell sm_120 (RTX 50-series consumer) —
-# earlier versions' nvcc rejects `-gencode=arch=compute_120` with
-# "Unsupported gpu architecture 'compute_120'".
-ARG CUDA_VERSION=12.8.1
-ARG UBUNTU_VERSION=22.04
-
-# --- Build stage --------------------------------------------------------------
-FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS build
-
-ARG TURBOQUANT_REPO=https://github.com/AmesianX/TurboQuant.git
-ARG TURBOQUANT_SHA=0a8062ded5a4a3e050102874bf21a3e5303a95cf
-ARG CMAKE_CUDA_ARCHITECTURES=120
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-# CUDA driver API stubs — libcuda.so.1 is provided by the NVIDIA driver at
-# runtime (via --gpus all + nvidia-container-runtime), but at build time the
-# linker needs a stub to satisfy cuMemCreate / cuMemMap / etc references
-# emitted by ggml-cuda.
-#
-# The toolkit ships /usr/local/cuda/lib64/stubs/libcuda.so (unversioned), but
-# libggml-cuda.so is built with SONAME dependency on libcuda.so.1 — so when
-# linking the executable we need both LIBRARY_PATH AND a versioned symlink.
-# The symlink is created below in the apt-install step; LIBRARY_PATH lets gcc
-# pass the stubs dir to ld via its generated -L flags.
-ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        ca-certificates \
-        cmake \
-        git \
-        libcurl4-openssl-dev \
-        ninja-build \
-        pkg-config \
-    && rm -rf /var/lib/apt/lists/* \
-    && ln -sf libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
-
-WORKDIR /src
-# Fetch only the specific commit tree (no history, no submodules — the fork's
-# .gitmodules is empty as of the pinned SHA). Avoids the `--filter=blob:none`
-# lazy-fetch path which stalled during builds in this environment.
-RUN git init . \
-    && git remote add origin "${TURBOQUANT_REPO}" \
-    && git -c protocol.version=2 fetch --depth 1 origin "${TURBOQUANT_SHA}" \
-    && git checkout --detach FETCH_HEAD
-
-# Flags chosen for this deployment:
-#   GGML_CUDA=ON                    enable CUDA backend
-#   CMAKE_CUDA_ARCHITECTURES=120    Blackwell sm_120 (RTX 5090) — no PTX JIT
-#   GGML_CUDA_FA_ALL_QUANTS=ON      compile Flash Attention for every quant
-#                                   variant. TurboQuant silently corrupts
-#                                   without FA, so this + the shell-wrapper
-#                                   safety rail must both be in place.
-#   GGML_NATIVE=OFF                 don't tune to host CPU; we're CUDA-bound.
-#   LLAMA_BUILD_TESTS=OFF           skip fork test suite; we test at the
-#                                   llama-server arg + runtime level.
-# (LLAMA_CURL is deprecated in current llama.cpp — curl is enabled by default.)
-RUN cmake -S . -B build -G Ninja \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DGGML_CUDA=ON \
-        -DCMAKE_CUDA_ARCHITECTURES="${CMAKE_CUDA_ARCHITECTURES}" \
-        -DGGML_CUDA_FA_ALL_QUANTS=ON \
-        -DGGML_NATIVE=OFF \
-        -DLLAMA_BUILD_TESTS=OFF \
-        -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs" \
-        -DCMAKE_SHARED_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs" \
-    && cmake --build build -j --target llama-server llama-perplexity
-
-# --- Runtime stage ------------------------------------------------------------
-FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-ARG TURBOQUANT_SHA=0a8062ded5a4a3e050102874bf21a3e5303a95cf
-LABEL org.opencontainers.image.source="https://github.com/AmesianX/TurboQuant"
-LABEL org.opencontainers.image.revision="${TURBOQUANT_SHA}"
-LABEL org.opencontainers.image.description="llama.cpp server w/ TurboQuant KV-cache (turbo2/turbo3) for Blackwell sm_120"
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        ca-certificates \
-        curl \
-        libgomp1 \
-    && rm -rf /var/lib/apt/lists/*
-
-WORKDIR /app
-
-# Copy the built binaries plus every ggml / llama shared lib they dlopen.
-# COPY with a trailing slash pulls the whole build/bin directory — simpler
-# than enumerating every libggml-*.so variant, and works whether the fork
-# builds them as unversioned or versioned (libfoo.so.N) symlinks.
-COPY --from=build /src/build/bin/ /app/
-
-ENV LD_LIBRARY_PATH=/app:${LD_LIBRARY_PATH}
-
-EXPOSE 8080
-
-# Intentionally empty — the compose service mounts
-# scripts/llamacpp/run-llama-server.sh as the entrypoint so the stack's existing
-# env-driven arg assembly (with the turbo* -> --flash-attn on safety rail) stays
-# authoritative. See docker-compose.yml for the `entrypoint:` override.
-ENTRYPOINT []
-CMD ["/app/llama-server", "--host", "0.0.0.0", "--port", "8080"]