diff --git a/.env.example b/.env.example
index 53eeaad..f5ca3e2 100644
--- a/.env.example
+++ b/.env.example
@@ -48,27 +48,19 @@ OPEN_WEBUI_DEFAULT_MODEL=local-chat
# LLAMACPP_FLASH_ATTN=auto
# Feature flag: when set to 1, llama.cpp starts with quantized KV cache flags.
# LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION=0
-# KV cache types used when quantization is enabled. Valid values depend on the
-# llama.cpp build. With the default repo-built image (llamacpp/Dockerfile,
-# AmesianX TurboQuant fork) the TurboQuant types are available alongside the
-# mainline ones. Pick one of:
-# tbq3_0 — TurboQuant 3-bit + Walsh-Hadamard rotation. Near-neutral quality.
-# tbq4_0 — TurboQuant 4-bit + WHT. Closest to fp16 quality of the tbq set.
-# tbqp3_0 — TurboQuant_prod (2-bit Lloyd-Max + 1-bit QJL). Max compression,
-# marginal quality dip — the paper's "near-2.5 bpw" variant.
-# Use this to fit 128k context + 19 GB weights in 32 GB VRAM.
-# tbqp4_0 — TurboQuant_prod (3-bit Lloyd-Max + 1-bit QJL). Safer than tbqp3_0.
-# tbq3_1 / tbq4_1 / tbqp3_1 / tbqp4_1 — head_dim=128 optimized variants.
-# tbq3_2 / tbq4_2 / tbqp3_2 / tbqp4_2 — head_dim=64 optimized variants.
-# q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 — mainline types, always available.
-# Any tbq* / tbqp* type requires Flash Attention; the entrypoint forces
-# --flash-attn on when either K or V is a tbq* type (see
-# scripts/llamacpp/run-llama-server.sh).
-# LLAMACPP_KV_CACHE_TYPE_K=q4_0
-# LLAMACPP_KV_CACHE_TYPE_V=q4_0
-# Image selection. Default is the repo-built TurboQuant build. Set to
-# ghcr.io/ggml-org/llama.cpp:server-cuda to fall back to upstream (loses turbo*).
-# LLAMACPP_IMAGE=ordo-ai-stack/llamacpp-turboquant:latest
+# KV cache types used when quantization is enabled (mainline llama.cpp types):
+# q8_0 — 8-bit, highest quality of the quantized set. Repo default for the
+# hybrid A3B (only ~11/41 layers are full-attention, so KV stays ~3 GB
+# at 512k ctx even at q8_0).
+# q4_0, q4_1, q5_0, q5_1, iq4_nl — smaller / more aggressive.
+# A quantized V cache requires Flash Attention in llama.cpp; keep
+# LLAMACPP_FLASH_ATTN=on (or leave it at auto) when enabling quantization.
+# LLAMACPP_KV_CACHE_TYPE_K=q8_0
+# LLAMACPP_KV_CACHE_TYPE_V=q8_0
+# Image selection. llama.cpp is pinned by DIGEST in docker-compose.yml
+# (ggml-org mainline build 9765 / 73618f27a — loads Qwen3.6 MTP + qwen35moe
+# natively). Override here ONLY to test a different build, e.g.:
+# LLAMACPP_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda
# Hard ceiling on tokens per single request. Defense-in-depth against
# runaway-reasoning loops where --reasoning-budget fails to close the
# block (model never emits </think> in a confused large-context
@@ -78,9 +70,9 @@ OPEN_WEBUI_DEFAULT_MODEL=local-chat
# Cap on tokens the model is allowed to spend inside <think>...</think> per
# response. Hoisted out of LLAMACPP_EXTRA_ARGS so it can be monitored and
# tuned from one place. Llama.cpp's grammar engine force-closes the block
-# when this is hit. Note: in the current b1-0a8062d build this isn't fully
-# reliable on Qwen3-family models, which is why N_PREDICT above exists as a
-# hard backstop.
+# when this is hit. Note: this isn't fully reliable on Qwen3-family models
+# (the model may not emit a clean </think>), which is why N_PREDICT above
+# exists as a hard backstop.
# LLAMACPP_REASONING_BUDGET=32768
# Optional raw llama-server args appended last. Useful for one-off experiments without editing compose.
# Note: if you previously had `--reasoning-budget N` in here, remove it —
diff --git a/docker-compose.yml b/docker-compose.yml
index aa9a10c..3611690 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -40,13 +40,12 @@ services:
- backend
llamacpp:
- # Default points at the repo-built TurboQuant image (llamacpp/Dockerfile).
- # Escape hatch: set LLAMACPP_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda in
- # .env to fall back to upstream (loses turbo2 / turbo3 KV types).
- image: ${LLAMACPP_IMAGE:-ordo-ai-stack/llamacpp-turboquant:latest}
- build:
- context: ./llamacpp
- dockerfile: Dockerfile
+ # Pinned mainline llama.cpp (ggml-org) by DIGEST for reproducibility —
+ # build 9765 (73618f27a), the first build that loads Qwen3.6 MTP + qwen35moe
+ # GGUFs natively (upstream PR #22673). This compose file is the source of truth
+ # for the image; bump this digest and llamacpp-embed's together (stack_monitor tracks the build).
+ # Override LLAMACPP_IMAGE in .env only to test a different build.
+ image: ${LLAMACPP_IMAGE:-ghcr.io/ggml-org/llama.cpp@sha256:44cd08334f85c1fcb363e3798302f2986cdb78cdfc8c1cf56bdad44041595a32}
restart: unless-stopped
platform: linux/amd64
entrypoint: ["/bin/sh", "/llamacpp-scripts/run-llama-server.sh"]
@@ -97,8 +96,9 @@ services:
- backend
# Use CUDA image so linux/amd64 is available (plain :server manifest can resolve to arm64 on Docker Desktop).
+ # Pinned by digest to the same mainline build as llamacpp (build 9765 / 73618f27a); bump both digests together.
llamacpp-embed:
- image: ${LLAMACPP_EMBED_IMAGE:-ghcr.io/ggml-org/llama.cpp:server-cuda}
+ image: ${LLAMACPP_EMBED_IMAGE:-ghcr.io/ggml-org/llama.cpp@sha256:44cd08334f85c1fcb363e3798302f2986cdb78cdfc8c1cf56bdad44041595a32}
restart: unless-stopped
platform: linux/amd64
# The upstream :server-cuda is a rolling tag that has flipped its
diff --git a/llamacpp/Dockerfile b/llamacpp/Dockerfile
deleted file mode 100644
index fd7d148..0000000
--- a/llamacpp/Dockerfile
+++ /dev/null
@@ -1,119 +0,0 @@
-# syntax=docker/dockerfile:1.6
-# Custom llama.cpp server built from the AmesianX TurboQuant fork.
-#
-# Ships turbo2 (2.5 bpw) and turbo3 (3.5 bpw) KV-cache quantization types
-# on top of mainline llama.cpp. See docs/configuration.md for the operator
-# reference and the plan note at the bottom of this file for rationale.
-#
-# Pinned commit: AmesianX/TurboQuant @ 0a8062ded5a4a3e050102874bf21a3e5303a95cf
-# (HEAD of main as of 2026-04-22)
-# To bump: `git ls-remote https://github.com/AmesianX/TurboQuant.git HEAD`,
-# update TURBOQUANT_SHA below, rebuild. Re-run the perplexity sanity check
-# after any bump — the fork can rebase onto mainline without warning.
-#
-# Target GPU: RTX 5090 (Blackwell, compute capability 12.0 = sm_120).
-# Building for sm_120 explicitly avoids a PTX JIT stall on the first call.
-
-# CUDA 12.8+ is required for Blackwell sm_120 (RTX 50-series consumer) —
-# earlier versions' nvcc rejects `-gencode=arch=compute_120` with
-# "Unsupported gpu architecture 'compute_120'".
-ARG CUDA_VERSION=12.8.1
-ARG UBUNTU_VERSION=22.04
-
-# --- Build stage --------------------------------------------------------------
-FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS build
-
-ARG TURBOQUANT_REPO=https://github.com/AmesianX/TurboQuant.git
-ARG TURBOQUANT_SHA=0a8062ded5a4a3e050102874bf21a3e5303a95cf
-ARG CMAKE_CUDA_ARCHITECTURES=120
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-# CUDA driver API stubs — libcuda.so.1 is provided by the NVIDIA driver at
-# runtime (via --gpus all + nvidia-container-runtime), but at build time the
-# linker needs a stub to satisfy cuMemCreate / cuMemMap / etc references
-# emitted by ggml-cuda.
-#
-# The toolkit ships /usr/local/cuda/lib64/stubs/libcuda.so (unversioned), but
-# libggml-cuda.so is built with SONAME dependency on libcuda.so.1 — so when
-# linking the executable we need both LIBRARY_PATH AND a versioned symlink.
-# The symlink is created below in the apt-install step; LIBRARY_PATH lets gcc
-# pass the stubs dir to ld via its generated -L flags.
-ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
- build-essential \
- ca-certificates \
- cmake \
- git \
- libcurl4-openssl-dev \
- ninja-build \
- pkg-config \
- && rm -rf /var/lib/apt/lists/* \
- && ln -sf libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
-
-WORKDIR /src
-# Fetch only the specific commit tree (no history, no submodules — the fork's
-# .gitmodules is empty as of the pinned SHA). Avoids the `--filter=blob:none`
-# lazy-fetch path which stalled during builds in this environment.
-RUN git init . \
- && git remote add origin "${TURBOQUANT_REPO}" \
- && git -c protocol.version=2 fetch --depth 1 origin "${TURBOQUANT_SHA}" \
- && git checkout --detach FETCH_HEAD
-
-# Flags chosen for this deployment:
-# GGML_CUDA=ON enable CUDA backend
-# CMAKE_CUDA_ARCHITECTURES=120 Blackwell sm_120 (RTX 5090) — no PTX JIT
-# GGML_CUDA_FA_ALL_QUANTS=ON compile Flash Attention for every quant
-# variant. TurboQuant silently corrupts
-# without FA, so this + the shell-wrapper
-# safety rail must both be in place.
-# GGML_NATIVE=OFF don't tune to host CPU; we're CUDA-bound.
-# LLAMA_BUILD_TESTS=OFF skip fork test suite; we test at the
-# llama-server arg + runtime level.
-# (LLAMA_CURL is deprecated in current llama.cpp — curl is enabled by default.)
-RUN cmake -S . -B build -G Ninja \
- -DCMAKE_BUILD_TYPE=Release \
- -DGGML_CUDA=ON \
- -DCMAKE_CUDA_ARCHITECTURES="${CMAKE_CUDA_ARCHITECTURES}" \
- -DGGML_CUDA_FA_ALL_QUANTS=ON \
- -DGGML_NATIVE=OFF \
- -DLLAMA_BUILD_TESTS=OFF \
- -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs" \
- -DCMAKE_SHARED_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs" \
- && cmake --build build -j --target llama-server llama-perplexity
-
-# --- Runtime stage ------------------------------------------------------------
-FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-ARG TURBOQUANT_SHA=0a8062ded5a4a3e050102874bf21a3e5303a95cf
-LABEL org.opencontainers.image.source="https://github.com/AmesianX/TurboQuant"
-LABEL org.opencontainers.image.revision="${TURBOQUANT_SHA}"
-LABEL org.opencontainers.image.description="llama.cpp server w/ TurboQuant KV-cache (turbo2/turbo3) for Blackwell sm_120"
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
- ca-certificates \
- curl \
- libgomp1 \
- && rm -rf /var/lib/apt/lists/*
-
-WORKDIR /app
-
-# Copy the built binaries plus every ggml / llama shared lib they dlopen.
-# COPY with a trailing slash pulls the whole build/bin directory — simpler
-# than enumerating every libggml-*.so variant, and works whether the fork
-# builds them as unversioned or versioned (libfoo.so.N) symlinks.
-COPY --from=build /src/build/bin/ /app/
-
-ENV LD_LIBRARY_PATH=/app:${LD_LIBRARY_PATH}
-
-EXPOSE 8080
-
-# Intentionally empty — the compose service mounts
-# scripts/llamacpp/run-llama-server.sh as the entrypoint so the stack's existing
-# env-driven arg assembly (with the turbo* -> --flash-attn on safety rail) stays
-# authoritative. See docker-compose.yml for the `entrypoint:` override.
-ENTRYPOINT []
-CMD ["/app/llama-server", "--host", "0.0.0.0", "--port", "8080"]