AlienWalker1995 · AlienWalker1995 · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026
diff --git a/.env.example b/.env.example
@@ -48,27 +48,19 @@ OPEN_WEBUI_DEFAULT_MODEL=local-chat
 # LLAMACPP_FLASH_ATTN=auto
 # Feature flag: when set to 1, llama.cpp starts with quantized KV cache flags.
 # LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION=0
-# KV cache types used when quantization is enabled. Valid values depend on the
-# llama.cpp build. With the default repo-built image (llamacpp/Dockerfile,
-# AmesianX TurboQuant fork) the TurboQuant types are available alongside the
-# mainline ones. Pick one of:
-#   tbq3_0    — TurboQuant 3-bit + Walsh-Hadamard rotation. Near-neutral quality.
-#   tbq4_0    — TurboQuant 4-bit + WHT. Closest to fp16 quality of the tbq set.
-#   tbqp3_0   — TurboQuant_prod (2-bit Lloyd-Max + 1-bit QJL). Max compression,
-#               marginal quality dip — the paper's "near-2.5 bpw" variant.
-#               Use this to fit 128k context + 19 GB weights in 32 GB VRAM.
-#   tbqp4_0   — TurboQuant_prod (3-bit Lloyd-Max + 1-bit QJL). Safer than tbqp3_0.
-#   tbq3_1 / tbq4_1 / tbqp3_1 / tbqp4_1  — head_dim=128 optimized variants.
-#   tbq3_2 / tbq4_2 / tbqp3_2 / tbqp4_2  — head_dim=64 optimized variants.
-#   q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 — mainline types, always available.
-# Any tbq* / tbqp* type requires Flash Attention; the entrypoint forces
-# --flash-attn on when either K or V is a tbq* type (see
-# scripts/llamacpp/run-llama-server.sh).
-# LLAMACPP_KV_CACHE_TYPE_K=q4_0
-# LLAMACPP_KV_CACHE_TYPE_V=q4_0
-# Image selection. Default is the repo-built TurboQuant build. Set to
-# ghcr.io/ggml-org/llama.cpp:server-cuda to fall back to upstream (loses turbo*).
-# LLAMACPP_IMAGE=ordo-ai-stack/llamacpp-turboquant:latest
+# KV cache types used when quantization is enabled (mainline llama.cpp types):
+#   q8_0  — 8-bit, highest quality of the quantized set. Repo default for the
+#           hybrid A3B (only ~11/41 layers are full-attention, so KV stays ~3 GB
+#           at 512k ctx even at q8_0).
+#   q4_0, q4_1, q5_0, q5_1, iq4_nl — smaller / more aggressive.
+# A quantized V cache requires Flash Attention in llama.cpp; keep
+# LLAMACPP_FLASH_ATTN=on (or leave it auto) when enabling quantization.
+# LLAMACPP_KV_CACHE_TYPE_K=q8_0
+# LLAMACPP_KV_CACHE_TYPE_V=q8_0
+# Image selection. llama.cpp is pinned by DIGEST in docker-compose.yml
+# (ggml-org mainline build 9765 / 73618f27a — loads Qwen3.6 MTP + qwen35moe
+# natively). Override here ONLY to test a different build, e.g. the rolling tag:
+# LLAMACPP_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda
 # Hard ceiling on tokens per single request. Defense-in-depth against
 # runaway-reasoning loops where --reasoning-budget fails to close the
 # <think> block (model never emits </think> in a confused large-context
@@ -78,9 +70,9 @@ OPEN_WEBUI_DEFAULT_MODEL=local-chat
 # Cap on tokens the model is allowed to spend inside <think>...</think> per
 # response. Hoisted out of LLAMACPP_EXTRA_ARGS so it can be monitored and
 # tuned from one place. Llama.cpp's grammar engine force-closes the block
-# when this is hit. Note: in the current b1-0a8062d build this isn't fully
-# reliable on Qwen3-family models, which is why N_PREDICT above exists as a
-# hard backstop.
+# when this is hit. Note: this isn't fully reliable on Qwen3-family models
+# (the model may not emit a clean </think>), which is why N_PREDICT above
+# exists as a hard backstop.
 # LLAMACPP_REASONING_BUDGET=32768
 # Optional raw llama-server args appended last. Useful for one-off experiments without editing compose.
 # Note: if you previously had `--reasoning-budget N` in here, remove it —

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -40,13 +40,12 @@ services:
       - backend
 
   llamacpp:
-    # Default points at the repo-built TurboQuant image (llamacpp/Dockerfile).
-    # Escape hatch: set LLAMACPP_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda in
-    # .env to fall back to upstream (loses turbo2 / turbo3 KV types).
-    image: ${LLAMACPP_IMAGE:-ordo-ai-stack/llamacpp-turboquant:latest}
-    build:
-      context: ./llamacpp
-      dockerfile: Dockerfile
+    # Mainline llama.cpp (ggml-org), pinned by DIGEST for reproducibility: build
+    # 9765 (73618f27a), the first to load Qwen3.6 MTP + qwen35moe GGUFs natively
+    # (upstream PR #22673). This file is the source of truth for the image; bump
+    # the digest deliberately, here and in llamacpp-embed (stack_monitor tracks
+    # the build). Override LLAMACPP_IMAGE in .env only to test a different build.
+    image: ${LLAMACPP_IMAGE:-ghcr.io/ggml-org/llama.cpp@sha256:44cd08334f85c1fcb363e3798302f2986cdb78cdfc8c1cf56bdad44041595a32}
     restart: unless-stopped
     platform: linux/amd64
     entrypoint: ["/bin/sh", "/llamacpp-scripts/run-llama-server.sh"]
@@ -97,8 +96,9 @@ services:
       - backend
 
   # Use CUDA image so linux/amd64 is available (plain :server manifest can resolve to arm64 on Docker Desktop).
+  # Pinned by digest to the same mainline build as llamacpp (build 9765 / 73618f27a).
   llamacpp-embed:
-    image: ${LLAMACPP_EMBED_IMAGE:-ghcr.io/ggml-org/llama.cpp:server-cuda}
+    image: ${LLAMACPP_EMBED_IMAGE:-ghcr.io/ggml-org/llama.cpp@sha256:44cd08334f85c1fcb363e3798302f2986cdb78cdfc8c1cf56bdad44041595a32}
     restart: unless-stopped
     platform: linux/amd64
     # The upstream :server-cuda is a rolling tag that has flipped its

diff --git a/llamacpp/Dockerfile b/llamacpp/Dockerfile