diff --git a/.env.example b/.env.example index 53eeaad..f5ca3e2 100644 --- a/.env.example +++ b/.env.example @@ -48,27 +48,19 @@ OPEN_WEBUI_DEFAULT_MODEL=local-chat # LLAMACPP_FLASH_ATTN=auto # Feature flag: when set to 1, llama.cpp starts with quantized KV cache flags. # LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION=0 -# KV cache types used when quantization is enabled. Valid values depend on the -# llama.cpp build. With the default repo-built image (llamacpp/Dockerfile, -# AmesianX TurboQuant fork) the TurboQuant types are available alongside the -# mainline ones. Pick one of: -# tbq3_0 — TurboQuant 3-bit + Walsh-Hadamard rotation. Near-neutral quality. -# tbq4_0 — TurboQuant 4-bit + WHT. Closest to fp16 quality of the tbq set. -# tbqp3_0 — TurboQuant_prod (2-bit Lloyd-Max + 1-bit QJL). Max compression, -# marginal quality dip — the paper's "near-2.5 bpw" variant. -# Use this to fit 128k context + 19 GB weights in 32 GB VRAM. -# tbqp4_0 — TurboQuant_prod (3-bit Lloyd-Max + 1-bit QJL). Safer than tbqp3_0. -# tbq3_1 / tbq4_1 / tbqp3_1 / tbqp4_1 — head_dim=128 optimized variants. -# tbq3_2 / tbq4_2 / tbqp3_2 / tbqp4_2 — head_dim=64 optimized variants. -# q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 — mainline types, always available. -# Any tbq* / tbqp* type requires Flash Attention; the entrypoint forces -# --flash-attn on when either K or V is a tbq* type (see -# scripts/llamacpp/run-llama-server.sh). -# LLAMACPP_KV_CACHE_TYPE_K=q4_0 -# LLAMACPP_KV_CACHE_TYPE_V=q4_0 -# Image selection. Default is the repo-built TurboQuant build. Set to -# ghcr.io/ggml-org/llama.cpp:server-cuda to fall back to upstream (loses turbo*). -# LLAMACPP_IMAGE=ordo-ai-stack/llamacpp-turboquant:latest +# KV cache types used when quantization is enabled (mainline llama.cpp types): +# q8_0 — 8-bit, highest quality of the quantized set. Repo default for the +# hybrid A3B (only ~11/41 layers are full-attention, so KV stays ~3 GB +# at 512k ctx even at q8_0). 
+# q4_0, q4_1, q5_0, q5_1, iq4_nl — smaller / more aggressive. +# Quantized KV works best with Flash Attention; set LLAMACPP_FLASH_ATTN=on (or +# leave it auto) when enabling it. +# LLAMACPP_KV_CACHE_TYPE_K=q8_0 +# LLAMACPP_KV_CACHE_TYPE_V=q8_0 +# Image selection. llama.cpp is pinned by DIGEST in docker-compose.yml +# (ggml-org mainline build 9765 / 73618f27a — loads Qwen3.6 MTP + qwen35moe +# natively). Override here ONLY to test a different build, e.g.: +# LLAMACPP_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda # Hard ceiling on tokens per single request. Defense-in-depth against # runaway-reasoning loops where --reasoning-budget fails to close the # <think> block (model never emits </think> in a confused large-context @@ -78,9 +70,9 @@ OPEN_WEBUI_DEFAULT_MODEL=local-chat # Cap on tokens the model is allowed to spend inside <think>...</think> per # response. Hoisted out of LLAMACPP_EXTRA_ARGS so it can be monitored and # tuned from one place. Llama.cpp's grammar engine force-closes the block -# when this is hit. Note: in the current b1-0a8062d build this isn't fully -# reliable on Qwen3-family models, which is why N_PREDICT above exists as a -# hard backstop. +# when this is hit. Note: this isn't fully reliable on Qwen3-family models +# (the model may not emit a clean </think>), which is why N_PREDICT above +# exists as a hard backstop. # LLAMACPP_REASONING_BUDGET=32768 # Optional raw llama-server args appended last. Useful for one-off experiments without editing compose. # Note: if you previously had `--reasoning-budget N` in here, remove it — diff --git a/docker-compose.yml b/docker-compose.yml index aa9a10c..3611690 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -40,13 +40,12 @@ services: - backend llamacpp: - # Default points at the repo-built TurboQuant image (llamacpp/Dockerfile). - # Escape hatch: set LLAMACPP_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda in - # .env to fall back to upstream (loses turbo2 / turbo3 KV types). 
- image: ${LLAMACPP_IMAGE:-ordo-ai-stack/llamacpp-turboquant:latest} - build: - context: ./llamacpp - dockerfile: Dockerfile + # Pinned mainline llama.cpp (ggml-org) by DIGEST for reproducibility — + # build 9765 (73618f27a), the first build that loads Qwen3.6 MTP + qwen35moe + # GGUFs natively (upstream PR #22673). This is the single source of truth for + # the image; bump the digest deliberately (stack_monitor tracks the build). + # Override LLAMACPP_IMAGE in .env only to test a different build. + image: ${LLAMACPP_IMAGE:-ghcr.io/ggml-org/llama.cpp@sha256:44cd08334f85c1fcb363e3798302f2986cdb78cdfc8c1cf56bdad44041595a32} restart: unless-stopped platform: linux/amd64 entrypoint: ["/bin/sh", "/llamacpp-scripts/run-llama-server.sh"] @@ -97,8 +96,9 @@ services: - backend # Use CUDA image so linux/amd64 is available (plain :server manifest can resolve to arm64 on Docker Desktop). + # Pinned by digest to the same mainline build as llamacpp (build 9765 / 73618f27a). llamacpp-embed: - image: ${LLAMACPP_EMBED_IMAGE:-ghcr.io/ggml-org/llama.cpp:server-cuda} + image: ${LLAMACPP_EMBED_IMAGE:-ghcr.io/ggml-org/llama.cpp@sha256:44cd08334f85c1fcb363e3798302f2986cdb78cdfc8c1cf56bdad44041595a32} restart: unless-stopped platform: linux/amd64 # The upstream :server-cuda is a rolling tag that has flipped its diff --git a/llamacpp/Dockerfile b/llamacpp/Dockerfile deleted file mode 100644 index fd7d148..0000000 --- a/llamacpp/Dockerfile +++ /dev/null @@ -1,119 +0,0 @@ -# syntax=docker/dockerfile:1.6 -# Custom llama.cpp server built from the AmesianX TurboQuant fork. -# -# Ships turbo2 (2.5 bpw) and turbo3 (3.5 bpw) KV-cache quantization types -# on top of mainline llama.cpp. See docs/configuration.md for the operator -# reference and the plan note at the bottom of this file for rationale. 
-# -# Pinned commit: AmesianX/TurboQuant @ 0a8062ded5a4a3e050102874bf21a3e5303a95cf -# (HEAD of main as of 2026-04-22) -# To bump: `git ls-remote https://github.com/AmesianX/TurboQuant.git HEAD`, -# update TURBOQUANT_SHA below, rebuild. Re-run the perplexity sanity check -# after any bump — the fork can rebase onto mainline without warning. -# -# Target GPU: RTX 5090 (Blackwell, compute capability 12.0 = sm_120). -# Building for sm_120 explicitly avoids a PTX JIT stall on the first call. - -# CUDA 12.8+ is required for Blackwell sm_120 (RTX 50-series consumer) — -# earlier versions' nvcc rejects `-gencode=arch=compute_120` with -# "Unsupported gpu architecture 'compute_120'". -ARG CUDA_VERSION=12.8.1 -ARG UBUNTU_VERSION=22.04 - -# --- Build stage -------------------------------------------------------------- -FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS build - -ARG TURBOQUANT_REPO=https://github.com/AmesianX/TurboQuant.git -ARG TURBOQUANT_SHA=0a8062ded5a4a3e050102874bf21a3e5303a95cf -ARG CMAKE_CUDA_ARCHITECTURES=120 - -ENV DEBIAN_FRONTEND=noninteractive - -# CUDA driver API stubs — libcuda.so.1 is provided by the NVIDIA driver at -# runtime (via --gpus all + nvidia-container-runtime), but at build time the -# linker needs a stub to satisfy cuMemCreate / cuMemMap / etc references -# emitted by ggml-cuda. -# -# The toolkit ships /usr/local/cuda/lib64/stubs/libcuda.so (unversioned), but -# libggml-cuda.so is built with SONAME dependency on libcuda.so.1 — so when -# linking the executable we need both LIBRARY_PATH AND a versioned symlink. -# The symlink is created below in the apt-install step; LIBRARY_PATH lets gcc -# pass the stubs dir to ld via its generated -L flags. 
-ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs - -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ - cmake \ - git \ - libcurl4-openssl-dev \ - ninja-build \ - pkg-config \ - && rm -rf /var/lib/apt/lists/* \ - && ln -sf libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 - -WORKDIR /src -# Fetch only the specific commit tree (no history, no submodules — the fork's -# .gitmodules is empty as of the pinned SHA). Avoids the `--filter=blob:none` -# lazy-fetch path which stalled during builds in this environment. -RUN git init . \ - && git remote add origin "${TURBOQUANT_REPO}" \ - && git -c protocol.version=2 fetch --depth 1 origin "${TURBOQUANT_SHA}" \ - && git checkout --detach FETCH_HEAD - -# Flags chosen for this deployment: -# GGML_CUDA=ON enable CUDA backend -# CMAKE_CUDA_ARCHITECTURES=120 Blackwell sm_120 (RTX 5090) — no PTX JIT -# GGML_CUDA_FA_ALL_QUANTS=ON compile Flash Attention for every quant -# variant. TurboQuant silently corrupts -# without FA, so this + the shell-wrapper -# safety rail must both be in place. -# GGML_NATIVE=OFF don't tune to host CPU; we're CUDA-bound. -# LLAMA_BUILD_TESTS=OFF skip fork test suite; we test at the -# llama-server arg + runtime level. -# (LLAMA_CURL is deprecated in current llama.cpp — curl is enabled by default.) -RUN cmake -S . 
-B build -G Ninja \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_CUDA=ON \ - -DCMAKE_CUDA_ARCHITECTURES="${CMAKE_CUDA_ARCHITECTURES}" \ - -DGGML_CUDA_FA_ALL_QUANTS=ON \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs" \ - -DCMAKE_SHARED_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs" \ - && cmake --build build -j --target llama-server llama-perplexity - -# --- Runtime stage ------------------------------------------------------------ -FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} - -ARG TURBOQUANT_SHA=0a8062ded5a4a3e050102874bf21a3e5303a95cf -LABEL org.opencontainers.image.source="https://github.com/AmesianX/TurboQuant" -LABEL org.opencontainers.image.revision="${TURBOQUANT_SHA}" -LABEL org.opencontainers.image.description="llama.cpp server w/ TurboQuant KV-cache (turbo2/turbo3) for Blackwell sm_120" - -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update && apt-get install -y --no-install-recommends \ - ca-certificates \ - curl \ - libgomp1 \ - && rm -rf /var/lib/apt/lists/* - -WORKDIR /app - -# Copy the built binaries plus every ggml / llama shared lib they dlopen. -# COPY with a trailing slash pulls the whole build/bin directory — simpler -# than enumerating every libggml-*.so variant, and works whether the fork -# builds them as unversioned or versioned (libfoo.so.N) symlinks. -COPY --from=build /src/build/bin/ /app/ - -ENV LD_LIBRARY_PATH=/app:${LD_LIBRARY_PATH} - -EXPOSE 8080 - -# Intentionally empty — the compose service mounts -# scripts/llamacpp/run-llama-server.sh as the entrypoint so the stack's existing -# env-driven arg assembly (with the turbo* -> --flash-attn on safety rail) stays -# authoritative. See docker-compose.yml for the `entrypoint:` override. -ENTRYPOINT [] -CMD ["/app/llama-server", "--host", "0.0.0.0", "--port", "8080"]