bernardladenthin · bernardladenthin · Jun 21, 2026 · Jun 21, 2026 · Jun 21, 2026
@@ -98,6 +98,19 @@ if [ "${USE_CACHE:-true}" = "true" ] && command -v sccache >/dev/null 2>&1 \
    && [ -n "${SCCACHE_WEBDAV_TOKEN:-}${SCCACHE_GHA_ENABLED:-}" ] \
    && sccache_can_wrap_compiler; then
   LAUNCH="-DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache"
+  # CUDA builds: also wrap nvcc so the per-arch .cu device passes are cached too — not just
+  # the gcc host TUs. Those per-architecture device-pass objects are the dominant cost of the
+  # full-arch CUDA job, and sccache does support nvcc as a compiler. Scoped to CUDA builds
+  # (GGML_CUDA in the cmake args): CMAKE_CUDA_COMPILER_LAUNCHER is inert when CUDA is not an
+  # enabled language, but keeping it scoped leaves the CPU/Android jobs' configure output clean.
+  # If sccache cannot wrap nvcc it runs it directly (uncached), and the mid-build retry below
+  # also catches an sccache "Compiler not supported" failure and rebuilds without the launcher,
+  # so an nvcc-hostile sccache can never fail the build.
+  case " $* " in
+    *" -DGGML_CUDA=1 "* | *" -DGGML_CUDA=ON "* | *" -DGGML_CUDA=on "*)
+      LAUNCH="$LAUNCH -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache"
+      echo "build.sh: sccache will also wrap nvcc (CUDA build detected)" ;;
+  esac
   echo "build.sh: sccache ON (endpoint=${SCCACHE_WEBDAV_ENDPOINT:-default}), building with -j${JOBS}"
 else
   echo "build.sh: sccache OFF, building with -j${JOBS}"
@@ -113,12 +126,15 @@ cmake -Bbuild $LAUNCH $@ || exit 1
 # check, so recover by retrying the build once WITHOUT the launcher: a from-scratch uncached -O3
 # build is content-identical and release-safe, so the cache can never red the build. The retry is
 # gated on the failure output actually showing an sccache cache error, so a genuine compile error
-# still fails fast (and is reported) instead of triggering a wasteful uncached rebuild.
+# still fails fast (and is reported) instead of triggering a wasteful uncached rebuild. The
+# "Compiler not supported" signature additionally covers the CUDA case: if wrapping nvcc breaks
+# (sccache declining/erroring on the nvcc driver), the retry rebuilds the full-arch CUDA job
+# without any launcher rather than failing it.
 build_log="$(mktemp 2>/dev/null || echo "/tmp/jllama-build.$$.log")"
 cmake --build build --config Release -j"${JOBS}" 2>&1 | tee "$build_log"
 build_rc=${PIPESTATUS[0]}
 if [ "$build_rc" -ne 0 ]; then
-  if [ -n "$LAUNCH" ] && grep -qiE 'sccache: error|Server startup failed|cache storage failed' "$build_log"; then
+  if [ -n "$LAUNCH" ] && grep -qiE 'sccache: error|Server startup failed|cache storage failed|Compiler not supported' "$build_log"; then
     echo "build.sh: build failed via an sccache cache error — retrying WITHOUT cache (clean reconfigure)."
     rm -f "$build_log"
     rm -rf build && mkdir -p build

@@ -171,30 +171,25 @@ jobs:
     name: Cross-Compile manylinux_2_28 x86_64 (CUDA)
     needs: [startgate, build-webui]
     runs-on: ubuntu-latest
-    # Phase 2 dockcross cache rollout — job 2, enabled after manylinux2014 (job 1) verified green
-    # in CI with sccache v0.16.0 caching to Depot. build_cuda_linux.sh execs build.sh, so the same
-    # probe guards this job: only the gcc C/C++ TUs cache (the nvcc .cu kernels are not wrapped),
-    # still a large partial win on this ~70 min build. Diagnostics are on for its first run on the
-    # manylinux_2_28 image; drop them (and their -e passthroughs) once it is confirmed green with a
-    # cache hit, then enable the next job. Inert without DEPOT_TOKEN (fork PRs) or use_cache=false.
+    # CUDA cache rollout. build_cuda_linux.sh execs build.sh, so the same sccache probe guards
+    # this job. Unlike the other jobs, build.sh now also wraps nvcc for CUDA builds by adding
+    # CMAKE_CUDA_COMPILER_LAUNCHER=sccache, so the per-arch .cu device passes — this job's dominant
+    # cost — are cached too, not just the gcc host TUs. Because nvcc kernels now cache, this job always
+    # builds the FULL CMAKE_CUDA_ARCHITECTURES set (no single-arch validation shortcut): the warm
+    # cache, not a reduced arch set, is what keeps it fast, and every artifact stays release-safe
+    # (runs on every GPU generation) on PR/push as well as publish. The first (cold-cache) run still
+    # pays the full nvcc cost; the win shows on subsequent warm runs. CUDA_FAST_BUILD still exists in
+    # build_cuda_linux.sh as a LOCAL-dev knob, but CI no longer sets it. Diagnostics (SCCACHE_LOG /
+    # SCCACHE_ERROR_LOG / RUST_BACKTRACE) stay on until a warm run confirms nvcc cache hits; drop them
+    # (and their -e passthroughs) afterwards. Inert without DEPOT_TOKEN (fork PRs) or use_cache=false.
     env:
       USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }}
       SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev
       SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}
       SCCACHE_LOG: debug
       SCCACHE_ERROR_LOG: /tmp/sccache_server.log
       RUST_BACKTRACE: full
-      # CUDA arch policy: FAST single-arch build for validation runs (PR / push / non-publish
-      # dispatch) to cut nvcc time; FULL arch set only when actually publishing to Central
-      # (publish_to_central=true) so the distributed jar runs on every GPU generation. The
-      # publish-snapshot/publish-release jobs require publish_to_central, so any artifact that
-      # reaches Central is always built with the full set. CI has no GPU, so the fast path pins a
-      # fixed CUDA_ARCH ('native' would fail at configure). '0' => full (release-safe), '1' => fast.
-      CUDA_FAST_BUILD: ${{ inputs.publish_to_central && '0' || '1' }}
-      # Newest CUDA 13.2 architecture: sm_120 (consumer Blackwell / RTX 50xx). Only used on the
-      # fast validation path; bump as newer GPU generations ship. Releases ignore it (full set).
-      CUDA_ARCH: '120'
-      DOCKCROSS_ARGS: "-e SCCACHE_WEBDAV_ENDPOINT -e SCCACHE_WEBDAV_TOKEN -e USE_CACHE -e SCCACHE_LOG -e SCCACHE_ERROR_LOG -e RUST_BACKTRACE -e CUDA_FAST_BUILD -e CUDA_ARCH"
+      DOCKCROSS_ARGS: "-e SCCACHE_WEBDAV_ENDPOINT -e SCCACHE_WEBDAV_TOKEN -e USE_CACHE -e SCCACHE_LOG -e SCCACHE_ERROR_LOG -e RUST_BACKTRACE"
     steps:
       - uses: actions/checkout@v7
       - name: Download shared WebUI assets

@@ -41,15 +41,18 @@ git commit -m "Upgrade CUDA from 13.2 to 13.3"
 ### Fast local CUDA builds (`CUDA_FAST_BUILD`) — single-arch speed knob
 
 The CUDA artifact must ship kernels for **every supported GPU generation**, so the default
-build — and every CI/release build — compiles the **full `CMAKE_CUDA_ARCHITECTURES` set** that
+build — and every CI build — compiles the **full `CMAKE_CUDA_ARCHITECTURES` set** that
 ggml/llama.cpp selects. nvcc recompiles each `.cu` kernel once per architecture, which is the
-dominant cost of the ~70 min CUDA job. **`sccache` does not help here:** it caches the gcc
-C/C++ TUs but not the nvcc `.cu` kernels (sccache's nvcc support is limited/experimental), so
-the per-arch nvcc passes remain even with the cache on. The one reliable lever to cut that time
-is to build **fewer architectures**.
+dominant cost of the ~70 min CUDA job. **`sccache` now wraps nvcc too:** `build.sh` adds
+`-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache` for CUDA builds (it detects `GGML_CUDA` in the cmake
+args), so the per-arch `.cu` device passes are cached over Depot alongside the gcc C/C++ TUs.
+Because cache entries are content-addressed and llama.cpp is pinned, a **warm**-cache run recompiles
+only what changed — so CI keeps the **full arch set on every run** (release-safe everywhere)
+and relies on the cache, not a reduced arch set, for speed. The first (cold-cache) run still
+pays the full nvcc cost; the win shows on subsequent warm runs.
 
-`build_cuda_linux.sh` therefore honors an **opt-in** env knob — default **off** (full arch set,
-release-safe):
+`CUDA_FAST_BUILD` remains as a **local-dev** single-arch knob (CI no longer sets it).
+`build_cuda_linux.sh` honors it — default **off** (full arch set, release-safe):
 
 ```bash
 # Full release build (default): all archs — slow, runs on every GPU generation.
@@ -65,17 +68,16 @@ CUDA_FAST_BUILD=1 CUDA_ARCH=90 .github/build_cuda_linux.sh "-DOS_NAME=Linux -DOS
 **Default + CI policy (release-safety is the invariant).** An artifact built with `CUDA_FAST_BUILD`
 runs on only the single GPU generation it was compiled for, so the **distributed jar must always be
 the full arch set**. The script default is **off** (full) so any *local/manual* build is
-release-safe. In CI (`publish.yml`, the `crosscompile-linux-x86_64-cuda` job) the flag is **on for
-validation runs** (PR / push / non-publish dispatch) to cut nvcc time, and **off only when actually
-publishing to Central** — it is wired as `CUDA_FAST_BUILD: ${{ inputs.publish_to_central && '0' || '1' }}`
-(`'0'`=full, `'1'`=fast). Because the `publish-snapshot`/`publish-release` jobs require
-`publish_to_central`, **every artifact that reaches Central is built with the full arch set** while
-ordinary PR/push CI stays fast. CI has no GPU, so the fast path pins a fixed `CUDA_ARCH` (default
-`120` — the newest CUDA 13.2 arch, sm_120 / consumer Blackwell — in the job env) — `native`
-would fail at configure. Both `CUDA_FAST_BUILD` and `CUDA_ARCH` are
-forwarded into the dockcross container via `DOCKCROSS_ARGS` `-e`. To cache the nvcc kernels too you
-would add `-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache` (gated behind the same probe), but sccache's nvcc
-caching is unreliable — the arch knob is the better lever and is what this repo ships.
+release-safe, and **CI no longer sets `CUDA_FAST_BUILD` at all** — the `crosscompile-linux-x86_64-cuda`
+job always builds the full set on PR / push / dispatch / publish, so every artifact (not just the ones
+that reach Central) runs on every GPU generation. The full-arch CI cost is absorbed by the
+sccache-over-Depot cache, which now wraps nvcc (`-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache`, added by
+`build.sh` for CUDA builds, gated behind the same probe). The launcher is safe to enable
+unconditionally: if sccache cannot wrap nvcc it runs it directly (uncached), and `build.sh`'s
+mid-build retry treats an sccache `Compiler not supported` failure like any other cache error and
+rebuilds the job without the launcher rather than failing it. **Verify it works:** the premise
+(sccache producing nvcc cache hits inside the manylinux_2_28 container) is proven only by a **warm**
+run — check that `sccache --show-stats` shows CUDA hits on the second build before trusting the speedup.
 
 ## Android minimum API level
 
@@ -321,10 +323,14 @@ v0.16.0 + the probe this is no longer a risk.) Job-by-job status:
    **v0.16.0** probe passed in-container (devtoolset-10 gcc), `sccache ON` over Depot WebDAV,
    warm cache 277/278 hits (99.64%), 1m46s build time.
 2. `crosscompile-linux-x86_64-cuda` (via `build_cuda_linux.sh`, which execs `build.sh`) —
-   🚧 **first run in progress** (diagnostics on). Only the gcc C/C++ TUs cache (134 model files
-   + ggml + httplib); the nvcc `.cu` kernels won't (limited sccache nvcc support) — still a
-   large partial win on the ~70 min full-arch job; the fast single-arch (sm_120) validation path
-   cuts nvcc time independently of sccache.
+   🚧 **nvcc caching enabled, full-arch always** (diagnostics on). `build.sh` now also wraps nvcc
+   (`-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache`, scoped to CUDA builds), so both the gcc C/C++ TUs
+   (134 model files + ggml + httplib) **and** the per-arch `.cu` device passes cache over Depot.
+   CI dropped the single-arch validation shortcut (`CUDA_FAST_BUILD`/`CUDA_ARCH` removed from the
+   job) — every run builds the full arch set and leans on the warm cache for speed. **Unverified
+   until a warm run:** confirm `sccache --show-stats` reports CUDA hits on the second build; if
+   nvcc caching proves weak in this container, the cold-vs-warm delta will be small and the job
+   stays ~70 min (the mid-build retry guards against an nvcc-hostile sccache failing the build).
 3. `crosscompile-linux-aarch64` — ✅ **enabled**, now a **native `ubuntu-24.04-arm` build** (not
    dockcross): `build.sh` self-fetches the aarch64 static-musl sccache (the fetch block in
    `build.sh` maps `uname -m` → `x86_64`/`aarch64`) and the probe guards it. See "Linux aarch64:

@@ -216,7 +216,7 @@ public InferenceParameters withCachePrompt(boolean cachePrompt) {
      */
     public InferenceParameters withCacheReuse(int cacheReuse) {
         if (cacheReuse < 0) {
-            throw new IllegalArgumentException("cacheReuse must be non-negative");
+            throw new IllegalArgumentException("cacheReuse must be non-negative but was " + cacheReuse);
         }
         return withScalar(PARAM_CACHE_REUSE, cacheReuse);
     }
@@ -231,7 +231,7 @@ public InferenceParameters withCacheReuse(int cacheReuse) {
      */
     public InferenceParameters withSlotId(int slotId) {
         if (slotId < 0) {
-            throw new IllegalArgumentException("slotId must be non-negative");
+            throw new IllegalArgumentException("slotId must be non-negative but was " + slotId);
         }
         return withScalar(PARAM_SLOT_ID, slotId);
     }