From 8a953840749de65e0f23e8c6d82609fbfffd9936 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 09:22:20 +0000 Subject: [PATCH 1/2] CUDA CI: cache nvcc via sccache and always build the full arch set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wrap nvcc with sccache (CMAKE_CUDA_COMPILER_LAUNCHER) for CUDA builds so the per-arch .cu device passes — the dominant cost of the ~70 min CUDA job — cache over Depot alongside the gcc host TUs, not just the C/C++ TUs. With the kernels cached, drop the single-arch validation shortcut: CI no longer sets CUDA_FAST_BUILD/CUDA_ARCH, so every run builds the full CMAKE_CUDA_ARCHITECTURES set (release-safe on PR/push as well as publish) and relies on the warm cache for speed. - build.sh: add -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache, scoped to CUDA builds (GGML_CUDA in the cmake args), behind the existing probe. Broaden the mid-build retry trigger with "Compiler not supported" so an nvcc-hostile sccache falls back to an uncached green build instead of redding it. - publish.yml: remove CUDA_FAST_BUILD/CUDA_ARCH from the CUDA job and their DOCKCROSS_ARGS passthroughs; full arch every run. - CLAUDE.md: document nvcc caching + full-arch CI policy; CUDA_FAST_BUILD stays a local-dev-only knob. Warm-run verification of nvcc cache hits still pending. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01PJGUpbfRCjbRcovCTq5v4u --- .github/build.sh | 20 ++++++++++++-- .github/workflows/publish.yml | 29 +++++++++----------- CLAUDE.md | 50 ++++++++++++++++++++--------------- 3 files changed, 58 insertions(+), 41 deletions(-) diff --git a/.github/build.sh b/.github/build.sh index 6a3fee11..480e2009 100755 --- a/.github/build.sh +++ b/.github/build.sh @@ -98,6 +98,19 @@ if [ "${USE_CACHE:-true}" = "true" ] && command -v sccache >/dev/null 2>&1 \ && [ -n "${SCCACHE_WEBDAV_TOKEN:-}${SCCACHE_GHA_ENABLED:-}" ] \ && sccache_can_wrap_compiler; then LAUNCH="-DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache" + # CUDA builds: also wrap nvcc so the per-arch .cu device passes are cached too — not just + # the gcc host TUs. Those per-architecture device-pass objects are the dominant cost of the + # full-arch CUDA job, and sccache does support nvcc as a compiler. Scoped to CUDA builds + # (GGML_CUDA in the cmake args): CMAKE_CUDA_COMPILER_LAUNCHER is inert when CUDA is not an + # enabled language, but keeping it scoped leaves the CPU/Android jobs' configure output clean. + # If sccache cannot wrap nvcc it runs it directly (uncached); and the mid-build retry below + # also catches an sccache "Compiler not supported" failure and rebuilds without the launcher, + # so an nvcc-hostile sccache can never red the build. 
+ case " $* " in + *" -DGGML_CUDA=1 "* | *" -DGGML_CUDA=ON "* | *" -DGGML_CUDA=on "*) + LAUNCH="$LAUNCH -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache" + echo "build.sh: sccache will also wrap nvcc (CUDA build detected)" ;; + esac echo "build.sh: sccache ON (endpoint=${SCCACHE_WEBDAV_ENDPOINT:-default}), building with -j${JOBS}" else echo "build.sh: sccache OFF, building with -j${JOBS}" @@ -113,12 +126,15 @@ cmake -Bbuild $LAUNCH $@ || exit 1 # check, so recover by retrying the build once WITHOUT the launcher: a from-scratch uncached -O3 # build is content-identical and release-safe, so the cache can never red the build. The retry is # gated on the failure output actually showing an sccache cache error, so a genuine compile error -# still fails fast (and is reported) instead of triggering a wasteful uncached rebuild. +# still fails fast (and is reported) instead of triggering a wasteful uncached rebuild. The +# "Compiler not supported" signature additionally covers the CUDA case: if wrapping nvcc breaks +# (sccache declining/erroring on the nvcc driver), the retry rebuilds the full-arch CUDA job +# without any launcher rather than redding it. build_log="$(mktemp 2>/dev/null || echo "/tmp/jllama-build.$$.log")" cmake --build build --config Release -j"${JOBS}" 2>&1 | tee "$build_log" build_rc=${PIPESTATUS[0]} if [ "$build_rc" -ne 0 ]; then - if [ -n "$LAUNCH" ] && grep -qiE 'sccache: error|Server startup failed|cache storage failed' "$build_log"; then + if [ -n "$LAUNCH" ] && grep -qiE 'sccache: error|Server startup failed|cache storage failed|Compiler not supported' "$build_log"; then echo "build.sh: build failed via an sccache cache error — retrying WITHOUT cache (clean reconfigure)." 
rm -f "$build_log" rm -rf build && mkdir -p build diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index e25a19e8..72b94786 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -171,12 +171,17 @@ jobs: name: Cross-Compile manylinux_2_28 x86_64 (CUDA) needs: [startgate, build-webui] runs-on: ubuntu-latest - # Phase 2 dockcross cache rollout — job 2, enabled after manylinux2014 (job 1) verified green - # in CI with sccache v0.16.0 caching to Depot. build_cuda_linux.sh execs build.sh, so the same - # probe guards this job: only the gcc C/C++ TUs cache (the nvcc .cu kernels are not wrapped), - # still a large partial win on this ~70 min build. Diagnostics are on for its first run on the - # manylinux_2_28 image; drop them (and their -e passthroughs) once it is confirmed green with a - # cache hit, then enable the next job. Inert without DEPOT_TOKEN (fork PRs) or use_cache=false. + # CUDA cache rollout. build_cuda_linux.sh execs build.sh, so the same sccache probe guards + # this job. Unlike the other jobs, build.sh now also wraps nvcc for CUDA builds + # (CMAKE_CUDA_COMPILER_LAUNCHER=sccache), so the per-arch .cu device passes — the dominant cost of this job + # — are cached too, not just the gcc host TUs. Because nvcc kernels now cache, this job always + # builds the FULL CMAKE_CUDA_ARCHITECTURES set (no single-arch validation shortcut): the warm + # cache, not a reduced arch set, is what keeps it fast, and every artifact stays release-safe + # (runs on every GPU generation) on PR/push as well as publish. The first (cold-cache) run still + # pays the full nvcc cost; the win shows on subsequent warm runs. CUDA_FAST_BUILD still exists in + # build_cuda_linux.sh as a LOCAL-dev knob, but CI no longer sets it. Diagnostics (SCCACHE_LOG / + # SCCACHE_ERROR_LOG / RUST_BACKTRACE) stay on until a warm run confirms nvcc cache hits; drop them + # (and their -e passthroughs) afterwards. 
Inert without DEPOT_TOKEN (fork PRs) or use_cache=false. env: USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }} SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev @@ -184,17 +189,7 @@ jobs: SCCACHE_LOG: debug SCCACHE_ERROR_LOG: /tmp/sccache_server.log RUST_BACKTRACE: full - # CUDA arch policy: FAST single-arch build for validation runs (PR / push / non-publish - # dispatch) to cut nvcc time; FULL arch set only when actually publishing to Central - # (publish_to_central=true) so the distributed jar runs on every GPU generation. The - # publish-snapshot/publish-release jobs require publish_to_central, so any artifact that - # reaches Central is always built with the full set. CI has no GPU, so the fast path pins a - # fixed CUDA_ARCH ('native' would fail at configure). '0' => full (release-safe), '1' => fast. - CUDA_FAST_BUILD: ${{ inputs.publish_to_central && '0' || '1' }} - # Newest CUDA 13.2 architecture: sm_120 (consumer Blackwell / RTX 50xx). Only used on the - # fast validation path; bump as newer GPU generations ship. Releases ignore it (full set). 
- CUDA_ARCH: '120' - DOCKCROSS_ARGS: "-e SCCACHE_WEBDAV_ENDPOINT -e SCCACHE_WEBDAV_TOKEN -e USE_CACHE -e SCCACHE_LOG -e SCCACHE_ERROR_LOG -e RUST_BACKTRACE -e CUDA_FAST_BUILD -e CUDA_ARCH" + DOCKCROSS_ARGS: "-e SCCACHE_WEBDAV_ENDPOINT -e SCCACHE_WEBDAV_TOKEN -e USE_CACHE -e SCCACHE_LOG -e SCCACHE_ERROR_LOG -e RUST_BACKTRACE" steps: - uses: actions/checkout@v7 - name: Download shared WebUI assets diff --git a/CLAUDE.md b/CLAUDE.md index 92ac3ef9..f489c0b8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -41,15 +41,18 @@ git commit -m "Upgrade CUDA from 13.2 to 13.3" ### Fast local CUDA builds (`CUDA_FAST_BUILD`) — single-arch speed knob The CUDA artifact must ship kernels for **every supported GPU generation**, so the default -build — and every CI/release build — compiles the **full `CMAKE_CUDA_ARCHITECTURES` set** that +build — and every CI build — compiles the **full `CMAKE_CUDA_ARCHITECTURES` set** that ggml/llama.cpp selects. nvcc recompiles each `.cu` kernel once per architecture, which is the -dominant cost of the ~70 min CUDA job. **`sccache` does not help here:** it caches the gcc -C/C++ TUs but not the nvcc `.cu` kernels (sccache's nvcc support is limited/experimental), so -the per-arch nvcc passes remain even with the cache on. The one reliable lever to cut that time -is to build **fewer architectures**. +dominant cost of the ~70 min CUDA job. **`sccache` now wraps nvcc too:** `build.sh` adds +`-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache` for CUDA builds (it detects `-DGGML_CUDA=1`/`ON`/`on` in the cmake +args), so the per-arch `.cu` device passes are cached over Depot alongside the gcc C/C++ TUs. +Because the kernels are content-addressed and llama.cpp is pinned, a **warm** cache recompiles +only what changed — so CI keeps the **full arch set on every run** (release-safe everywhere) +and relies on the cache, not a reduced arch set, for speed. The first (cold-cache) run still +pays the full nvcc cost; the win shows on subsequent warm runs. 
-`build_cuda_linux.sh` therefore honors an **opt-in** env knob — default **off** (full arch set, -release-safe): +`CUDA_FAST_BUILD` remains as a **local-dev** single-arch knob (CI no longer sets it). +`build_cuda_linux.sh` honors it — default **off** (full arch set, release-safe): ```bash # Full release build (default): all archs — slow, runs on every GPU generation. @@ -65,17 +68,16 @@ CUDA_FAST_BUILD=1 CUDA_ARCH=90 .github/build_cuda_linux.sh "-DOS_NAME=Linux -DOS **Default + CI policy (release-safety is the invariant).** An artifact built with `CUDA_FAST_BUILD` runs on only the single GPU generation it was compiled for, so the **distributed jar must always be the full arch set**. The script default is **off** (full) so any *local/manual* build is -release-safe. In CI (`publish.yml`, the `crosscompile-linux-x86_64-cuda` job) the flag is **on for -validation runs** (PR / push / non-publish dispatch) to cut nvcc time, and **off only when actually -publishing to Central** — it is wired as `CUDA_FAST_BUILD: ${{ inputs.publish_to_central && '0' || '1' }}` -(`'0'`=full, `'1'`=fast). Because the `publish-snapshot`/`publish-release` jobs require -`publish_to_central`, **every artifact that reaches Central is built with the full arch set** while -ordinary PR/push CI stays fast. CI has no GPU, so the fast path pins a fixed `CUDA_ARCH` (default -`120` — the newest CUDA 13.2 arch, sm_120 / consumer Blackwell — in the job env) — `native` -would fail at configure. Both `CUDA_FAST_BUILD` and `CUDA_ARCH` are -forwarded into the dockcross container via `DOCKCROSS_ARGS` `-e`. To cache the nvcc kernels too you -would add `-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache` (gated behind the same probe), but sccache's nvcc -caching is unreliable — the arch knob is the better lever and is what this repo ships. 
+release-safe, and **CI no longer sets `CUDA_FAST_BUILD` at all** — the `crosscompile-linux-x86_64-cuda` +job always builds the full set on PR / push / dispatch / publish, so every artifact (not just the ones +that reach Central) runs on every GPU generation. Once the cache is warm, the full-arch CI cost is absorbed by the +sccache-over-Depot cache, which now wraps nvcc (`-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache`, added by +`build.sh` for CUDA builds, gated behind the same probe). The launcher is safe to enable +unconditionally: if sccache cannot wrap nvcc it runs it directly (uncached), and `build.sh`'s +mid-build retry treats an sccache `Compiler not supported` failure like any other cache error and +rebuilds the job without the launcher rather than redding it. **Verify it works:** the premise +(sccache producing nvcc cache hits inside the manylinux_2_28 container) is proven only by a **warm** +run — check that `sccache --show-stats` shows CUDA hits on the second build before trusting the speedup. ## Android minimum API level @@ -321,10 +323,14 @@ v0.16.0 + the probe this is no longer a risk.) Job-by-job status: **v0.16.0** probe passed in-container (devtoolset-10 gcc), `sccache ON` over Depot WebDAV, warm cache 277/278 hits (99.64%), 1m46s build time. 2. `crosscompile-linux-x86_64-cuda` (via `build_cuda_linux.sh`, which execs `build.sh`) — - 🚧 **first run in progress** (diagnostics on). Only the gcc C/C++ TUs cache (134 model files - + ggml + httplib); the nvcc `.cu` kernels won't (limited sccache nvcc support) — still a - large partial win on the ~70 min full-arch job; the fast single-arch (sm_120) validation path - cuts nvcc time independently of sccache. + 🚧 **nvcc caching enabled, full-arch always** (diagnostics on). `build.sh` now also wraps nvcc + (`-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache`, scoped to CUDA builds), so both the gcc C/C++ TUs + (134 model files + ggml + httplib) **and** the per-arch `.cu` device passes cache over Depot. 
+ CI dropped the single-arch validation shortcut (`CUDA_FAST_BUILD`/`CUDA_ARCH` removed from the + job) — every run builds the full arch set and leans on the warm cache for speed. **Unverified + until a warm run:** confirm `sccache --show-stats` reports CUDA hits on the second build; if + nvcc caching proves weak in this container, the cold-vs-warm delta will be small and the job + stays ~70 min (the mid-build retry guards against an nvcc-hostile sccache redding the build). 3. `crosscompile-linux-aarch64` — ✅ **enabled**, now a **native `ubuntu-24.04-arm` build** (not dockcross): `build.sh` self-fetches the aarch64 static-musl sccache (the fetch block in `build.sh` maps `uname -m` → `x86_64`/`aarch64`) and the probe guards it. See "Linux aarch64: From c91d6f245a3ec323f5b06b7d8b6a0f29a98811e0 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 09:38:00 +0000 Subject: [PATCH 2/2] Fix SpotBugs WEM_WEAK_EXCEPTION_MESSAGING in InferenceParameters withCacheReuse(int) and withSlotId(int) threw IllegalArgumentException with a static message string, which SpotBugs flags as WEM_WEAK_EXCEPTION_MESSAGING and fails the Build-and-analyze job. Include the offending value in each message so the exception carries dynamic context. Pre-existing on the base branch; surfaced on this PR's CI. The SessionTest assertion uses containsString and still matches. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01PJGUpbfRCjbRcovCTq5v4u --- .../net/ladenthin/llama/parameters/InferenceParameters.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java b/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java index dd2b8fe4..647d9425 100644 --- a/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java +++ b/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java @@ -216,7 +216,7 @@ public InferenceParameters withCachePrompt(boolean cachePrompt) { */ public InferenceParameters withCacheReuse(int cacheReuse) { if (cacheReuse < 0) { - throw new IllegalArgumentException("cacheReuse must be non-negative"); + throw new IllegalArgumentException("cacheReuse must be non-negative but was " + cacheReuse); } return withScalar(PARAM_CACHE_REUSE, cacheReuse); } @@ -231,7 +231,7 @@ public InferenceParameters withCacheReuse(int cacheReuse) { */ public InferenceParameters withSlotId(int slotId) { if (slotId < 0) { - throw new IllegalArgumentException("slotId must be non-negative"); + throw new IllegalArgumentException("slotId must be non-negative but was " + slotId); } return withScalar(PARAM_SLOT_ID, slotId); }