bernardladenthin · bernardladenthin · Jun 21, 2026 · Jun 21, 2026
@@ -17,6 +17,9 @@ from version 5.0.0 onward. Pre-fork releases (`1.x`–`4.2.0`) were authored by
 - Real-model tool-calling integration tests for blocking and streaming required tool calls (`ToolCallingIntegrationTest`, Qwen2.5-1.5B-Instruct), wired into CI and `validate-models`.
 - End-to-end vision input across blocking, typed `ChatRequest`, streaming, and OpenAI-compatible request mapping; real-model tests verify that distinct red and blue images produce the correct semantic answers.
 - Explicit `setMmprojAuto(boolean)` and `setMmprojOffload(boolean)` controls, including the upstream `--no-mmproj-auto` and `--no-mmproj-offload` flags.
+- Per-request KV controls: `InferenceParameters.withSlotId(int)` and `withCacheReuse(int)`.
+- Typed cache observability through `Usage.getCachedTokens()`, `Usage.getProcessedPromptTokens()`, `SlotMetrics`, and `ServerMetrics.getSlotMetrics()`.
+- Authenticated JSON `GET /metrics` and `GET /slots` endpoints on the embedded server.
 
 ### Changed
 - Unified `CONTRIBUTING.md` and `SECURITY.md` structure with sibling repositories in the project family.
@@ -30,6 +33,8 @@ from version 5.0.0 onward. Pre-fork releases (`1.x`–`4.2.0`) were authored by
 - Preserved decoded image buffers across the JNI chat boundary and submitted media requests through llama.cpp's upstream multimodal task path instead of silently tokenizing them as text-only prompts.
 - Preserved multipart image content when using the typed `ChatRequest` serializer.
 - The standalone OpenAI-compatible server now advertises vision only when the loaded model confirms usable vision support.
+- `Session` now pins every inference request to its configured slot, so generation and slot save/restore/erase target the same KV state.
+- Cached-token usage is preserved through typed Java responses and through the OpenAI Responses and Anthropic adapters, in both blocking and streaming modes.
 
 ### Added
 - Reasoning-budget tests (Qwen3-0.6B).

@@ -473,6 +473,23 @@ a JSON response, matching the HTTP server's contract:
 Server state is exposed via `getMetrics()`, `eraseSlot(int)`, `saveSlot(int, String)`,
 `restoreSlot(int, String)`, and `getModelMeta()`.
 
+### Prompt and KV Cache Reuse
+
+Prompt-prefix reuse is enabled by default in llama.cpp and can be controlled per request with
+`InferenceParameters.withCachePrompt(boolean)`. `withCacheReuse(int)` enables non-prefix chunk reuse,
+while `withSlotId(int)` pins a request to a specific server slot. `Session` applies its slot id to every
+request, so generation and `save`/`restore` operate on the same KV state.
+
+Typed results expose the logical-prompt, generated, cached-prompt, and evaluated-prompt token counts
+through `Usage`. The per-request cached-token count also remains available through `Timings.getCacheN()`.
+`LlamaModel.getMetricsTyped().getSlotMetrics()` reports each slot's logical, processed, cached,
+decoded, and remaining token counts.
+
+The embedded HTTP server exposes the same native JSON at the authenticated `GET /metrics` endpoint,
+with the slot array alone at `GET /slots`. OpenAI chat-completion responses preserve
+`usage.prompt_tokens_details.cached_tokens`; Responses API output uses
+`usage.input_tokens_details.cached_tokens`; Anthropic output uses `cache_read_input_tokens`.
+
 ### OpenAI-compatible HTTP server
 
 `net.ladenthin.llama.server.OpenAiCompatServer` turns a loaded model into a local
@@ -488,6 +505,8 @@ serves:
 | `POST /v1/rerank` (requires `--reranking`) | `LlamaModel.handleRerank` (reshaped to `results`/`data`) |
 | `POST /infill` | `LlamaModel.handleInfill` (fill-in-the-middle autocomplete) |
 | `GET /v1/models` | the configured model id |
+| `GET /metrics` | native server and per-slot token/cache counters (JSON) |
+| `GET /slots` | native per-slot token/cache counters (JSON array) |
 | `GET /health` | static `{"status":"ok"}` (unauthenticated) |
 
 Chat completions support **streaming via Server-Sent Events** and non-streaming, forwarding

@@ -205,6 +205,7 @@ static void populate_completion_task(server_task &task, jllama_context *jctx, in
         }
     }
     task.params = server_schema::eval_llama_cmpl_schema(jctx->vocab, jctx->params, n_ctx_slot, logit_bias_eog, data);
+    configure_task_slot_impl(task, data);
 }
 
 [[nodiscard]] static jint dispatch_streaming_completion(JNIEnv *env, jllama_context *jctx, const json &data,

@@ -14,7 +14,7 @@
 //     require_json_field_impl, jint_array_to_tokens_impl
 //
 //   Layer B — JNI + server orchestration:
-//     configure_multimodal_task_impl,
+//     configure_multimodal_task_impl, configure_task_slot_impl,
 //     json_to_jstring_impl, results_to_jstring_impl,
 //     embedding_to_jfloat_array_impl, tokens_to_jint_array_impl
 //
@@ -175,6 +175,12 @@ inline void erase_reader(jllama_context *jctx, int id_task) {
     return true;
 }
 
+// Match server_routes::handle_completions_impl(): slot selection is task
+// metadata, not part of task_params, so eval_llama_cmpl_schema() does not set it.
+inline void configure_task_slot_impl(server_task &task, const json &data) {
+    task.id_slot = json_value(data, "id_slot", -1);
+}
+
 // ---------------------------------------------------------------------------
 // json_to_jstring_impl
 //

@@ -185,7 +185,14 @@ public void close() {
      * @return inference parameters carrying the system message + wire messages
      */
     private InferenceParameters buildParams(@Nullable String systemMessage, List<Pair<String, String>> wireMessages) {
-        InferenceParameters params = InferenceParameters.empty().withMessages(systemMessage, wireMessages);
-        return paramsCustomizer == null ? params : paramsCustomizer.apply(params);
+        InferenceParameters params = InferenceParameters.empty()
+                .withMessages(systemMessage, wireMessages)
+                .withCachePrompt(true);
+        if (paramsCustomizer != null) {
+            params = paramsCustomizer.apply(params);
+        }
+        // Apply last: a Session must never drift away from the slot used by
+        // save(), restore(), and close(), even if a customizer supplies another id.
+        return params.withSlotId(slotId);
     }
 }
@@ -150,9 +150,14 @@ public ChatResponse parseResponse(String json) {
             JsonNode node = OBJECT_MAPPER.readTree(json);
             String id = node.path("id").asText("");
             List<ChatChoice> choices = parseChoices(node.path("choices"));
+            JsonNode usageNode = node.path("usage");
             Usage usage = new Usage(
-                    node.path("usage").path("prompt_tokens").asLong(0L),
-                    node.path("usage").path("completion_tokens").asLong(0L));
+                    usageNode.path("prompt_tokens").asLong(0L),
+                    usageNode.path("completion_tokens").asLong(0L),
+                    usageNode
+                            .path("prompt_tokens_details")
+                            .path("cached_tokens")
+                            .asLong(0L));
             Timings timings = Timings.fromJson(node.path("timings"));
             TimingsLogger.log(timings);
             return new ChatResponse(id, choices, usage, timings, json);

@@ -187,10 +187,11 @@ public CompletionResult parseCompletionResult(String json) {
         try {
             JsonNode node = OBJECT_MAPPER.readTree(json);
             String text = extractContent(node);
+            Timings timings = Timings.fromJson(node.path("timings"));
             Usage usage = new Usage(
                     node.path("tokens_evaluated").asLong(0L),
-                    node.path("tokens_predicted").asLong(0L));
-            Timings timings = Timings.fromJson(node.path("timings"));
+                    node.path("tokens_predicted").asLong(0L),
+                    Math.max(0, timings.getCacheN()));
             TimingsLogger.log(timings);
             List<TokenLogprob> logprobs = parseLogprobs(node);
             StopReason stopReason =

@@ -58,6 +58,8 @@ public final class InferenceParameters extends JsonParameters {
     private static final String PARAM_INPUT_PREFIX = "input_prefix";
     private static final String PARAM_INPUT_SUFFIX = "input_suffix";
     private static final String PARAM_CACHE_PROMPT = "cache_prompt";
+    private static final String PARAM_CACHE_REUSE = "n_cache_reuse";
+    private static final String PARAM_SLOT_ID = "id_slot";
     private static final String PARAM_STREAM_OPTIONS = "stream_options";
     private static final String PARAM_RESPONSE_FORMAT = "response_format";
     private static final String PARAM_N_PREDICT = "n_predict";
@@ -204,6 +206,36 @@ public InferenceParameters withCachePrompt(boolean cachePrompt) {
         return withScalar(PARAM_CACHE_PROMPT, cachePrompt);
     }
 
+    /**
+     * Returns a new request with the minimum reusable KV-cache chunk size replaced.
+     * A value of {@code 0} disables non-prefix chunk reuse. Ordinary common-prefix
+     * reuse remains controlled by {@link #withCachePrompt(boolean)}.
+     *
+     * @param cacheReuse minimum reusable chunk size, or {@code 0} to disable
+     * @return a new instance; this instance is unchanged
+     */
+    public InferenceParameters withCacheReuse(int cacheReuse) {
+        if (cacheReuse < 0) {
+            throw new IllegalArgumentException("cacheReuse must be non-negative");
+        }
+        return withScalar(PARAM_CACHE_REUSE, cacheReuse);
+    }
+
+    /**
+     * Returns a new request pinned to a llama.cpp server slot. Pinning is useful
+     * for deterministic multi-turn KV reuse and for matching inference with
+     * {@code saveSlot}/{@code restoreSlot} operations.
+     *
+     * @param slotId non-negative slot identifier
+     * @return a new instance; this instance is unchanged
+     */
+    public InferenceParameters withSlotId(int slotId) {
+        if (slotId < 0) {
+            throw new IllegalArgumentException("slotId must be non-negative");
+        }
+        return withScalar(PARAM_SLOT_ID, slotId);
+    }
+
     /**
      * Returns a new request with the number of tokens to predict replaced
      * (default: -1, -1 = infinity, -2 = until context filled).

@@ -1398,10 +1398,10 @@ public ModelParameters setKvUnified(boolean kvUnified) {
     /**
      * Set the maximum RAM cache size in MiB used to store saved slot KV state.
      * <p>
-     * Requires {@link #setKvUnified(boolean) unified KV} to be enabled.
      * Set to {@code -1} for no limit, {@code 0} to disable (default: 8192 MiB).
-     * Together with {@link #setClearIdle} this allows idle slots to be evicted
-     * from GPU/CPU memory and restored quickly on the next matching request.
+     * Together with {@link #setClearIdle}, idle slot states are copied into this
+     * RAM cache and restored on a matching request. Unified KV is required only
+     * when those idle slots should also be cleared from the active KV buffer.
      *
      * @param cacheRamMib maximum cache size in MiB, or {@code -1} for unlimited
      * @return this builder
@@ -1414,14 +1414,13 @@ public ModelParameters setCacheRamMib(int cacheRamMib) {
      * Enable or disable saving and clearing idle slots when a new task starts.
      * <p>
      * When enabled (the default), idle slots have their KV state saved to the
-     * RAM cache ({@link #setCacheRamMib}) and are then cleared, freeing GPU/CPU
-     * memory for the active request.  The saved state is transparently restored
-     * on the next request that shares the same prompt prefix, so cache-hit
-     * latency is preserved.
+     * RAM cache ({@link #setCacheRamMib}). With unified KV enabled, the active
+     * slot state is also cleared, freeing KV-buffer capacity for other requests.
+     * Without unified KV the RAM-cache copy is still created, but the active
+     * slot remains allocated.
      * <p>
-     * Requires {@link #setKvUnified(boolean) unified KV} and a non-zero
-     * {@link #setCacheRamMib RAM cache}.  If either dependency is absent the
-     * server logs a warning and silently disables the feature.
+     * Requires a non-zero {@link #setCacheRamMib RAM cache}. Unified KV is
+     * required only for active-buffer eviction.
      *
      * @param clearIdle {@code true} to enable idle-slot eviction (default), {@code false} to disable
      * @return this builder

@@ -254,7 +254,13 @@ static String toAnthropicResponse(String openAiCompletionJson, String model) {
             stopReason = anthropicStopReason(choice.path("finish_reason").asText("stop"));
             JsonNode openAiUsage = completion.path("usage");
             if (openAiUsage.isObject()) {
-                usage.put("input_tokens", openAiUsage.path("prompt_tokens").asInt(0));
+                int promptTokens = openAiUsage.path("prompt_tokens").asInt(0);
+                int cachedTokens = openAiUsage
+                        .path("prompt_tokens_details")
+                        .path("cached_tokens")
+                        .asInt(0);
+                usage.put("input_tokens", Math.max(0, promptTokens - cachedTokens));
+                usage.put("cache_read_input_tokens", cachedTokens);
                 usage.put("output_tokens", openAiUsage.path("completion_tokens").asInt(0));
             }
         } catch (IOException e) {
@@ -391,12 +397,20 @@ static String blockStopEvent(int index) {
 
     /** {@code message_delta} event carrying the final stop reason. */
     static String messageDeltaEvent(String stopReason) {
+        return messageDeltaEvent(stopReason, 0, 0, 0);
+    }
+
+    /** Final message delta carrying token usage collected from the trailing OpenAI usage chunk. */
+    static String messageDeltaEvent(String stopReason, int inputTokens, int outputTokens, int cachedTokens) {
         ObjectNode data = OBJECT_MAPPER.createObjectNode();
         data.put("type", "message_delta");
         ObjectNode delta = data.putObject("delta");
         delta.put("stop_reason", stopReason);
         delta.putNull("stop_sequence");
-        data.putObject("usage").put("output_tokens", 0);
+        ObjectNode usage = data.putObject("usage");
+        usage.put("input_tokens", inputTokens);
+        usage.put("output_tokens", outputTokens);
+        usage.put("cache_read_input_tokens", cachedTokens);
         return sseEvent("message_delta", data.toString());
     }
 

@@ -32,6 +32,9 @@ final class AnthropicStreamTranslator {
     private int textBlockIndex = -1;
     private int nextIndex;
     private String finishReason = "stop";
+    private int inputTokens;
+    private int outputTokens;
+    private int cachedTokens;
 
     AnthropicStreamTranslator(String id, String model) {
         this.id = id;
@@ -60,6 +63,15 @@ String onChunk(String openAiChunkJson) {
         try {
             JsonNode chunk = OBJECT_MAPPER.readTree(openAiChunkJson);
             accumulator.accept(chunk);
+            JsonNode usage = chunk.path("usage");
+            if (usage.isObject()) {
+                int promptTokens = usage.path("prompt_tokens").asInt(0);
+                cachedTokens = usage.path("prompt_tokens_details")
+                        .path("cached_tokens")
+                        .asInt(0);
+                inputTokens = Math.max(0, promptTokens - cachedTokens);
+                outputTokens = usage.path("completion_tokens").asInt(0);
+            }
             JsonNode choice = chunk.path("choices").path(0);
             JsonNode content = choice.path("delta").path("content");
             if (content.isTextual() && !content.asText().isEmpty()) {
@@ -102,7 +114,8 @@ String end() {
                 out.append(AnthropicApiSupport.blockStopEvent(index));
             }
         }
-        out.append(AnthropicApiSupport.messageDeltaEvent(AnthropicApiSupport.anthropicStopReason(finishReason)));
+        out.append(AnthropicApiSupport.messageDeltaEvent(
+                AnthropicApiSupport.anthropicStopReason(finishReason), inputTokens, outputTokens, cachedTokens));
         out.append(AnthropicApiSupport.messageStopEvent());
         return out.toString();
     }

@@ -40,6 +40,11 @@ final class LlamaModelBackend implements OpenAiBackend {
         this.mapper = mapper;
     }
 
+    @Override
+    public String metrics() {
+        return model.getMetrics();
+    }
+
     @Override
     public String complete(JsonNode request) {
         return model.chatComplete(mapper.toInferenceParameters(request));

@@ -22,6 +22,17 @@
  */
 interface OpenAiBackend {
 
+    /**
+     * Return llama.cpp server metrics, including per-slot cache counters.
+     * Test backends may rely on the empty default.
+     *
+     * @return metrics JSON
+     * @throws IOException if metrics cannot be read
+     */
+    default String metrics() throws IOException {
+        return "{\"slots\":[]}";
+    }
+
     /**
      * Run a non-streaming chat completion ({@code POST /v1/chat/completions}).
      *
-Original file line number
+Diff line change
@@ Expand Up @@
             }
         }
         task.params = server_schema::eval_llama_cmpl_schema(jctx->vocab, jctx->params, n_ctx_slot, logit_bias_eog, data);
+        configure_task_slot_impl(task, data);
     }
     [[nodiscard]] static jint dispatch_streaming_completion(JNIEnv *env, jllama_context *jctx, const json &data,
@@ Expand Down @@