From 02192f3c333681b9e9a7d17f2172d588770e44e5 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 25 Jun 2026 07:40:28 +0000
Subject: [PATCH] fix(tts): resolve post-merge TTS findings (PR #268)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Re-checked all six findings raised after PR #268 against the pinned upstream
tts.cpp @ b9739. Two were false alarms; four were genuine.

Verified upstream-faithful (NOT divergences) — added provenance comments only:
- #1 llama_model_n_embd_out: upstream tts.cpp:1042 uses the exact same call.
  Comment now notes it reads the vocoder OUTPUT embedding width, matching upstream.
- #2 0.25 s silence lead-in: upstream tts.cpp:1077-1080 zeroes the first 0.25 s
  identically. Comment notes it mirrors upstream and that our `i < audio.size()`
  bound is an added safety guard over upstream's fixed 24000/4.

Genuine findings fixed:
- #3 heavy include in shared header: tts_upstream.h now includes
  <nlohmann/json_fwd.hpp> (forward-declares ordered_json) instead of the full
  <nlohmann/json.hpp>, and drops the json default argument. The single caller,
  tts_engine.cpp, includes the full json and passes an explicit empty object.
  Future includers of the shared interface no longer pull in ~25k lines of json.
- #4 unasserted duplicate enum: generate-tts-upstream.cmake now captures the
  upstream `outetts_version` enum body and pins its enumerators + order against
  the hand-written copy in tts_upstream.h, so a reorder/rename fails the configure
  instead of silently assigning different integer values across the two TUs.
- #5 prompt_add overload coverage: the bare `void prompt_add(` prefix de-statics
  all three upstream overloads but only proved >=1 existed. The generator now
  pins (whitespace-tolerant) both overloads the header declares, turning a future
  cryptic link error into a clear configure-time failure.
- #6 weak WAV assertion: TtsIntegrationTest now parses the RIFF/WAVE header
  (PCM format, mono, 24 kHz, 16-bit), checks chunk-size self-consistency, and
  scans the PCM payload for non-zero samples — so a near-empty or all-silent
  result no longer passes the way `length > 44` did.

Generator regexes validated against the real b9739 source via `cmake -P`
(including a negative drift control); Java test compiles; clang-format 22.1.5
and Spotless both clean. Native build not run here (sandbox proxy blocks the
dependency FetchContent clones) — exercised in CI.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01QTQ8mBM9tyKkHbXVBpGwET
---
 CLAUDE.md                                     | 13 ++--
 cmake/generate-tts-upstream.cmake             | 31 +++++++++
 src/main/cpp/tts_engine.cpp                   | 17 ++++-
 src/main/cpp/tts_upstream.h                   | 19 +++++-
 .../ladenthin/llama/TtsIntegrationTest.java   | 63 ++++++++++++++++---
 5 files changed, 123 insertions(+), 20 deletions(-)
diff --git a/CLAUDE.md b/CLAUDE.md
index 2fa5a455..90074a3f 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -410,11 +410,14 @@ Instead the helpers are **DERIVED mechanically at configure time** from the pinn
   generated definitions. The in-memory WAV writer (`tts_wav.hpp`) is ours, not extracted.
 
 **Fail-loud on drift (same contract as `patches/`):** the generator asserts every anchor — the
-`int main(` split point, each `static <signature>` it de-statics, and both speaker literals. If an
-upgrade renames a helper or moves a literal, the **configure step aborts** with a pointer to the
-generator; if upstream changes a *type*, `tts_upstream.h` stops matching and the **link fails**.
-Either way a silent divergence is impossible. On a llama.cpp bump, re-verify the generator the same
-way you re-verify `patches/`.
+`int main(` split point, each `static <signature>` it de-statics, the `outetts_version` enum
+(enumerators + order, kept ODR-identical to the hand-written copy in `tts_upstream.h`), both
+`prompt_add` overloads the header declares (the bare `void prompt_add(` prefix de-statics all three
+upstream overloads, so the two the header relies on are pinned individually), and both speaker
+literals. If an upgrade renames a helper, reorders the enum, or moves a literal, the **configure step
+aborts** with a pointer to the generator; if upstream changes a *type*, `tts_upstream.h` stops
+matching and the **link fails**. Either way a silent divergence is impossible. On a llama.cpp bump,
+re-verify the generator the same way you re-verify `patches/`.
 
 ## Upgrading/Downgrading llama.cpp Version
 
diff --git a/cmake/generate-tts-upstream.cmake b/cmake/generate-tts-upstream.cmake
index 4fafef8a..4232c462 100644
--- a/cmake/generate-tts-upstream.cmake
+++ b/cmake/generate-tts-upstream.cmake
@@ -56,6 +56,37 @@ foreach(sig IN LISTS JLLAMA_TTS_DESTATIC)
     string(REPLACE "static ${sig}" "${sig}" PREMAIN "${PREMAIN}")
 endforeach()
 
+# --- 2a. pin the outetts_version enum against the hand-written copy in tts_upstream.h ---
+# src/main/cpp/tts_upstream.h re-declares `enum outetts_version { OUTETTS_V0_2, OUTETTS_V0_3 }` because
+# it cannot include the generated TU. The two definitions live in different translation units and must
+# stay token-identical: if upstream reorders/renames/extends the enum, the generated TU and the header
+# would bind the same name to different integer values (a silent miscompile). Capture the upstream enum
+# body and compare its enumerator list so a drift fails the configure with a pointer to update the header.
+string(REGEX MATCH "enum[ \t\r\n]+outetts_version[ \t\r\n]*{([^}]*)}" _enum_match "${PREMAIN}")
+if(_enum_match STREQUAL "")
+    message(FATAL_ERROR "generate-tts-upstream: 'enum outetts_version' not found in tts.cpp — upstream changed; update cmake/generate-tts-upstream.cmake and src/main/cpp/tts_upstream.h")
+endif()
+set(_enum_body "${CMAKE_MATCH_1}")
+string(REGEX REPLACE "//[^\n]*" "" _enum_body "${_enum_body}")  # strip any line comments
+string(REGEX REPLACE "[ \t\r\n]+" "" _enum_body "${_enum_body}") # strip all whitespace
+string(REGEX REPLACE ",+$" "" _enum_body "${_enum_body}")       # strip a trailing comma
+if(NOT _enum_body STREQUAL "OUTETTS_V0_2,OUTETTS_V0_3")
+    message(FATAL_ERROR "generate-tts-upstream: upstream 'enum outetts_version' enumerators are now '${_enum_body}' (expected 'OUTETTS_V0_2,OUTETTS_V0_3'). Update the matching enum in src/main/cpp/tts_upstream.h to keep the two definitions ODR-identical, then update this assertion in cmake/generate-tts-upstream.cmake")
+endif()
+
+# --- 2b. verify BOTH prompt_add overloads that tts_upstream.h declares are present ---
+# `void prompt_add(` is shared by three upstream overloads; the de-static REPLACE above (correctly) gives
+# all of them external linkage, but the single string(FIND) only proves >=1 exists. tts_upstream.h
+# declares exactly two — (llama_tokens&, const llama_tokens&) and the (vocab, txt, add_special,
+# parse_special) builder — and tts_engine.cpp links against them. Pin both here (whitespace-tolerant) so
+# dropping or renaming either fails the configure with a clear pointer instead of a cryptic link error.
+if(NOT PREMAIN MATCHES "void[ \t]+prompt_add[ \t]*\\([^)]*const[ \t]+llama_tokens[ \t]*&[ \t]*tokens[ \t]*\\)")
+    message(FATAL_ERROR "generate-tts-upstream: the prompt_add(llama_tokens&, const llama_tokens&) overload declared in src/main/cpp/tts_upstream.h was not found in tts.cpp — upstream changed; update the de-static list and src/main/cpp/tts_upstream.h")
+endif()
+if(NOT PREMAIN MATCHES "void[ \t]+prompt_add[ \t]*\\([^)]*vocab[^)]*add_special[^)]*parse_special[^)]*\\)")
+    message(FATAL_ERROR "generate-tts-upstream: the prompt_add(llama_tokens&, const llama_vocab*, const std::string&, bool, bool) overload declared in src/main/cpp/tts_upstream.h was not found in tts.cpp — upstream changed; update the de-static list and src/main/cpp/tts_upstream.h")
+endif()
+
 # --- 3. extract the two default-speaker literals from inside main() ---
 # audio_text: a single-line  std::string audio_text = "<|text_start|>the<|text_sep|>...";
 # The leading "<|text_start|>the<|text_sep|>" disambiguates it from the empty-seed literal
diff --git a/src/main/cpp/tts_engine.cpp b/src/main/cpp/tts_engine.cpp
index e310ec4f..586b0571 100644
--- a/src/main/cpp/tts_engine.cpp
+++ b/src/main/cpp/tts_engine.cpp
@@ -21,6 +21,11 @@
 #include "llama.h"
 #include "sampling.h"
 
+// Full json definition: tts_upstream.h only forward-declares nlohmann::ordered_json (keeping the heavy
+// header out of the shared interface), but this TU constructs the empty-object speaker argument for
+// get_tts_version(), which needs the complete type.
+#include <nlohmann/json.hpp>
+
 #include <algorithm>
 #include <cstdint>
 #include <regex>
@@ -67,7 +72,9 @@ tts_engine *engine_init(const std::string &ttc_model_path, const std::string &ct
         return nullptr;
     }
     engine->vocab = llama_model_get_vocab(engine->model_ttc);
-    engine->tts_version = get_tts_version(engine->model_ttc);
+    // Explicit empty-object speaker: tts_upstream.h declares no default (it forward-declares json), so
+    // the default lives only in the generated TU. We always use the built-in default speaker profile.
+    engine->tts_version = get_tts_version(engine->model_ttc, nlohmann::ordered_json::object());
 
     // Codes-to-speech (CTS) vocoder, loaded in embedding mode.
     params.model.path = cts_model_path;
@@ -202,13 +209,19 @@ bool engine_synthesize(tts_engine *engine, const std::string &text, int n_predic
     }
     llama_synchronize(engine->ctx_cts);
 
+    // llama_model_n_embd_out (not llama_model_n_embd): read the vocoder's OUTPUT embedding width, which
+    // is what llama_get_embeddings returns here. This matches upstream tts.cpp, which also queries
+    // llama_model_n_embd_out at this step.
     const int n_embd = llama_model_n_embd_out(engine->model_cts);
     const float *embd = llama_get_embeddings(engine->ctx_cts);
     std::vector<float> audio = embd_to_audio(embd, n_codes, n_embd, engine->n_threads);
     llama_batch_free(cts_batch);
 
-    // Zero the first 0.25 s (suppresses a leading click).
+    // 24 kHz mono — the OuteTTS / WavTokenizer output rate.
     const int n_sr = 24000;
+    // Zero the first 0.25 s, mirroring upstream tts.cpp's post-vocoder cleanup (it suppresses a leading
+    // click). The `&& i < audio.size()` guard is ours: it keeps the loop in-bounds for clips shorter
+    // than 0.25 s, where upstream's fixed 24000/4 bound would read past the buffer.
     for (int i = 0; i < n_sr / 4 && i < (int)audio.size(); ++i) {
         audio[i] = 0.0f;
     }
diff --git a/src/main/cpp/tts_upstream.h b/src/main/cpp/tts_upstream.h
index 394ff3c0..0ef82e79 100644
--- a/src/main/cpp/tts_upstream.h
+++ b/src/main/cpp/tts_upstream.h
@@ -15,12 +15,21 @@
 #include <string>
 #include <vector>
 
-#include <nlohmann/json.hpp>
+// Forward declarations only. This shared interface header names nlohmann::ordered_json once (the
+// get_tts_version() speaker parameter) but never instantiates it, so it must not pull the full
+// ~25k-line <nlohmann/json.hpp> into every translation unit that includes it. The single caller that
+// constructs the empty-object default (tts_engine.cpp) includes the full <nlohmann/json.hpp> itself.
+#include <nlohmann/json_fwd.hpp>
 
 #include "common.h" // llama_tokens
 #include "llama.h"  // llama_model, llama_vocab, llama_token
 
-// Mirrors the upstream enum (identical definition; ODR-compatible across translation units).
+// Mirrors the upstream enum (identical definition; ODR-compatible across translation units). The
+// generated TU carries upstream's own copy, so these enumerators and their order MUST stay
+// token-identical to upstream — otherwise the two definitions assign different integer values to the
+// same name (a silent miscompile). cmake/generate-tts-upstream.cmake asserts the upstream enum still
+// reads `{ OUTETTS_V0_2, OUTETTS_V0_3 }` at configure time and fails loud (pointing here) if a
+// llama.cpp bump changes it.
 enum outetts_version { OUTETTS_V0_2, OUTETTS_V0_3 };
 
 // --- derived from upstream tts.cpp (defined in the generated translation unit) ---
@@ -40,7 +49,11 @@ void prompt_init(llama_tokens &prompt, const llama_vocab *vocab);
 std::vector<llama_token> prepare_guide_tokens(const llama_vocab *vocab, const std::string &str,
                                               outetts_version tts_version);
 
-outetts_version get_tts_version(llama_model *model, nlohmann::ordered_json speaker = nlohmann::ordered_json::object());
+// No default argument here on purpose: constructing nlohmann::ordered_json::object() needs the full
+// json definition, which this header deliberately does not include (see the json_fwd note above). The
+// sole caller (tts_engine.cpp) passes an explicit empty object; the generated TU keeps upstream's own
+// default, so its internal calls are unaffected.
+outetts_version get_tts_version(llama_model *model, nlohmann::ordered_json speaker);
 
 // Default OuteTTS speaker profile, extracted from upstream main() into the generated TU.
 extern const std::string jllama_tts_default_audio_text;
diff --git a/src/test/java/net/ladenthin/llama/TtsIntegrationTest.java b/src/test/java/net/ladenthin/llama/TtsIntegrationTest.java
index 07a7799a..2516bbd8 100644
--- a/src/test/java/net/ladenthin/llama/TtsIntegrationTest.java
+++ b/src/test/java/net/ladenthin/llama/TtsIntegrationTest.java
@@ -9,6 +9,9 @@
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import java.io.File;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.charset.StandardCharsets;
 import java.util.concurrent.TimeUnit;
 import org.junit.jupiter.api.Assumptions;
 import org.junit.jupiter.api.DisplayName;
@@ -26,8 +29,11 @@
  */
 public class TtsIntegrationTest {
 
+    /** Canonical RIFF/WAVE header size in bytes (16-bit PCM, no extra chunks). */
+    private static final int WAV_HEADER_BYTES = 44;
+
     @Test
-    @DisplayName("synthesize() returns a well-formed 16-bit WAV byte stream")
+    @DisplayName("synthesize() returns a well-formed, non-silent 24 kHz mono 16-bit WAV")
     @Timeout(value = 300_000, unit = TimeUnit.MILLISECONDS)
     public void synthesizesWellFormedWav() {
         String ttc = System.getProperty(TestConstants.PROP_TTS_TTC_MODEL);
@@ -45,15 +51,52 @@ public void synthesizesWellFormedWav() {
             byte[] wav = tts.synthesize("hello from llama");
 
             assertNotNull(wav, "WAV bytes must not be null");
-            assertTrue(wav.length > 44, "WAV must carry a header plus samples; got " + wav.length + " bytes");
-            assertEquals('R', (char) wav[0]);
-            assertEquals('I', (char) wav[1]);
-            assertEquals('F', (char) wav[2]);
-            assertEquals('F', (char) wav[3]);
-            assertEquals('W', (char) wav[8]);
-            assertEquals('A', (char) wav[9]);
-            assertEquals('V', (char) wav[10]);
-            assertEquals('E', (char) wav[11]);
+            // A bare 44-byte header with no payload is not a valid clip: require real samples beyond it.
+            assertTrue(
+                    wav.length > WAV_HEADER_BYTES,
+                    "WAV must carry a header plus samples; got " + wav.length + " bytes");
+
+            // RIFF/WAVE container magic.
+            assertEquals("RIFF", tag(wav, 0), "RIFF magic");
+            assertEquals("WAVE", tag(wav, 8), "WAVE magic");
+            assertEquals("fmt ", tag(wav, 12), "fmt subchunk tag");
+            assertEquals("data", tag(wav, 36), "data subchunk tag");
+
+            // fmt fields must match the documented output format: 24 kHz mono 16-bit PCM. A mis-loaded
+            // model that still framed a header would not silently pass with the wrong rate/channels.
+            ByteBuffer header = ByteBuffer.wrap(wav).order(ByteOrder.LITTLE_ENDIAN);
+            assertEquals(1, header.getShort(20) & 0xFFFF, "audio format must be PCM (1)");
+            assertEquals(1, header.getShort(22) & 0xFFFF, "must be mono (1 channel)");
+            assertEquals(24_000, header.getInt(24), "sample rate must be 24 kHz");
+            assertEquals(16, header.getShort(34) & 0xFFFF, "must be 16-bit samples");
+
+            // Declared chunk sizes must be self-consistent with the actual byte-array length.
+            assertEquals(wav.length - 8, header.getInt(4), "RIFF chunk size must equal fileLength - 8");
+            int dataSize = header.getInt(40);
+            assertEquals(wav.length - WAV_HEADER_BYTES, dataSize, "data chunk size must equal fileLength - 44");
+            assertEquals(0, dataSize % 2, "16-bit PCM data size must be even");
+
+            // The clip must contain real audio, not just the zeroed 0.25 s lead-in (or the all-silent
+            // buffer a mis-configured model could still frame inside an otherwise valid header). The
+            // original `length > 44` check passed on a single padding byte; scan the PCM payload instead.
+            assertTrue(
+                    hasNonZeroSample(wav, WAV_HEADER_BYTES),
+                    "synthesized PCM must contain audible (non-zero) samples, not pure silence");
+        }
+    }
+
+    /** Reads the 4-byte ASCII chunk tag at {@code offset}. */
+    private static String tag(byte[] wav, int offset) {
+        return new String(wav, offset, 4, StandardCharsets.US_ASCII);
+    }
+
+    /** Returns {@code true} if any byte of the PCM payload at or after {@code from} is non-zero. */
+    private static boolean hasNonZeroSample(byte[] wav, int from) {
+        for (int i = from; i < wav.length; i++) {
+            if (wav[i] != 0) {
+                return true;
+            }
         }
+        return false;
     }
 }