From 02192f3c333681b9e9a7d17f2172d588770e44e5 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Jun 2026 07:40:28 +0000 Subject: [PATCH] fix(tts): resolve post-merge TTS findings (PR #268) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-checked all six findings raised after PR #268 against the pinned upstream tts.cpp @ b9739. Two were false alarms; four were genuine. Verified upstream-faithful (NOT divergences) — added provenance comments only: - #1 llama_model_n_embd_out: upstream tts.cpp:1042 uses the exact same call. Comment now notes it reads the vocoder OUTPUT embedding width, matching upstream. - #2 0.25 s silence lead-in: upstream tts.cpp:1077-1080 zeroes the first 0.25 s identically. Comment notes it mirrors upstream and that our `i < audio.size()` bound is an added safety guard over upstream's fixed 24000/4. Genuine findings fixed: - #3 heavy include in shared header: tts_upstream.h now includes (forward-declares ordered_json) instead of the full , and drops the json default argument. The single caller, tts_engine.cpp, includes the full json and passes an explicit empty object. Future includers of the shared interface no longer pull in ~25k lines of json. - #4 unasserted duplicate enum: generate-tts-upstream.cmake now captures the upstream `outetts_version` enum body and pins its enumerators + order against the hand-written copy in tts_upstream.h, so a reorder/rename fails the configure instead of silently assigning different integer values across the two TUs. - #5 prompt_add overload coverage: the bare `void prompt_add(` prefix de-statics all three upstream overloads but only proved >=1 existed. The generator now pins (whitespace-tolerant) both overloads the header declares, turning a future cryptic link error into a clear configure-time failure. - #6 weak WAV assertion: TtsIntegrationTest now parses the RIFF/WAVE header (PCM format, mono, 24 kHz, 16-bit), checks chunk-size self-consistency, and scans the PCM payload for non-zero samples — so a near-empty or all-silent result no longer passes the way `length > 44` did. Generator regexes validated against the real b9739 source via `cmake -P` (including a negative drift control); Java test compiles; clang-format 22.1.5 and Spotless both clean. Native build not run here (sandbox proxy blocks the dependency FetchContent clones) — exercised in CI. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01QTQ8mBM9tyKkHbXVBpGwET --- CLAUDE.md | 13 ++-- cmake/generate-tts-upstream.cmake | 31 +++++++++ src/main/cpp/tts_engine.cpp | 17 ++++- src/main/cpp/tts_upstream.h | 19 +++++- .../ladenthin/llama/TtsIntegrationTest.java | 63 ++++++++++++++++--- 5 files changed, 123 insertions(+), 20 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 2fa5a455..90074a3f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -410,11 +410,14 @@ Instead the helpers are **DERIVED mechanically at configure time** from the pinn generated definitions. The in-memory WAV writer (`tts_wav.hpp`) is ours, not extracted. **Fail-loud on drift (same contract as `patches/`):** the generator asserts every anchor — the -`int main(` split point, each `static ` it de-statics, and both speaker literals. If an -upgrade renames a helper or moves a literal, the **configure step aborts** with a pointer to the -generator; if upstream changes a *type*, `tts_upstream.h` stops matching and the **link fails**. -Either way a silent divergence is impossible. On a llama.cpp bump, re-verify the generator the same -way you re-verify `patches/`. +`int main(` split point, each `static ` it de-statics, the `outetts_version` enum +(enumerators + order, kept ODR-identical to the hand-written copy in `tts_upstream.h`), both +`prompt_add` overloads the header declares (the bare `void prompt_add(` prefix de-statics all three +upstream overloads, so the two the header relies on are pinned individually), and both speaker +literals. If an upgrade renames a helper, reorders the enum, or moves a literal, the **configure step +aborts** with a pointer to the generator; if upstream changes a *type*, `tts_upstream.h` stops +matching and the **link fails**. Either way a silent divergence is impossible. On a llama.cpp bump, +re-verify the generator the same way you re-verify `patches/`. ## Upgrading/Downgrading llama.cpp Version diff --git a/cmake/generate-tts-upstream.cmake b/cmake/generate-tts-upstream.cmake index 4fafef8a..4232c462 100644 --- a/cmake/generate-tts-upstream.cmake +++ b/cmake/generate-tts-upstream.cmake @@ -56,6 +56,37 @@ foreach(sig IN LISTS JLLAMA_TTS_DESTATIC) string(REPLACE "static ${sig}" "${sig}" PREMAIN "${PREMAIN}") endforeach() +# --- 2a. pin the outetts_version enum against the hand-written copy in tts_upstream.h --- +# src/main/cpp/tts_upstream.h re-declares `enum outetts_version { OUTETTS_V0_2, OUTETTS_V0_3 }` because +# it cannot include the generated TU. The two definitions live in different translation units and must +# stay token-identical: if upstream reorders/renames/extends the enum, the generated TU and the header +# would bind the same name to different integer values (a silent miscompile). Capture the upstream enum +# body and compare its enumerator list so a drift fails the configure with a pointer to update the header. +string(REGEX MATCH "enum[ \t\r\n]+outetts_version[ \t\r\n]*{([^}]*)}" _enum_match "${PREMAIN}") +if(_enum_match STREQUAL "") + message(FATAL_ERROR "generate-tts-upstream: 'enum outetts_version' not found in tts.cpp — upstream changed; update cmake/generate-tts-upstream.cmake and src/main/cpp/tts_upstream.h") +endif() +set(_enum_body "${CMAKE_MATCH_1}") +string(REGEX REPLACE "//[^\n]*" "" _enum_body "${_enum_body}") # strip any line comments +string(REGEX REPLACE "[ \t\r\n]+" "" _enum_body "${_enum_body}") # strip all whitespace +string(REGEX REPLACE ",+$" "" _enum_body "${_enum_body}") # strip a trailing comma +if(NOT _enum_body STREQUAL "OUTETTS_V0_2,OUTETTS_V0_3") + message(FATAL_ERROR "generate-tts-upstream: upstream 'enum outetts_version' enumerators are now '${_enum_body}' (expected 'OUTETTS_V0_2,OUTETTS_V0_3'). Update the matching enum in src/main/cpp/tts_upstream.h to keep the two definitions ODR-identical, then update this assertion in cmake/generate-tts-upstream.cmake") +endif() + +# --- 2b. verify BOTH prompt_add overloads that tts_upstream.h declares are present --- +# `void prompt_add(` is shared by three upstream overloads; the de-static REPLACE above (correctly) gives +# all of them external linkage, but the single string(FIND) only proves >=1 exists. tts_upstream.h +# declares exactly two — (llama_tokens&, const llama_tokens&) and the (vocab, txt, add_special, +# parse_special) builder — and tts_engine.cpp links against them. Pin both here (whitespace-tolerant) so +# dropping or renaming either fails the configure with a clear pointer instead of a cryptic link error. +if(NOT PREMAIN MATCHES "void[ \t]+prompt_add[ \t]*\\([^)]*const[ \t]+llama_tokens[ \t]*&[ \t]*tokens[ \t]*\\)") + message(FATAL_ERROR "generate-tts-upstream: the prompt_add(llama_tokens&, const llama_tokens&) overload declared in src/main/cpp/tts_upstream.h was not found in tts.cpp — upstream changed; update the de-static list and src/main/cpp/tts_upstream.h") +endif() +if(NOT PREMAIN MATCHES "void[ \t]+prompt_add[ \t]*\\([^)]*vocab[^)]*add_special[^)]*parse_special[^)]*\\)") + message(FATAL_ERROR "generate-tts-upstream: the prompt_add(llama_tokens&, const llama_vocab*, const std::string&, bool, bool) overload declared in src/main/cpp/tts_upstream.h was not found in tts.cpp — upstream changed; update the de-static list and src/main/cpp/tts_upstream.h") +endif() + # --- 3. extract the two default-speaker literals from inside main() --- # audio_text: a single-line std::string audio_text = "<|text_start|>the<|text_sep|>..."; # The leading "<|text_start|>the<|text_sep|>" disambiguates it from the empty-seed literal diff --git a/src/main/cpp/tts_engine.cpp b/src/main/cpp/tts_engine.cpp index e310ec4f..586b0571 100644 --- a/src/main/cpp/tts_engine.cpp +++ b/src/main/cpp/tts_engine.cpp @@ -21,6 +21,11 @@ #include "llama.h" #include "sampling.h" +// Full json definition: tts_upstream.h only forward-declares nlohmann::ordered_json (keeping the heavy +// header out of the shared interface), but this TU constructs the empty-object speaker argument for +// get_tts_version(), which needs the complete type. +#include + #include #include #include @@ -67,7 +72,9 @@ tts_engine *engine_init(const std::string &ttc_model_path, const std::string &ct return nullptr; } engine->vocab = llama_model_get_vocab(engine->model_ttc); - engine->tts_version = get_tts_version(engine->model_ttc); + // Explicit empty-object speaker: tts_upstream.h declares no default (it forward-declares json), so + // the default lives only in the generated TU. We always use the built-in default speaker profile. + engine->tts_version = get_tts_version(engine->model_ttc, nlohmann::ordered_json::object()); // Codes-to-speech (CTS) vocoder, loaded in embedding mode. params.model.path = cts_model_path; @@ -202,13 +209,19 @@ bool engine_synthesize(tts_engine *engine, const std::string &text, int n_predic } llama_synchronize(engine->ctx_cts); + // llama_model_n_embd_out (not llama_model_n_embd): read the vocoder's OUTPUT embedding width, which + // is what llama_get_embeddings returns here. This matches upstream tts.cpp, which also queries + // llama_model_n_embd_out at this step. const int n_embd = llama_model_n_embd_out(engine->model_cts); const float *embd = llama_get_embeddings(engine->ctx_cts); std::vector audio = embd_to_audio(embd, n_codes, n_embd, engine->n_threads); llama_batch_free(cts_batch); - // Zero the first 0.25 s (suppresses a leading click). + // 24 kHz mono — the OuteTTS / WavTokenizer output rate. const int n_sr = 24000; + // Zero the first 0.25 s, mirroring upstream tts.cpp's post-vocoder cleanup (it suppresses a leading + // click). The `&& i < audio.size()` guard is ours: it keeps the loop in-bounds for clips shorter + // than 0.25 s, where upstream's fixed 24000/4 bound would read past the buffer. for (int i = 0; i < n_sr / 4 && i < (int)audio.size(); ++i) { audio[i] = 0.0f; } diff --git a/src/main/cpp/tts_upstream.h b/src/main/cpp/tts_upstream.h index 394ff3c0..0ef82e79 100644 --- a/src/main/cpp/tts_upstream.h +++ b/src/main/cpp/tts_upstream.h @@ -15,12 +15,21 @@ #include #include -#include +// Forward declarations only. This shared interface header names nlohmann::ordered_json once (the +// get_tts_version() speaker parameter) but never instantiates it, so it must not pull the full +// ~25k-line into every translation unit that includes it. The single caller that +// constructs the empty-object default (tts_engine.cpp) includes the full itself. +#include #include "common.h" // llama_tokens #include "llama.h" // llama_model, llama_vocab, llama_token -// Mirrors the upstream enum (identical definition; ODR-compatible across translation units). +// Mirrors the upstream enum (identical definition; ODR-compatible across translation units). The +// generated TU carries upstream's own copy, so these enumerators and their order MUST stay +// token-identical to upstream — otherwise the two definitions assign different integer values to the +// same name (a silent miscompile). cmake/generate-tts-upstream.cmake asserts the upstream enum still +// reads `{ OUTETTS_V0_2, OUTETTS_V0_3 }` at configure time and fails loud (pointing here) if a +// llama.cpp bump changes it. enum outetts_version { OUTETTS_V0_2, OUTETTS_V0_3 }; // --- derived from upstream tts.cpp (defined in the generated translation unit) --- @@ -40,7 +49,11 @@ void prompt_init(llama_tokens &prompt, const llama_vocab *vocab); std::vector prepare_guide_tokens(const llama_vocab *vocab, const std::string &str, outetts_version tts_version); -outetts_version get_tts_version(llama_model *model, nlohmann::ordered_json speaker = nlohmann::ordered_json::object()); +// No default argument here on purpose: constructing nlohmann::ordered_json::object() needs the full +// json definition, which this header deliberately does not include (see the json_fwd note above). The +// sole caller (tts_engine.cpp) passes an explicit empty object; the generated TU keeps upstream's own +// default, so its internal calls are unaffected. +outetts_version get_tts_version(llama_model *model, nlohmann::ordered_json speaker); // Default OuteTTS speaker profile, extracted from upstream main() into the generated TU. extern const std::string jllama_tts_default_audio_text; diff --git a/src/test/java/net/ladenthin/llama/TtsIntegrationTest.java b/src/test/java/net/ladenthin/llama/TtsIntegrationTest.java index 07a7799a..2516bbd8 100644 --- a/src/test/java/net/ladenthin/llama/TtsIntegrationTest.java +++ b/src/test/java/net/ladenthin/llama/TtsIntegrationTest.java @@ -9,6 +9,9 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.File; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.charset.StandardCharsets; import java.util.concurrent.TimeUnit; import org.junit.jupiter.api.Assumptions; import org.junit.jupiter.api.DisplayName; @@ -26,8 +29,11 @@ */ public class TtsIntegrationTest { + /** Canonical RIFF/WAVE header size in bytes (16-bit PCM, no extra chunks). */ + private static final int WAV_HEADER_BYTES = 44; + @Test - @DisplayName("synthesize() returns a well-formed 16-bit WAV byte stream") + @DisplayName("synthesize() returns a well-formed, non-silent 24 kHz mono 16-bit WAV") @Timeout(value = 300_000, unit = TimeUnit.MILLISECONDS) public void synthesizesWellFormedWav() { String ttc = System.getProperty(TestConstants.PROP_TTS_TTC_MODEL); @@ -45,15 +51,52 @@ public void synthesizesWellFormedWav() { byte[] wav = tts.synthesize("hello from llama"); assertNotNull(wav, "WAV bytes must not be null"); - assertTrue(wav.length > 44, "WAV must carry a header plus samples; got " + wav.length + " bytes"); - assertEquals('R', (char) wav[0]); - assertEquals('I', (char) wav[1]); - assertEquals('F', (char) wav[2]); - assertEquals('F', (char) wav[3]); - assertEquals('W', (char) wav[8]); - assertEquals('A', (char) wav[9]); - assertEquals('V', (char) wav[10]); - assertEquals('E', (char) wav[11]); + // A bare 44-byte header with no payload is not a valid clip: require real samples beyond it. + assertTrue( + wav.length > WAV_HEADER_BYTES, + "WAV must carry a header plus samples; got " + wav.length + " bytes"); + + // RIFF/WAVE container magic. + assertEquals("RIFF", tag(wav, 0), "RIFF magic"); + assertEquals("WAVE", tag(wav, 8), "WAVE magic"); + assertEquals("fmt ", tag(wav, 12), "fmt subchunk tag"); + assertEquals("data", tag(wav, 36), "data subchunk tag"); + + // fmt fields must match the documented output format: 24 kHz mono 16-bit PCM. A mis-loaded + // model that still framed a header would not silently pass with the wrong rate/channels. + ByteBuffer header = ByteBuffer.wrap(wav).order(ByteOrder.LITTLE_ENDIAN); + assertEquals(1, header.getShort(20) & 0xFFFF, "audio format must be PCM (1)"); + assertEquals(1, header.getShort(22) & 0xFFFF, "must be mono (1 channel)"); + assertEquals(24_000, header.getInt(24), "sample rate must be 24 kHz"); + assertEquals(16, header.getShort(34) & 0xFFFF, "must be 16-bit samples"); + + // Declared chunk sizes must be self-consistent with the actual byte-array length. + assertEquals(wav.length - 8, header.getInt(4), "RIFF chunk size must equal fileLength - 8"); + int dataSize = header.getInt(40); + assertEquals(wav.length - WAV_HEADER_BYTES, dataSize, "data chunk size must equal fileLength - 44"); + assertEquals(0, dataSize % 2, "16-bit PCM data size must be even"); + + // The clip must contain real audio, not just the zeroed 0.25 s lead-in (or the all-silent + // buffer a mis-configured model could still frame inside an otherwise valid header). The + // original `length > 44` check passed on a single padding byte; scan the PCM payload instead. + assertTrue( + hasNonZeroSample(wav, WAV_HEADER_BYTES), + "synthesized PCM must contain audible (non-zero) samples, not pure silence"); + } + } + + /** Reads the 4-byte ASCII chunk tag at {@code offset}. */ + private static String tag(byte[] wav, int offset) { + return new String(wav, offset, 4, StandardCharsets.US_ASCII); + } + + /** Returns {@code true} if any byte of the PCM payload at or after {@code from} is non-zero. */ + private static boolean hasNonZeroSample(byte[] wav, int from) { + for (int i = from; i < wav.length; i++) { + if (wav[i] != 0) { + return true; + } } + return false; } }