Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -410,11 +410,14 @@ Instead the helpers are **DERIVED mechanically at configure time** from the pinn
generated definitions. The in-memory WAV writer (`tts_wav.hpp`) is ours, not extracted.

**Fail-loud on drift (same contract as `patches/`):** the generator asserts every anchor — the
`int main(` split point, each `static <signature>` it de-statics, and both speaker literals. If an
upgrade renames a helper or moves a literal, the **configure step aborts** with a pointer to the
generator; if upstream changes a *type*, `tts_upstream.h` stops matching and the **link fails**.
Either way a silent divergence is impossible. On a llama.cpp bump, re-verify the generator the same
way you re-verify `patches/`.
`int main(` split point, each `static <signature>` it de-statics, the `outetts_version` enum
(enumerators + order, kept ODR-identical to the hand-written copy in `tts_upstream.h`), both
`prompt_add` overloads the header declares (the bare `void prompt_add(` prefix de-statics all three
upstream overloads, so the two the header relies on are pinned individually), and both speaker
literals. If an upgrade renames a helper, reorders the enum, or moves a literal, the **configure step
aborts** with a pointer to the generator; if upstream changes a *type*, `tts_upstream.h` stops
matching and the **link fails**. Either way a silent divergence is impossible. On a llama.cpp bump,
re-verify the generator the same way you re-verify `patches/`.

## Upgrading/Downgrading llama.cpp Version

Expand Down
31 changes: 31 additions & 0 deletions cmake/generate-tts-upstream.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,37 @@ foreach(sig IN LISTS JLLAMA_TTS_DESTATIC)
string(REPLACE "static ${sig}" "${sig}" PREMAIN "${PREMAIN}")
endforeach()

# --- 2a. guard the outetts_version enum shared with tts_upstream.h ---
# src/main/cpp/tts_upstream.h cannot include the generated TU, so it carries its own hand-written
# `enum outetts_version { OUTETTS_V0_2, OUTETTS_V0_3 }`. That copy and upstream's definition end up in
# different translation units and have to remain token-identical: a reordered, renamed or extended
# upstream enum would make the header and the generated TU map the same enumerator name to different
# integer values, which compiles cleanly and misbehaves silently. To prevent that, pull the enum body
# out of the pre-main() text and compare its enumerator list against the expected one, aborting the
# configure with a pointer to the header on any difference.
set(_enum_pattern "enum[ \t\r\n]+outetts_version[ \t\r\n]*{([^}]*)}")
set(_enum_expected "OUTETTS_V0_2,OUTETTS_V0_3")

string(REGEX MATCH "${_enum_pattern}" _enum_match "${PREMAIN}")
string(LENGTH "${_enum_match}" _enum_match_len)
if(_enum_match_len EQUAL 0)
  message(FATAL_ERROR "generate-tts-upstream: 'enum outetts_version' not found in tts.cpp — upstream changed; update cmake/generate-tts-upstream.cmake and src/main/cpp/tts_upstream.h")
endif()

# Reduce the captured body (regex group 1) to a bare comma-separated enumerator list so the
# comparison does not depend on upstream's formatting.
set(_enum_body "${CMAKE_MATCH_1}")
# 1) drop `//` line comments
string(REGEX REPLACE "//[^\n]*" "" _enum_body "${_enum_body}")
# 2) drop every space, tab and line break
string(REGEX REPLACE "[ \t\r\n]+" "" _enum_body "${_enum_body}")
# 3) drop a trailing comma after the last enumerator
string(REGEX REPLACE ",+$" "" _enum_body "${_enum_body}")

if(NOT _enum_body STREQUAL _enum_expected)
  message(FATAL_ERROR "generate-tts-upstream: upstream 'enum outetts_version' enumerators are now '${_enum_body}' (expected '${_enum_expected}'). Update the matching enum in src/main/cpp/tts_upstream.h to keep the two definitions ODR-identical, then update this assertion in cmake/generate-tts-upstream.cmake")
endif()

# --- 2b. require the two prompt_add overloads that tts_upstream.h declares ---
# Upstream has three overloads sharing the `void prompt_add(` prefix. The de-static REPLACE above
# (correctly) gives all of them external linkage, but its string(FIND) check only proves that at least
# one exists. tts_upstream.h declares exactly two of them, and tts_engine.cpp links against both:
#   * (llama_tokens&, const llama_tokens&)
#   * the (vocab, txt, add_special, parse_special) builder
# Each is matched separately below with a whitespace-tolerant pattern, so that dropping or renaming
# either one stops the configure with a clear pointer rather than surfacing as a cryptic link error.
set(_prompt_add_tokens_re "void[ \t]+prompt_add[ \t]*\\([^)]*const[ \t]+llama_tokens[ \t]*&[ \t]*tokens[ \t]*\\)")
set(_prompt_add_builder_re "void[ \t]+prompt_add[ \t]*\\([^)]*vocab[^)]*add_special[^)]*parse_special[^)]*\\)")
# Shared ending of both diagnostics; only the overload signature differs.
set(_prompt_add_missing_tail " overload declared in src/main/cpp/tts_upstream.h was not found in tts.cpp — upstream changed; update the de-static list and src/main/cpp/tts_upstream.h")

# Overload 1: append one token vector to another.
if(NOT PREMAIN MATCHES "${_prompt_add_tokens_re}")
  message(FATAL_ERROR "generate-tts-upstream: the prompt_add(llama_tokens&, const llama_tokens&)${_prompt_add_missing_tail}")
endif()

# Overload 2: the builder taking vocab, text and the add_special / parse_special flags.
if(NOT PREMAIN MATCHES "${_prompt_add_builder_re}")
  message(FATAL_ERROR "generate-tts-upstream: the prompt_add(llama_tokens&, const llama_vocab*, const std::string&, bool, bool)${_prompt_add_missing_tail}")
endif()

# --- 3. extract the two default-speaker literals from inside main() ---
# audio_text: a single-line std::string audio_text = "<|text_start|>the<|text_sep|>...";
# The leading "<|text_start|>the<|text_sep|>" disambiguates it from the empty-seed literal
Expand Down
17 changes: 15 additions & 2 deletions src/main/cpp/tts_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@
#include "llama.h"
#include "sampling.h"

// Full json definition: tts_upstream.h only forward-declares nlohmann::ordered_json (keeping the heavy
// header out of the shared interface), but this TU constructs the empty-object speaker argument for
// get_tts_version(), which needs the complete type.
#include <nlohmann/json.hpp>

#include <algorithm>
#include <cstdint>
#include <regex>
Expand Down Expand Up @@ -67,7 +72,9 @@ tts_engine *engine_init(const std::string &ttc_model_path, const std::string &ct
return nullptr;
}
engine->vocab = llama_model_get_vocab(engine->model_ttc);
engine->tts_version = get_tts_version(engine->model_ttc);
// Explicit empty-object speaker: tts_upstream.h declares no default (it forward-declares json), so
// the default lives only in the generated TU. We always use the built-in default speaker profile.
engine->tts_version = get_tts_version(engine->model_ttc, nlohmann::ordered_json::object());

// Codes-to-speech (CTS) vocoder, loaded in embedding mode.
params.model.path = cts_model_path;
Expand Down Expand Up @@ -202,13 +209,19 @@ bool engine_synthesize(tts_engine *engine, const std::string &text, int n_predic
}
llama_synchronize(engine->ctx_cts);

// llama_model_n_embd_out (not llama_model_n_embd): read the vocoder's OUTPUT embedding width, which
// is what llama_get_embeddings returns here. This matches upstream tts.cpp, which also queries
// llama_model_n_embd_out at this step.
const int n_embd = llama_model_n_embd_out(engine->model_cts);
const float *embd = llama_get_embeddings(engine->ctx_cts);
std::vector<float> audio = embd_to_audio(embd, n_codes, n_embd, engine->n_threads);
llama_batch_free(cts_batch);

// Zero the first 0.25 s (suppresses a leading click).
// 24 kHz mono — the OuteTTS / WavTokenizer output rate.
const int n_sr = 24000;
// Zero the first 0.25 s, mirroring upstream tts.cpp's post-vocoder cleanup (it suppresses a leading
// click). The `&& i < (int)audio.size()` guard is ours: it keeps the loop in-bounds for clips shorter
// than 0.25 s, where upstream's fixed 24000/4 bound would write past the end of the buffer.
for (int i = 0; i < n_sr / 4 && i < (int)audio.size(); ++i) {
audio[i] = 0.0f;
}
Expand Down
19 changes: 16 additions & 3 deletions src/main/cpp/tts_upstream.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,21 @@
#include <string>
#include <vector>

#include <nlohmann/json.hpp>
// Forward declarations only. This shared interface header names nlohmann::ordered_json once (the
// get_tts_version() speaker parameter) but never instantiates it, so it must not pull the full
// ~25k-line <nlohmann/json.hpp> into every translation unit that includes it. The single caller that
// constructs the empty-object default (tts_engine.cpp) includes the full <nlohmann/json.hpp> itself.
#include <nlohmann/json_fwd.hpp>

#include "common.h" // llama_tokens
#include "llama.h" // llama_model, llama_vocab, llama_token

// Mirrors the upstream enum (identical definition; ODR-compatible across translation units).
// Mirrors the upstream enum (identical definition; ODR-compatible across translation units). The
// generated TU carries upstream's own copy, so these enumerators and their order MUST stay
// token-identical to upstream — otherwise the two definitions assign different integer values to the
// same name (a silent miscompile). cmake/generate-tts-upstream.cmake asserts the upstream enum still
// reads `{ OUTETTS_V0_2, OUTETTS_V0_3 }` at configure time and fails loud (pointing here) if a
// llama.cpp bump changes it.
enum outetts_version { OUTETTS_V0_2, OUTETTS_V0_3 };

// --- derived from upstream tts.cpp (defined in the generated translation unit) ---
Expand All @@ -40,7 +49,11 @@ void prompt_init(llama_tokens &prompt, const llama_vocab *vocab);
std::vector<llama_token> prepare_guide_tokens(const llama_vocab *vocab, const std::string &str,
outetts_version tts_version);

outetts_version get_tts_version(llama_model *model, nlohmann::ordered_json speaker = nlohmann::ordered_json::object());
// No default argument here on purpose: constructing nlohmann::ordered_json::object() needs the full
// json definition, which this header deliberately does not include (see the json_fwd note above). The
// sole caller (tts_engine.cpp) passes an explicit empty object; the generated TU keeps upstream's own
// default, so its internal calls are unaffected.
outetts_version get_tts_version(llama_model *model, nlohmann::ordered_json speaker);

// Default OuteTTS speaker profile, extracted from upstream main() into the generated TU.
extern const std::string jllama_tts_default_audio_text;
Expand Down
63 changes: 53 additions & 10 deletions src/test/java/net/ladenthin/llama/TtsIntegrationTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.File;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.TimeUnit;
import org.junit.jupiter.api.Assumptions;
import org.junit.jupiter.api.DisplayName;
Expand All @@ -26,8 +29,11 @@
*/
public class TtsIntegrationTest {

/** Canonical RIFF/WAVE header size in bytes (16-bit PCM, no extra chunks). */
private static final int WAV_HEADER_BYTES = 44;

@Test
@DisplayName("synthesize() returns a well-formed 16-bit WAV byte stream")
@DisplayName("synthesize() returns a well-formed, non-silent 24 kHz mono 16-bit WAV")
@Timeout(value = 300_000, unit = TimeUnit.MILLISECONDS)
public void synthesizesWellFormedWav() {
String ttc = System.getProperty(TestConstants.PROP_TTS_TTC_MODEL);
Expand All @@ -45,15 +51,52 @@ public void synthesizesWellFormedWav() {
byte[] wav = tts.synthesize("hello from llama");

assertNotNull(wav, "WAV bytes must not be null");
assertTrue(wav.length > 44, "WAV must carry a header plus samples; got " + wav.length + " bytes");
assertEquals('R', (char) wav[0]);
assertEquals('I', (char) wav[1]);
assertEquals('F', (char) wav[2]);
assertEquals('F', (char) wav[3]);
assertEquals('W', (char) wav[8]);
assertEquals('A', (char) wav[9]);
assertEquals('V', (char) wav[10]);
assertEquals('E', (char) wav[11]);
// A bare 44-byte header with no payload is not a valid clip: require real samples beyond it.
assertTrue(
wav.length > WAV_HEADER_BYTES,
"WAV must carry a header plus samples; got " + wav.length + " bytes");

// RIFF/WAVE container magic.
assertEquals("RIFF", tag(wav, 0), "RIFF magic");
assertEquals("WAVE", tag(wav, 8), "WAVE magic");
assertEquals("fmt ", tag(wav, 12), "fmt subchunk tag");
assertEquals("data", tag(wav, 36), "data subchunk tag");

// fmt fields must match the documented output format: 24 kHz mono 16-bit PCM. A mis-loaded
// model that still framed a header would not silently pass with the wrong rate/channels.
ByteBuffer header = ByteBuffer.wrap(wav).order(ByteOrder.LITTLE_ENDIAN);
assertEquals(1, header.getShort(20) & 0xFFFF, "audio format must be PCM (1)");
assertEquals(1, header.getShort(22) & 0xFFFF, "must be mono (1 channel)");
assertEquals(24_000, header.getInt(24), "sample rate must be 24 kHz");
assertEquals(16, header.getShort(34) & 0xFFFF, "must be 16-bit samples");

// Declared chunk sizes must be self-consistent with the actual byte-array length.
assertEquals(wav.length - 8, header.getInt(4), "RIFF chunk size must equal fileLength - 8");
int dataSize = header.getInt(40);
assertEquals(wav.length - WAV_HEADER_BYTES, dataSize, "data chunk size must equal fileLength - 44");
assertEquals(0, dataSize % 2, "16-bit PCM data size must be even");

// The clip must contain real audio, not just the zeroed 0.25 s lead-in (or the all-silent
// buffer a mis-configured model could still frame inside an otherwise valid header). The
// original `length > 44` check passed on a single padding byte; scan the PCM payload instead.
assertTrue(
hasNonZeroSample(wav, WAV_HEADER_BYTES),
"synthesized PCM must contain audible (non-zero) samples, not pure silence");
}
}

/**
 * Decodes a RIFF chunk identifier from the WAV byte stream.
 *
 * @param wav    the complete WAV byte array
 * @param offset index of the first byte of the identifier
 * @return the four bytes starting at {@code offset}, decoded as US-ASCII
 */
private static String tag(byte[] wav, int offset) {
    // Every chunk identifier checked by this test ("RIFF", "WAVE", "fmt ", "data") is 4 bytes long.
    final int tagLength = 4;
    String chunkId = new String(wav, offset, tagLength, StandardCharsets.US_ASCII);
    return chunkId;
}

/**
 * Checks whether the PCM payload holds anything other than silence.
 *
 * @param wav  the complete WAV byte array
 * @param from index of the first payload byte to inspect
 * @return {@code true} if at least one byte at or after {@code from} is non-zero; {@code false} if
 *     every such byte is zero or there are no bytes at or after {@code from}
 */
private static boolean hasNonZeroSample(byte[] wav, int from) {
    // Skip forward over zero bytes; stopping before the end means a non-zero byte was found.
    int cursor = from;
    while (cursor < wav.length && wav[cursor] == 0) {
        cursor++;
    }
    return cursor < wav.length;
}
}
Loading