Skip to content

Commit eda30d2

Browse files
feat(gladia): add full support for Gladia live and batch STT features (#2201)
* feat(gladia): add full support for Gladia live STT features

  - Add code switching support: enabled when multiple languages are specified
  - Add partial transcripts: receive_partial_transcripts and receive_final_transcripts
  - Native multi-channel support already present via channels field
  - Custom vocabulary support already present via custom_vocabulary field
  - Add custom_metadata field support for metadata passthrough
  - Add recommended config for meeting recorders:
    - pre_processing.audio_enhancer: true
    - realtime_processing.words_accurate_timestamps: true

  Co-Authored-By: yujonglee <[email protected]>

* fix(gladia): only enable meeting recorder settings for multi-channel mode

  The pre_processing.audio_enhancer and realtime_processing.words_accurate_timestamps settings are designed for multi-channel meeting recorder scenarios. Enabling them unconditionally for single-channel mode was causing Gladia to not return transcripts. Now these settings are only enabled when channels > 1.

  Co-Authored-By: yujonglee <[email protected]>

* feat(gladia): add diarization_config with name_consistency and custom_vocabulary to batch

  - Add diarization_config.name_consistency: true for speaker diarization
  - Add custom_vocabulary support from ListenParams.keywords

  Co-Authored-By: yujonglee <[email protected]>

* fix

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: yujonglee <[email protected]>
1 parent 1a618f6 commit eda30d2

File tree

3 files changed

+48
-17
lines changed

3 files changed

+48
-17
lines changed

owhisper/owhisper-client/src/adapter/argmax/mod.rs

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,8 +3,6 @@ mod keywords;
33
mod language;
44
mod live;
55

6-
pub use batch::{StreamingBatchConfig, StreamingBatchEvent, StreamingBatchStream};
7-
86
#[derive(Clone, Default)]
97
pub struct ArgmaxAdapter;
108

owhisper/owhisper-client/src/adapter/gladia/batch.rs

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,10 @@ struct TranscriptRequest {
3636
language_config: Option<LanguageConfig>,
3737
#[serde(skip_serializing_if = "Option::is_none")]
3838
diarization: Option<bool>,
39+
#[serde(skip_serializing_if = "Option::is_none")]
40+
custom_vocabulary: Option<Vec<String>>,
41+
#[serde(skip_serializing_if = "Option::is_none")]
42+
name_consistency: Option<bool>,
3943
}
4044

4145
#[derive(Debug, Serialize)]
@@ -191,10 +195,14 @@ impl GladiaAdapter {
191195
code_switching: (params.languages.len() > 1).then_some(true),
192196
});
193197

198+
let custom_vocabulary = (!params.keywords.is_empty()).then(|| params.keywords.clone());
199+
194200
let transcript_request = TranscriptRequest {
195201
audio_url: upload_result.audio_url,
196202
language_config,
197203
diarization: Some(true),
204+
custom_vocabulary,
205+
name_consistency: Some(true),
198206
};
199207

200208
let transcript_url = format!("{}/pre-recorded", base_url);

owhisper/owhisper-client/src/adapter/gladia/live.rs

Lines changed: 40 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -91,13 +91,20 @@ impl RealtimeSttAdapter for GladiaAdapter {
9191
let key = api_key.as_deref()?;
9292
let post_url = Self::build_http_url(&api_base);
9393

94-
let language_config = (!params.languages.is_empty()).then(|| LanguageConfig {
95-
languages: params
96-
.languages
97-
.iter()
98-
.map(|l| l.iso639().code().to_string())
99-
.collect(),
100-
});
94+
let languages: Vec<String> = params
95+
.languages
96+
.iter()
97+
.map(|l| l.iso639().code().to_string())
98+
.collect();
99+
100+
let language_config = if languages.is_empty() {
101+
None
102+
} else {
103+
Some(LanguageConfig {
104+
code_switching: languages.len() > 1,
105+
languages,
106+
})
107+
};
101108

102109
let custom_vocabulary = (!params.keywords.is_empty()).then(|| params.keywords.clone());
103110

@@ -108,8 +115,16 @@ impl RealtimeSttAdapter for GladiaAdapter {
108115
channels,
109116
language_config,
110117
custom_vocabulary,
118+
custom_metadata: None,
111119
messages_config: Some(MessagesConfig {
112120
receive_partial_transcripts: true,
121+
receive_final_transcripts: true,
122+
}),
123+
pre_processing: Some(PreProcessing {
124+
audio_enhancer: true,
125+
}),
126+
realtime_processing: Some(RealtimeProcessing {
127+
words_accurate_timestamps: true,
113128
}),
114129
};
115130

@@ -217,17 +232,35 @@ struct GladiaConfig<'a> {
217232
#[serde(skip_serializing_if = "Option::is_none")]
218233
custom_vocabulary: Option<Vec<String>>,
219234
#[serde(skip_serializing_if = "Option::is_none")]
235+
custom_metadata: Option<serde_json::Value>,
236+
#[serde(skip_serializing_if = "Option::is_none")]
220237
messages_config: Option<MessagesConfig>,
238+
#[serde(skip_serializing_if = "Option::is_none")]
239+
pre_processing: Option<PreProcessing>,
240+
#[serde(skip_serializing_if = "Option::is_none")]
241+
realtime_processing: Option<RealtimeProcessing>,
221242
}
222243

223244
#[derive(Serialize)]
224245
struct LanguageConfig {
225246
languages: Vec<String>,
247+
code_switching: bool,
226248
}
227249

228250
#[derive(Serialize)]
229251
struct MessagesConfig {
230252
receive_partial_transcripts: bool,
253+
receive_final_transcripts: bool,
254+
}
255+
256+
#[derive(Serialize)]
257+
struct PreProcessing {
258+
audio_enhancer: bool,
259+
}
260+
261+
#[derive(Serialize)]
262+
struct RealtimeProcessing {
263+
words_accurate_timestamps: bool,
231264
}
232265

233266
#[derive(Debug, Deserialize)]
@@ -327,14 +360,6 @@ impl GladiaAdapter {
327360
let data = msg.data;
328361
let utterance = data.utterance;
329362

330-
tracing::debug!(
331-
transcript = %utterance.text,
332-
is_final = data.is_final,
333-
channel = ?utterance.channel,
334-
session_id = %session_id,
335-
"gladia_transcript_received"
336-
);
337-
338363
if utterance.text.is_empty() && utterance.words.is_empty() {
339364
return vec![];
340365
}

0 commit comments

Comments
 (0)