diff --git a/denojs/README.md b/denojs/README.md
new file mode 100644
index 00000000..5a16cda6
--- /dev/null
+++ b/denojs/README.md
@@ -0,0 +1,120 @@
+# Deno Vosk API Wrapper
+
+This is an FFI wrapper for the Vosk library for Deno.
+
+## Usage
+
+These bindings largely follow the native Vosk C API, adapted for Deno's FFI
+interface. Some advanced methods might not be fully implemented yet.
+
+See the
+[demo folder](https://github.com/alphacep/vosk-api/tree/master/denojs/demo) for
+example usage.
+
+**Important:** Unlike the Node.js bindings (`vosk`), this Deno wrapper **does
+not bundle the native Vosk libraries**. You need to ensure the correct `libvosk`
+shared library (`.so`, `.dylib`, or `.dll`) is available on your system.
+
+The wrapper attempts to load the native library using the following search
+order:
+
+1. **Environment Variable:** Checks if the `VOSK_LIB_DIR_PATH` environment
+   variable is set. If it is, the wrapper attempts to load the library directly
+   from the specified directory (e.g., `$VOSK_LIB_DIR_PATH/libvosk.so`). This is
+   the recommended way to specify a custom library location.
+2. **Relative to the module (if possible):** If `VOSK_LIB_DIR_PATH` is not set
+   and the script's location (`import.meta.dirname`) can be determined, it looks
+   inside a `lib/` subdirectory relative to where this Deno module
+   (`mod.ts`) is located (e.g., `lib/linux-x86_64/libvosk.so`). This method
+   may be removed in the future, since Deno doesn't really have a concept of a
+   package.
+3. **System library paths:** If the above methods fail or are not applicable, it
+   tries loading the library by its standard name (`libvosk.so`,
+   `libvosk.dylib`, `libvosk.dll`), relying on the OS's standard dynamic library
+   loading mechanism (e.g., `LD_LIBRARY_PATH` on Linux, `PATH` on Windows,
+   system paths on macOS).
+
+**Example**
+
+```ts
+import * as vosk from "https://raw.githubusercontent.com/alphacep/vosk-api/v0.3.50/denojs/mod.ts";
+import wav from "npm:wav";
+import fs from "node:fs";
+import process from "node:process";
+import { Readable } from "node:stream";
+
+const MODEL_PATH = "model";
+let FILE_NAME = "test.wav";
+
+if (!fs.existsSync(MODEL_PATH)) {
+  console.log(
+    "Please download the model from https://alphacephei.com/vosk/models and unpack as " +
+      MODEL_PATH + " in the current folder.",
+  );
+  process.exit();
+}
+
+if (process.argv.length > 2) {
+  FILE_NAME = process.argv[2];
+}
+
+vosk.setLogLevel(0);
+const model = new vosk.Model(MODEL_PATH);
+
+const wfReader = new wav.Reader();
+const wfReadable = new Readable().wrap(wfReader);
+
+wfReader.on("format", async ({ audioFormat, sampleRate, channels }) => {
+  if (audioFormat != 1 || channels != 1) {
+    console.error("Audio file must be WAV format mono PCM.");
+    process.exit(1);
+  }
+  const rec = new vosk.Recognizer({ model: model, sampleRate: sampleRate });
+  rec.setMaxAlternatives(10);
+  rec.setWords(true);
+  rec.setPartialWords(true);
+  for await (const data of wfReadable) {
+    const end_of_speech = rec.acceptWaveform(data);
+    if (end_of_speech) {
+      console.log(JSON.stringify(rec.result(), null, 4));
+    } else {
+      console.log(JSON.stringify(rec.partialResult(), null, 4));
+    }
+  }
+  console.log(JSON.stringify(rec.finalResult(), null, 4));
+  rec.free();
+});
+
+fs.createReadStream(FILE_NAME, { "highWaterMark": 4096 }).pipe(wfReader).on(
+  "finish",
+  function () {
+    model.free();
+  },
+);
+```
+
+## About Vosk
+
+Vosk is an offline open source speech recognition toolkit. It enables speech
+recognition for 20+ languages and dialects - English, Indian English, German,
+French, Spanish, Portuguese, Chinese, Russian, Turkish, Vietnamese, Italian,
+Dutch, Catalan, Arabic, Greek, Farsi, Filipino, Ukrainian, Kazakh, Swedish,
+Japanese, Esperanto, Hindi, Czech, Polish. More to come.
+
+Vosk models are small (50 Mb) but provide continuous large vocabulary
+transcription, zero-latency response with streaming API, reconfigurable
+vocabulary and speaker identification.
+
+Vosk supplies speech recognition for chatbots, smart home appliances, virtual
+assistants. It can also create subtitles for movies, transcription for lectures
+and interviews.
+
+Vosk scales from small devices like Raspberry Pi or Android smartphone to big
+clusters.
+
+# Documentation
+
+For installation instructions (for the underlying Vosk library and models),
+examples, and documentation visit the
+[Vosk Website](https://alphacephei.com/vosk). See also the main project on
+[Github](https://github.com/alphacep/vosk-api).
diff --git a/denojs/demo/test_ffmpeg.js b/denojs/demo/test_ffmpeg.js
new file mode 100644
index 00000000..ab90ef96
--- /dev/null
+++ b/denojs/demo/test_ffmpeg.js
@@ -0,0 +1,59 @@
+// Demo: decode an audio file to 16 kHz mono PCM with ffmpeg and stream it to Vosk.
+import * as vosk from "../mod.ts";
+import fs from "node:fs";
+import { spawn } from "node:child_process";
+import process from "node:process";
+
+const MODEL_PATH = "model";
+// Default input; `let` because the first CLI argument may replace it below.
+let FILE_NAME = "test.wav";
+const SAMPLE_RATE = 16000;
+const BUFFER_SIZE = 4000;
+
+if (!fs.existsSync(MODEL_PATH)) {
+  console.log(
+    "Please download the model from https://alphacephei.com/vosk/models and unpack as " +
+      MODEL_PATH + " in the current folder.",
+  );
+  process.exit();
+}
+
+if (process.argv.length > 2) {
+  FILE_NAME = process.argv[2];
+}
+
+vosk.setLogLevel(0);
+const model = new vosk.Model(MODEL_PATH);
+const rec = new vosk.Recognizer({ model: model, sampleRate: SAMPLE_RATE });
+
+const ffmpeg_run = spawn("ffmpeg", [
+  "-loglevel",
+  "quiet",
+  "-i",
+  FILE_NAME,
+  "-ar",
+  String(SAMPLE_RATE),
+  "-ac",
+  "1",
+  "-f",
+  "s16le",
+  "-bufsize",
+  String(BUFFER_SIZE),
+  "-",
+]);
+
+ffmpeg_run.stdout.on("data", (stdout) => {
+  if (rec.acceptWaveform(stdout)) {
+    console.log(rec.result());
+  } else {
+    console.log(rec.partialResult());
+  }
+});
+
+// "close" fires once ffmpeg has exited and its stdout has been fully drained,
+// so flush the recognizer exactly once here and then release the native objects.
+ffmpeg_run.on("close", () => {
+  console.log(rec.finalResult());
+  rec.free();
+  model.free();
+});
diff --git
a/denojs/demo/test_microphone.js b/denojs/demo/test_microphone.js
new file mode 100644
index 00000000..35dc6b64
--- /dev/null
+++ b/denojs/demo/test_microphone.js
@@ -0,0 +1,50 @@
+import * as vosk from "../mod.ts";
+import mic from "npm:mic";
+import fs from "node:fs";
+import process from "node:process";
+
+const MODEL_PATH = "model";
+const SAMPLE_RATE = 16000;
+
+if (!fs.existsSync(MODEL_PATH)) {
+  console.log(
+    "Please download the model from https://alphacephei.com/vosk/models and unpack as " +
+      MODEL_PATH + " in the current folder.",
+  );
+  process.exit();
+}
+
+vosk.setLogLevel(0);
+const model = new vosk.Model(MODEL_PATH);
+const rec = new vosk.Recognizer({ model: model, sampleRate: SAMPLE_RATE });
+
+const micInstance = mic({
+  rate: String(SAMPLE_RATE),
+  channels: "1",
+  debug: false,
+  device: "default",
+});
+
+const micInputStream = micInstance.getAudioStream();
+
+micInputStream.on("data", (data) => {
+  if (rec.acceptWaveform(data)) {
+    console.log(rec.result());
+  } else {
+    console.log(rec.partialResult());
+  }
+});
+
+micInputStream.on("audioProcessExitComplete", function () {
+  console.log("Cleaning up");
+  console.log(rec.finalResult());
+  rec.free();
+  model.free();
+});
+
+process.on("SIGINT", function () {
+  console.log("\nStopping");
+  micInstance.stop();
+});
+
+micInstance.start();
diff --git a/denojs/demo/test_simple.js b/denojs/demo/test_simple.js
new file mode 100644
index 00000000..83b5803c
--- /dev/null
+++ b/denojs/demo/test_simple.js
@@ -0,0 +1,54 @@
+import * as vosk from "../mod.ts";
+import wav from "npm:wav";
+import fs from "node:fs";
+import process from "node:process";
+import { Readable } from "node:stream";
+
+const MODEL_PATH = "model";
+let FILE_NAME = "test.wav"; // `let`: replaced below when a file argument is given
+
+if (!fs.existsSync(MODEL_PATH)) {
+  console.log(
+    "Please download the model from https://alphacephei.com/vosk/models and unpack as " +
+      MODEL_PATH + " in the current folder.",
+  );
+  process.exit();
+}
+
+if (process.argv.length > 2) {
+  FILE_NAME =
process.argv[2];
+}
+
+vosk.setLogLevel(0);
+const model = new vosk.Model(MODEL_PATH);
+
+const wfReader = new wav.Reader();
+const wfReadable = new Readable().wrap(wfReader);
+
+wfReader.on("format", async ({ audioFormat, sampleRate, channels }) => {
+  if (audioFormat != 1 || channels != 1) {
+    console.error("Audio file must be WAV format mono PCM.");
+    process.exit(1);
+  }
+  const rec = new vosk.Recognizer({ model: model, sampleRate: sampleRate });
+  rec.setMaxAlternatives(10);
+  rec.setWords(true);
+  rec.setPartialWords(true);
+  for await (const data of wfReadable) {
+    const end_of_speech = rec.acceptWaveform(data);
+    if (end_of_speech) {
+      console.log(JSON.stringify(rec.result(), null, 4));
+    } else {
+      console.log(JSON.stringify(rec.partialResult(), null, 4));
+    }
+  }
+  console.log(JSON.stringify(rec.finalResult(), null, 4)); // finalResult() takes no arguments
+  rec.free();
+});
+
+fs.createReadStream(FILE_NAME, { "highWaterMark": 4096 }).pipe(wfReader).on(
+  "finish",
+  function () {
+    model.free();
+  },
+);
diff --git a/denojs/demo/test_simple_async.js b/denojs/demo/test_simple_async.js
new file mode 100644
index 00000000..116c451f
--- /dev/null
+++ b/denojs/demo/test_simple_async.js
@@ -0,0 +1,49 @@
+// TODO: FIXME Implement Async
+import * as vosk from "../mod.ts";
+import wav from "npm:wav";
+import fs from "node:fs";
+import process from "node:process";
+import { Readable } from "node:stream";
+import async from "npm:async";
+
+const MODEL_PATH = "model";
+
+if (!fs.existsSync(MODEL_PATH)) {
+  console.log(
+    "Please download the model from https://alphacephei.com/vosk/models and unpack as " +
+      MODEL_PATH + " in the current folder.",
+  );
+  process.exit();
+}
+
+// Process the same file 10 times with a single shared model
+const files = Array(10).fill("test.wav");
+const model = new vosk.Model(MODEL_PATH);
+
+async.filter(files, function (filePath, callback) {
+  const wfReader = new wav.Reader();
+  const wfReadable = new Readable().wrap(wfReader);
+
+  wfReader.on("format", async ({
audioFormat, sampleRate, channels }) => {
+    const rec = new vosk.Recognizer({ model: model, sampleRate: sampleRate });
+    if (audioFormat != 1 || channels != 1) {
+      console.error("Audio file must be WAV format mono PCM.");
+      process.exit(1);
+    }
+    for await (const data of wfReadable) {
+      const end_of_speech = rec.acceptWaveform(data); // sync call: mod.ts has no acceptWaveformAsync yet
+      if (end_of_speech) {
+        console.log(rec.result());
+      }
+    }
+    console.log(rec.finalResult());
+    rec.free();
+    // Signal we are done without errors
+    callback(null, true);
+  });
+
+  fs.createReadStream(filePath, { "highWaterMark": 4096 }).pipe(wfReader);
+}, function () {
+  model.free();
+  console.log("Done!!!!!");
+});
diff --git a/denojs/demo/test_speaker.js b/denojs/demo/test_speaker.js
new file mode 100644
index 00000000..e26be94f
--- /dev/null
+++ b/denojs/demo/test_speaker.js
@@ -0,0 +1,63 @@
+import * as vosk from "../mod.ts";
+import wav from "npm:wav";
+import fs from "node:fs";
+import process from "node:process";
+import { Readable } from "node:stream";
+
+const MODEL_PATH = "model";
+const SPEAKER_MODEL_PATH = "model-spk";
+let FILE_NAME = "test.wav"; // `let`: replaced below when a file argument is given
+
+if (!fs.existsSync(MODEL_PATH)) {
+  console.log(
+    "Please download the model from https://alphacephei.com/vosk/models and unpack as " +
+      MODEL_PATH + " in the current folder.",
+  );
+  process.exit();
+}
+
+if (!fs.existsSync(SPEAKER_MODEL_PATH)) {
+  console.log(
+    "Please download the speaker model from https://alphacephei.com/vosk/models and unpack as " +
+      SPEAKER_MODEL_PATH + " in the current folder.",
+  );
+  process.exit();
+}
+
+if (process.argv.length > 2) {
+  FILE_NAME = process.argv[2];
+}
+
+const model = new vosk.Model(MODEL_PATH);
+const speakerModel = new vosk.SpeakerModel(SPEAKER_MODEL_PATH);
+
+const wfReader = new wav.Reader();
+const wfReadable = new Readable().wrap(wfReader);
+
+wfReader.on("format", async ({ audioFormat, sampleRate, channels }) => {
+  if (audioFormat != 1 || channels != 1) {
+    console.error("Audio file must be WAV format mono PCM.");
+    process.exit(1);
+  }
+  // const rec = new vosk.Recognizer({ model: model,
+  //                                   speakerModel: speakerModel,
+  //                                   sampleRate: sampleRate });
+  const rec = new vosk.Recognizer({ model: model, sampleRate: sampleRate });
+  rec.setSpkModel(speakerModel);
+  for await (const data of wfReadable) {
+    const end_of_speech = rec.acceptWaveform(data);
+    if (end_of_speech) {
+      console.log(rec.result()); // utterance boundary: result(); finalResult() is for the very end
+    }
+  }
+  console.log(rec.finalResult());
+  rec.free();
+});
+
+fs.createReadStream(FILE_NAME, { highWaterMark: 4096 }).pipe(wfReader).on(
+  "finish",
+  function () {
+    model.free();
+    speakerModel.free();
+  },
+);
diff --git a/denojs/demo/test_srt.js b/denojs/demo/test_srt.js
new file mode 100644
index 00000000..a25e5bd2
--- /dev/null
+++ b/denojs/demo/test_srt.js
@@ -0,0 +1,82 @@
+import * as vosk from "../mod.ts";
+import fs from "node:fs";
+import process from "node:process";
+
+import { spawn } from "node:child_process";
+import { stringifySync } from "npm:subtitle";
+
+const MODEL_PATH = "model";
+// Default input; `let` because the first CLI argument may replace it below.
+let FILE_NAME = "test.wav";
+const SAMPLE_RATE = 16000;
+const BUFFER_SIZE = 4000;
+
+if (!fs.existsSync(MODEL_PATH)) {
+  console.log(
+    "Please download the model from https://alphacephei.com/vosk/models and unpack as " +
+      MODEL_PATH + " in the current folder.",
+  );
+  process.exit();
+}
+
+if (process.argv.length > 2) {
+  FILE_NAME = process.argv[2];
+}
+
+vosk.setLogLevel(-1);
+const model = new vosk.Model(MODEL_PATH);
+const rec = new vosk.Recognizer({ model: model, sampleRate: SAMPLE_RATE });
+rec.setWords(true);
+
+const ffmpeg_run = spawn("ffmpeg", [
+  "-loglevel",
+  "quiet",
+  "-i",
+  FILE_NAME,
+  "-ar",
+  String(SAMPLE_RATE),
+  "-ac",
+  "1",
+  "-f",
+  "s16le",
+  "-bufsize",
+  String(BUFFER_SIZE),
+  "-",
+]);
+
+const WORDS_PER_LINE = 7;
+const subs = [];
+const results = [];
+ffmpeg_run.stdout.on("data", (stdout) => {
+  if (rec.acceptWaveform(stdout)) {
+    results.push(rec.result());
+  }
+});
+
+// "close" fires after ffmpeg has exited and its stdout has been fully drained.
+ffmpeg_run.on("close", () => {
+  // Flush the recognizer once, after all audio was fed, then release native memory.
+  results.push(rec.finalResult());
+  rec.free();
+  model.free();
+  results.forEach((element) => {
+    // Skip null results and results that carry no word-level timing.
+    if (!element?.result) {
+      return;
+    }
+    const words = element.result;
+    // Emit one cue per group of at most WORDS_PER_LINE consecutive words.
+    for (let first = 0; first < words.length; first += WORDS_PER_LINE) {
+      const line = words.slice(first, first + WORDS_PER_LINE);
+      subs.push({
+        type: "cue",
+        data: {
+          start: line[0].start * 1000,
+          end: line[line.length - 1].end * 1000,
+          text: line.map((w) => w.word).join(" "),
+        },
+      });
+    }
+  });
+  console.log(stringifySync(subs, { format: "SRT" }));
+});
diff --git a/denojs/mod.ts b/denojs/mod.ts
new file mode 100644
index 00000000..3d36d1d8
--- /dev/null
+++ b/denojs/mod.ts
@@ -0,0 +1,490 @@
+// Deno port of the vosk Node.js bindings
+
+import path from "node:path";
+
+// --- Type Definitions (Converted from JSDoc) ---
+
+export interface WordResult {
+  conf: number; // The confidence rate in the detection. 0 For unlikely, and 1 for totally accurate.
+  start: number; // The start of the timeframe when the word is pronounced in seconds
+  end: number; // The end of the timeframe when the word is pronounced in seconds
+  word: string; // The word detected
+}
+
+export interface RecognitionResults {
+  result?: WordResult[]; // Details about the words that have been detected (optional if words not enabled)
+  text: string; // The complete sentence that have been detected
+  alternatives?: RecognitionResults[]; // Present if max_alternatives > 0
+}
+
+export interface SpeakerResults {
+  spk: number[]; // A floating vector representing speaker identity.
+  spk_frames: number; // The number of frames used to extract speaker vector.
+}
+
+// Parameter types for Recognizer constructor
+export interface BaseRecognizerParam {
+  model: Model; // The language model to be used
+  sampleRate: number; // The sample rate. Most models are trained at 16kHz
+}
+
+export interface GrammarRecognizerParam {
+  grammar: string[]; // The list of sentences to be recognized.
+}
+
+export interface SpeakerRecognizerParam {
+  speakerModel: SpeakerModel; // The SpeakerModel that will enable speaker identification
+}
+
+// Conditional result type based on Recognizer parameters
+export type RecognizerResult<T> = T extends SpeakerRecognizerParam
+  ? SpeakerResults & RecognitionResults
+  : RecognitionResults;
+
+export interface PartialResults {
+  partial: string;
+  partial_result?: WordResult[]; // If partial words are enabled
+}
+
+const __dirname = import.meta.dirname; // may be undefined; then no module-relative lookup is attempted
+
+// --- Determine Native Library Path ---
+let soname: string;
+let sonameLocal: string | undefined;
+const libDir = __dirname ? path.join(__dirname, "lib") : undefined;
+
+switch (Deno.build.os) {
+  case "windows": {
+    // TODO: Maybe also modify PATH like the original version
+    const arch = Deno.build.arch === "x86_64" ? "win-x86_64" : null;
+    if (!arch) throw new Error(`Unsupported Windows architecture: ${Deno.build.arch}`);
+    soname = "libvosk.dll";
+    if (libDir) {
+      sonameLocal = path.join(libDir, arch, soname);
+    }
+    break;
+  }
+  case "darwin": {
+    soname = "libvosk.dylib";
+    if (libDir) {
+      sonameLocal = path.join(libDir, "osx-universal", soname);
+    }
+    break;
+  }
+  case "linux": {
+    const arch = Deno.build.arch === "aarch64"
+      ? "linux-arm64"
+      : Deno.build.arch === "x86_64"
+      ? "linux-x86_64"
+      : null;
+    if (!arch) {
+      throw new Error(`Unsupported Linux architecture: ${Deno.build.arch}`);
+    }
+    soname = "libvosk.so";
+    if (libDir) {
+      sonameLocal = path.join(libDir, arch, soname);
+    }
+    break;
+  }
+  default:
+    throw new Error(`Unsupported OS: ${Deno.build.os}`);
+}
+
+// --- Define FFI Symbol Signatures ---
+// Note: Opaque pointers (like vosk_model*) are just 'pointer' in Deno FFI
+// C strings passed *to* the library are 'buffer' (pass Uint8Array)
+// C strings returned *from* the library are 'pointer' (read with UnsafePointerView)
+const voskSymbols = {
+  vosk_set_log_level: { parameters: ["i32"], result: "void" },
+  vosk_model_new: { parameters: ["buffer"], result: "pointer" },
+  vosk_model_free: { parameters: ["pointer"], result: "void" },
+  vosk_spk_model_new: { parameters: ["buffer"], result: "pointer" },
+  vosk_spk_model_free: { parameters: ["pointer"], result: "void" },
+  vosk_recognizer_new: { parameters: ["pointer", "f32"], result: "pointer" },
+  vosk_recognizer_new_spk: {
+    parameters: ["pointer", "f32", "pointer"],
+    result: "pointer",
+  },
+  vosk_recognizer_new_grm: {
+    parameters: ["pointer", "f32", "buffer"], // Grammar is C string
+    result: "pointer",
+  },
+  vosk_recognizer_free: { parameters: ["pointer"], result: "void" },
+  vosk_recognizer_set_max_alternatives: {
+    parameters: ["pointer", "i32"],
+    result: "void",
+  },
+  vosk_recognizer_set_words: {
+    parameters: ["pointer", "bool"], // NOTE(review): vosk_api.h declares `int words` - confirm "bool" is ABI-safe
+    result: "void",
+  },
+  vosk_recognizer_set_partial_words: {
+    parameters: ["pointer", "bool"], // NOTE(review): vosk_api.h declares `int partial_words` - confirm
+    result: "void",
+  },
+  vosk_recognizer_set_spk_model: {
+    parameters: ["pointer", "pointer"],
+    result: "void",
+  },
+  vosk_recognizer_accept_waveform: {
+    parameters: ["pointer", "buffer", "i32"], // data buffer, length in bytes
+    result: "bool", // NOTE(review): C API returns int (1, 0, or -1 on error) - confirm -1 handling
+  },
+  vosk_recognizer_result: { parameters: ["pointer"], result: "pointer" }, // Returns char*
+  vosk_recognizer_final_result: { parameters: ["pointer"], result: "pointer" }, // Returns char*
+  vosk_recognizer_partial_result: {
+    parameters: ["pointer"],
+    result: "pointer",
+  },
+  vosk_recognizer_reset: { parameters: ["pointer"], result: "void" },
+} as const;
+
+// --- Load the Native Library ---
+let libvosk: Deno.DynamicLibrary<typeof voskSymbols>;
+const userSetNativeLibPath = Deno.env.get("VOSK_LIB_DIR_PATH");
+if (userSetNativeLibPath) {
+  libvosk = Deno.dlopen(path.join(userSetNativeLibPath, soname), voskSymbols);
+} else if (sonameLocal) {
+  try {
+    // 1. try to load local library
+    libvosk = Deno.dlopen(sonameLocal, voskSymbols);
+  } catch {
+    try {
+      // 2. try to load library installed in the system
+      libvosk = Deno.dlopen(soname, voskSymbols);
+    } catch (e) {
+      console.error(
+        `Error loading Vosk library. Tried local: '${sonameLocal}' and system: '${soname}'.`,
+      );
+      throw e;
+    }
+  }
+} else {
+  try {
+    // No module-relative path is known here, so only the system lookup is attempted
+    libvosk = Deno.dlopen(soname, voskSymbols);
+  } catch (e) {
+    console.error(
+      `Error loading Vosk library. Tried system: '${soname}'.`,
+    );
+    throw e;
+  }
+}
+
+// --- Helper Functions ---
+
+/** Encodes a JavaScript string to a null-terminated C string (Uint8Array). */
+function encodeCString(value: string): Uint8Array {
+  return new TextEncoder().encode(value + "\0");
+}
+
+/** Reads a C string (char*) from a Deno pointer. Returns null if pointer is null. */
+function readCString(pointer: Deno.PointerValue): string | null {
+  if (pointer === null) {
+    return null;
+  }
+  try {
+    return Deno.UnsafePointerView.getCString(pointer);
+  } catch (e) {
+    console.error("Failed to read C string from pointer");
+    throw e;
+  }
+}
+
+// --- Exported Vosk API ---
+
+/**
+ * Set log level for Kaldi messages
+ * @param level The higher, the more verbose. 0 for infos and errors. Less than 0 for silence.
+ */
+export function setLogLevel(level: number): void {
+  libvosk.symbols.vosk_set_log_level(level);
+}
+
+/**
+ * Build a Model from a model file path.
+ * @see models [models](https://alphacephei.com/vosk/models)
+ */
+export class Model {
+  /** @internal */
+  public handle: Deno.PointerValue | null = null; // Native model pointer; reset to null by free()
+
+  /**
+   * Build a Model to be used with the voice recognition. Each language should have its own Model
+   * for the speech recognition to work. Throws an Error if the native model cannot be created.
+   * @param modelPath The absolute or relative pathname to the model directory
+   * @see models [models](https://alphacephei.com/vosk/models)
+   */
+  constructor(modelPath: string) {
+    const encodedPath = encodeCString(modelPath);
+    this.handle = libvosk.symbols.vosk_model_new(encodedPath);
+    if (this.handle === null) {
+      // A null handle means vosk_model_new returned a NULL pointer
+      throw new Error(`Failed to create Vosk model from path: ${modelPath}`);
+    }
+  }
+
+  /**
+   * Releases the model memory. Calling it again is a no-op.
+   *
+   * The model object is reference-counted so if some recognizer
+   * depends on this model, model might still stay alive. When
+   * last recognizer is released, model will be released too.
+   */
+  free(): void {
+    if (this.handle) {
+      libvosk.symbols.vosk_model_free(this.handle);
+      this.handle = null; // Prevent double-free
+    }
+  }
+}
+
+/**
+ * Build a Speaker Model from a speaker model file path.
+ * The Speaker Model enables speaker identification.
+ * @see models [models](https://alphacephei.com/vosk/models)
+ */
+export class SpeakerModel {
+  /** @internal */
+  public handle: Deno.PointerValue | null = null; // Native speaker-model pointer; reset to null by free()
+
+  /**
+   * Loads speaker model data from the file and returns the model object
+   * @param modelPath the path of the model on the filesystem
+   * @see models [models](https://alphacephei.com/vosk/models)
+   */
+  constructor(modelPath: string) {
+    const encodedPath = encodeCString(modelPath);
+    this.handle = libvosk.symbols.vosk_spk_model_new(encodedPath);
+    if (this.handle === null) {
+      throw new Error(
+        `Failed to create Vosk speaker model from path: ${modelPath}`,
+      );
+    }
+  }
+
+  /**
+   * Releases the speaker model memory. Calling it again is a no-op.
+   *
+   * The model object is reference-counted so if some recognizer
+   * depends on this model, model might still stay alive. When
+   * last recognizer is released, model will be released too.
+   */
+  free(): void {
+    if (this.handle) {
+      libvosk.symbols.vosk_spk_model_free(this.handle);
+      this.handle = null; // Prevent double-free
+    }
+  }
+}
+
+// Utility types for XOR logic (same as original)
+type Without<T, U> = { [P in Exclude<keyof T, keyof U>]?: never };
+type XOR<T, U> = (T | U) extends object
+  ? (Without<T, U> & U) | (Without<U, T> & T)
+  : T | U;
+
+/**
+ * Create a Recognizer that will be able to transform audio streams into text using a Model.
+ * @template T Extra parameter for speaker or grammar options.
+ * @see Model
+ */
+export class Recognizer<
+  T extends XOR<SpeakerRecognizerParam, Partial<GrammarRecognizerParam>>,
+> {
+  /** @internal */
+  public handle: Deno.PointerValue | null = null; // Native recognizer pointer; reset to null by free()
+
+  /**
+   * Create a Recognizer that will handle speech to text recognition.
+   * @constructor
+   * @param param The Recognizer parameters, including model, sampleRate, and optional grammar or speakerModel.
+   *
+   * Sometimes when you want to improve recognition accuracy and when you don't need
+   * to recognize large vocabulary you can specify a list of phrases to recognize. This
+   * will improve recognizer speed and accuracy but might return [unk] if user said
+   * something different.
+   *
+   * Only recognizers with lookahead models support this type of quick configuration.
+   * Precompiled HCLG graph models are not supported.
+   */
+  constructor(param: T & BaseRecognizerParam) {
+    const { model, sampleRate } = param;
+
+    // Prevent using both grammar and speakerModel simultaneously
+    if (
+      Object.hasOwn(param, "speakerModel") &&
+      Object.hasOwn(param, "grammar")
+    ) {
+      throw new Error(
+        "grammar and speakerModel cannot be used together in the constructor.",
+      );
+    }
+
+    if (Object.hasOwn(param, "speakerModel") && param.speakerModel) {
+      // Speaker recognizer
+      const spkModel = param.speakerModel;
+      this.handle = libvosk.symbols.vosk_recognizer_new_spk(
+        model.handle,
+        sampleRate,
+        spkModel.handle,
+      );
+    } else if (Object.hasOwn(param, "grammar") && param.grammar) {
+      // Grammar recognizer: the phrase list is passed to the library as a JSON array string
+      const grammarJson = JSON.stringify(param.grammar);
+      const encodedGrammar = encodeCString(grammarJson);
+      this.handle = libvosk.symbols.vosk_recognizer_new_grm(
+        model.handle,
+        sampleRate,
+        encodedGrammar,
+      );
+    } else {
+      // Base recognizer
+      this.handle = libvosk.symbols.vosk_recognizer_new(
+        model.handle,
+        sampleRate,
+      );
+    }
+
+    if (this.handle === null) {
+      throw new Error("Failed to create Vosk recognizer.");
+    }
+  }
+
+  /**
+   * Releases the recognizer memory. Do not call other methods afterwards: they would pass a null pointer to the library.
+   */
+  free(): void {
+    if (this.handle) {
+      libvosk.symbols.vosk_recognizer_free(this.handle);
+      this.handle = null; // Prevent double-free
+    }
+  }
+
+  /** Configures recognizer to output n-best results. */
+  setMaxAlternatives(maxAlternatives: number): void {
+    libvosk.symbols.vosk_recognizer_set_max_alternatives(
+      this.handle,
+      maxAlternatives,
+    );
+  }
+
+  /** Configures recognizer to output words with times. */
+  setWords(words: boolean): void {
+    libvosk.symbols.vosk_recognizer_set_words(this.handle, words);
+  }
+
+  /** Configures recognizer to output partial words with times. */
+  setPartialWords(partialWords: boolean): void {
+    libvosk.symbols.vosk_recognizer_set_partial_words(
+      this.handle,
+      partialWords,
+    );
+  }
+
+  /** Adds speaker recognition model to an already created recognizer. */
+  setSpkModel(spkModel: SpeakerModel): void {
+    libvosk.symbols.vosk_recognizer_set_spk_model(
+      this.handle,
+      spkModel.handle,
+    );
+  }
+
+  /**
+   * Synchronously accept and process a new chunk of voice data.
+   *
+   * @param data Audio data in PCM 16-bit mono format (as Uint8Array).
+   * @returns `true` if silence is detected (or end of segment in grammar mode), indicating a result might be ready.
+   */
+  acceptWaveform(data: Uint8Array): boolean {
+    return libvosk.symbols.vosk_recognizer_accept_waveform(
+      this.handle,
+      data,
+      data.byteLength, // Use byteLength for buffer size
+    );
+  }
+
+  // TODO: async
+
+  /**
+   * Returns the speech recognition result as a raw JSON string.
+   * Call this after `acceptWaveform` returns `true` or after feeding all data.
+   * @returns JSON string result, or null if the library returned a null pointer.
+   */
+  resultString(): string | null {
+    const resultPtr = libvosk.symbols.vosk_recognizer_result(this.handle);
+    return readCString(resultPtr);
+  }
+
+  /**
+   * Returns the parsed speech recognition result.
+   * @returns The parsed result object, cast to the appropriate type based on Recognizer options.
+   */
+  result(): RecognizerResult<T> | null {
+    const jsonString = this.resultString();
+    if (jsonString) {
+      return JSON.parse(jsonString) as RecognizerResult<T>;
+    } else {
+      return null;
+    }
+  }
+
+  /**
+   * Returns the partial speech recognition result as a raw JSON string.
+   * Result may change as more data is processed.
+   * @returns JSON string partial result, or null if the library returned a null pointer.
+   */
+  partialResultString(): string | null {
+    const resultPtr = libvosk.symbols.vosk_recognizer_partial_result(
+      this.handle,
+    );
+    return readCString(resultPtr);
+  }
+
+  /**
+   * Returns the parsed partial speech recognition result.
+   * @returns The parsed partial result object.
+   */
+  partialResult(): PartialResults | null {
+    const jsonString = this.partialResultString();
+    if (jsonString) {
+      return JSON.parse(jsonString) as PartialResults;
+    } else {
+      return null;
+    }
+  }
+
+  /**
+   * Returns the final speech recognition result as a raw JSON string.
+   * Use this at the very end of processing to flush internal buffers.
+   * @returns JSON string final result, or null if the library returned a null pointer.
+   */
+  finalResultString(): string | null {
+    const resultPtr = libvosk.symbols.vosk_recognizer_final_result(
+      this.handle,
+    );
+    return readCString(resultPtr);
+  }
+
+  /**
+   * Returns the parsed final speech recognition result.
+   * Use this at the very end of processing.
+   * @returns The parsed final result object.
+   */
+  finalResult(): RecognizerResult<T> | null {
+    const jsonString = this.finalResultString();
+    if (jsonString) {
+      return JSON.parse(jsonString) as RecognizerResult<T>;
+    } else {
+      return null;
+    }
+  }
+
+  /**
+   * Resets the recognizer, clearing partial results.
+   * Recognition can then continue from scratch.
+   */
+  reset(): void {
+    libvosk.symbols.vosk_recognizer_reset(this.handle);
+  }
+}