Skip to content
46 changes: 40 additions & 6 deletions dimos/stream/audio/stt/node_whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from typing import Any

from reactivex import Observable, create, disposable
import whisper # type: ignore[import-untyped]

from dimos.stream.audio.base import (
AbstractAudioConsumer,
Expand All @@ -27,10 +26,31 @@

logger = setup_logger()

# Select a whisper backend at import time. Prefer the reference
# openai-whisper package; fall back to faster-whisper when it is absent.
# Exactly one import succeeds, and _USE_FASTER_WHISPER records which
# backend WhisperNode must drive (their transcribe() APIs differ).
try:
    import whisper  # type: ignore[import-untyped]

    _USE_FASTER_WHISPER = False
except ImportError:
    try:
        from faster_whisper import WhisperModel  # type: ignore[import-untyped]

        # logger.warning, not the deprecated logger.warn alias.
        logger.warning(
            "openai-whisper not installed, falling back to faster-whisper. "
            "Install openai-whisper for the full backend: pip install openai-whisper",
        )
        _USE_FASTER_WHISPER = True
    except ImportError:
        # `from None` suppresses the chained tracebacks of the two failed
        # imports: the actionable information is the install instruction.
        raise ImportError(
            "No whisper backend found. "
            "Install faster-whisper (pip install faster-whisper) "
            "or openai-whisper (pip install dimos[whisper])."
        ) from None


class WhisperNode(AbstractAudioConsumer, AbstractTextEmitter):
"""
A node that transcribes audio using OpenAI's Whisper model and emits the transcribed text.
A node that transcribes audio using OpenAI Whisper or faster-whisper and emits
the transcribed text. Prefers openai-whisper if installed, falls back to faster-whisper.
"""

def __init__(
Expand All @@ -41,8 +61,15 @@ def __init__(
if modelopts is None:
modelopts = {"language": "en", "fp16": False}
self.audio_observable = None
self.modelopts = modelopts
self.model = whisper.load_model(model)

if _USE_FASTER_WHISPER:
compute_type = "float16" if modelopts.get("fp16", False) else "int8"
modelopts = {k: v for k, v in modelopts.items() if k != "fp16"}
self.modelopts = modelopts
self.model = WhisperModel(model, device="auto", compute_type=compute_type)
Comment thread
Dreamsorcerer marked this conversation as resolved.
else:
self.modelopts = modelopts
self.model = whisper.load_model(model)

def consume_audio(self, audio_observable: Observable) -> "WhisperNode": # type: ignore[type-arg]
"""
Expand Down Expand Up @@ -73,8 +100,15 @@ def on_subscribe(observer, scheduler):
# Subscribe to the audio source
def on_audio_event(event: AudioEvent) -> None:
try:
result = self.model.transcribe(event.data.flatten(), **self.modelopts)
observer.on_next(result["text"].strip())
if _USE_FASTER_WHISPER:
segments, _info = self.model.transcribe(
event.data.flatten(), **self.modelopts
)
text = " ".join(seg.text.strip() for seg in segments)
else:
result = self.model.transcribe(event.data.flatten(), **self.modelopts)
text = result["text"].strip()
observer.on_next(text)
except Exception as e:
logger.error(f"Error processing audio event: {e}")
observer.on_error(e)
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,8 @@ agents = [

# Audio
"openai",
"openai-whisper",
"sounddevice",
"faster-whisper>=1.0.0",
]

web = [
Expand Down Expand Up @@ -246,6 +246,7 @@ dev = [
"requests-mock==1.12.1",
"terminaltexteffects==0.12.2",
"watchdog>=3.0.0",
"openai-whisper",

# docs
"md-babel-py==1.1.1",
Expand Down Expand Up @@ -391,6 +392,7 @@ module = [
"cyclonedds.*",
"dimos_lcm.*",
"etils",
"faster_whisper",
"geometry_msgs.*",
"lazy_loader",
"mujoco",
Expand Down
76 changes: 69 additions & 7 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading