From 3bfb6b7c6a44ae3a6704a793e76107d98b9a0d80 Mon Sep 17 00:00:00 2001 From: Sam Bull Date: Fri, 17 Apr 2026 14:10:27 +0100 Subject: [PATCH 01/11] Move whisper to separate extra --- pyproject.toml | 6 +++++- uv.lock | 23 ++++++++++++++++++----- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 364fc6d22b..d9ce0aa30d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -169,10 +169,14 @@ agents = [ # Audio "openai", - "openai-whisper", "sounddevice", ] +whisper = [ + "dimos[agents]", + "openai-whisper", +] + web = [ "fastapi>=0.115.6", "sse-starlette>=2.2.1", diff --git a/uv.lock b/uv.lock index 529842294b..df97aea85f 100644 --- a/uv.lock +++ b/uv.lock @@ -1729,7 +1729,6 @@ agents = [ { name = "langchain-text-splitters" }, { name = "ollama" }, { name = "openai" }, - { name = "openai-whisper" }, { name = "sounddevice" }, ] base = [ @@ -1753,7 +1752,6 @@ base = [ { name = "ollama" }, { name = "omegaconf" }, { name = "openai" }, - { name = "openai-whisper" }, { name = "pillow" }, { name = "playground" }, { name = "pygame" }, @@ -1959,7 +1957,6 @@ unitree = [ { name = "ollama" }, { name = "omegaconf" }, { name = "openai" }, - { name = "openai-whisper" }, { name = "pillow" }, { name = "playground" }, { name = "pygame" }, @@ -1983,6 +1980,21 @@ web = [ { name = "sse-starlette" }, { name = "uvicorn" }, ] +whisper = [ + { name = "anthropic" }, + { name = "bitsandbytes", marker = "sys_platform == 'linux'" }, + { name = "langchain" }, + { name = "langchain-chroma" }, + { name = "langchain-core" }, + { name = "langchain-huggingface" }, + { name = "langchain-ollama" }, + { name = "langchain-openai" }, + { name = "langchain-text-splitters" }, + { name = "ollama" }, + { name = "openai" }, + { name = "openai-whisper" }, + { name = "sounddevice" }, +] [package.metadata] requires-dist = [ @@ -1997,6 +2009,7 @@ requires-dist = [ { name = "ctransformers", extras = ["cuda"], marker = "extra == 'cuda'", specifier = "==0.2.27" }, { name = "cupy-cuda12x", marker = "platform_machine == 'x86_64' and extra == 'cuda'", specifier = "==13.6.0" }, { name = "cyclonedds", marker = "extra == 'dds'", specifier = ">=0.10.5" }, + { name = "dimos", extras = ["agents"], marker = "extra == 'whisper'" }, { name = "dimos", extras = ["agents", "web", "perception", "visualization", "sim"], marker = "extra == 'base'" }, { name = "dimos", extras = ["base"], marker = "extra == 'unitree'" }, { name = "dimos", extras = ["dev"], marker = "extra == 'dds'" }, @@ -2051,7 +2064,7 @@ requires-dist = [ { name = "open3d-unofficial-arm", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, { name = "open3d-unofficial-arm", marker = "platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'docker'" }, { name = "openai", marker = "extra == 'agents'" }, - { name = "openai-whisper", marker = "extra == 'agents'" }, + { name = "openai-whisper", marker = "extra == 'whisper'" }, { name = "opencv-contrib-python", marker = "extra == 'misc'", specifier = "==4.10.0.84" }, { name = "opencv-python" }, { name = "opencv-python-headless", marker = "extra == 'docker'" }, @@ -2152,7 +2165,7 @@ requires-dist = [ { name = "xformers", marker = "platform_machine == 'x86_64' and extra == 'cuda'", specifier = ">=0.0.20" }, { name = "yapf", marker = "extra == 'misc'", specifier = "==0.40.2" }, ] -provides-extras = ["misc", "visualization", "agents", "web", "perception", "unitree", "manipulation", "cpu", "cuda", "dev", "psql", "sim", "drone", "dds", "docker", "base"] +provides-extras = ["misc", "visualization", "agents", "whisper", "web", "perception", "unitree", "manipulation", "cpu", "cuda", "dev", "psql", "sim", "drone", "dds", "docker", "base"] [[package]] name = "dimos-lcm" From 3d2f3a40b1bdc582a182126f4ce563d8fa105612 Mon Sep 17 00:00:00 2001 From: Sam Bull Date: Fri, 17 Apr 2026 16:18:34 +0100 Subject: [PATCH 02/11] Use faster-whisper --- dimos/stream/audio/stt/node_whisper.py | 42 ++++++++++++-- pyproject.toml | 6 ++ uv.lock | 78 +++++++++++++++++++++++++- 3 files changed, 119 insertions(+), 7 deletions(-) diff --git a/dimos/stream/audio/stt/node_whisper.py b/dimos/stream/audio/stt/node_whisper.py index e162d150a1..911d81b8f1 100644 --- a/dimos/stream/audio/stt/node_whisper.py +++ b/dimos/stream/audio/stt/node_whisper.py @@ -16,7 +16,6 @@ from typing import Any from reactivex import Observable, create, disposable -import whisper # type: ignore[import-untyped] from dimos.stream.audio.base import ( AbstractAudioConsumer, @@ -25,12 +24,29 @@ from dimos.stream.audio.text.base import AbstractTextEmitter from dimos.utils.logging_config import setup_logger +try: + import whisper # type: ignore[import-untyped] + + _USE_FASTER_WHISPER = False +except ImportError: + try: + from faster_whisper import WhisperModel # type: ignore[import-untyped] + + _USE_FASTER_WHISPER = True + except ImportError: + raise ImportError( + "No whisper backend found. " + "Install openai-whisper (pip install dimos[whisper]) " + "or faster-whisper (pip install dimos[whisper-cpu])." + ) + logger = setup_logger() class WhisperNode(AbstractAudioConsumer, AbstractTextEmitter): """ - A node that transcribes audio using OpenAI's Whisper model and emits the transcribed text. + A node that transcribes audio using OpenAI Whisper or faster-whisper and emits + the transcribed text. Prefers openai-whisper if installed, falls back to faster-whisper. """ def __init__( @@ -41,8 +57,15 @@ def __init__( if modelopts is None: modelopts = {"language": "en", "fp16": False} self.audio_observable = None - self.modelopts = modelopts - self.model = whisper.load_model(model) + + if _USE_FASTER_WHISPER: + fp16 = modelopts.pop("fp16", False) + compute_type = "float16" if fp16 else "int8" + self.modelopts = modelopts + self.model = WhisperModel(model, device="auto", compute_type=compute_type) + else: + self.modelopts = modelopts + self.model = whisper.load_model(model) def consume_audio(self, audio_observable: Observable) -> "WhisperNode": # type: ignore[type-arg] """ @@ -73,8 +96,15 @@ def on_subscribe(observer, scheduler): # Subscribe to the audio source def on_audio_event(event: AudioEvent) -> None: try: - result = self.model.transcribe(event.data.flatten(), **self.modelopts) - observer.on_next(result["text"].strip()) + if _USE_FASTER_WHISPER: + segments, _info = self.model.transcribe( + event.data.flatten(), **self.modelopts + ) + text = " ".join(seg.text.strip() for seg in segments) + else: + result = self.model.transcribe(event.data.flatten(), **self.modelopts) + text = result["text"].strip() + observer.on_next(text) except Exception as e: logger.error(f"Error processing audio event: {e}") observer.on_error(e) diff --git a/pyproject.toml b/pyproject.toml index d9ce0aa30d..d2aadbc10a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -177,6 +177,11 @@ whisper = [ "openai-whisper", ] +whisper-cpu = [ + "dimos[agents]", + "faster-whisper", +] + web = [ "fastapi>=0.115.6", "sse-starlette>=2.2.1", @@ -394,6 +399,7 @@ module = [ "cyclonedds.*", "dimos_lcm.*", "etils", + "faster_whisper", "geometry_msgs.*", "lazy_loader", "mujoco", diff --git a/uv.lock b/uv.lock index df97aea85f..9e180ff406 100644 --- a/uv.lock +++ b/uv.lock @@ -1525,6 +1525,49 @@ cuda = [ { name = "nvidia-cuda-runtime-cu12", version = "12.9.79", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin' or sys_platform == 'win32'" }, ] +[[package]] +name = "ctranslate2" +version = "4.7.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "pyyaml" }, + { name = "setuptools" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/e0/b69c40c3d739b213a78d327071240590792071b4f890e34088b03b95bb1e/ctranslate2-4.7.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9017a355dd7c6d29dc3bca6e9fc74827306c61b702c66bb1f6b939655e7de3fa", size = 1255773, upload-time = "2026-02-04T06:11:04.769Z" }, + { url = "https://files.pythonhosted.org/packages/51/29/e5c2fc1253e3fb9b2c86997f36524bba182a8ed77fb4f8fe8444a5649191/ctranslate2-4.7.1-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:6abcd0552285e7173475836f9d133e04dfc3e42ca8e6930f65eaa4b8b13a47fa", size = 11914945, upload-time = "2026-02-04T06:11:06.853Z" }, + { url = "https://files.pythonhosted.org/packages/03/25/e7fe847d3f02c84d2e9c5e8312434fbeab5af3d8916b6c8e2bdbe860d052/ctranslate2-4.7.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8492cba605319e0d7f2760180957d5a2a435dfdebcef1a75d2ade740e6b9fb0b", size = 16547973, upload-time = "2026-02-04T06:11:09.021Z" }, + { url = "https://files.pythonhosted.org/packages/68/75/074ed22bc340c2e26c09af6bf85859b586516e4e2d753b20189936d0dcf7/ctranslate2-4.7.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:688bd82482b5d057eff5bc1e727f11bb9a1277b7e4fce8ab01fd3bb70e69294b", size = 38636471, upload-time = "2026-02-04T06:11:12.146Z" }, + { url = "https://files.pythonhosted.org/packages/76/b6/9baf8a565f6dcdbfbc9cfd179dd6214529838cda4e91e89b616045a670f0/ctranslate2-4.7.1-cp310-cp310-win_amd64.whl", hash = "sha256:3b39a5f4e3c87ac91976996458a64ba08a7cbf974dc0be4e6df83a9e040d4bd2", size = 18842389, upload-time = "2026-02-04T06:11:15.154Z" }, + { url = "https://files.pythonhosted.org/packages/da/25/41920ccee68e91cb6fa0fc9e8078ab2b7839f2c668f750dc123144cb7c6e/ctranslate2-4.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f74200bab9996b14a57cf6f7cb27d0921ceedc4acc1e905598e3e85b4d75b1ec", size = 1256943, upload-time = "2026-02-04T06:11:17.781Z" }, + { url = "https://files.pythonhosted.org/packages/79/22/bc81fcc9f10ba4da3ffd1a9adec15cfb73cb700b3bbe69c6c8b55d333316/ctranslate2-4.7.1-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:59b427eb3ac999a746315b03a63942fddd351f511db82ba1a66880d4dea98e25", size = 11916445, upload-time = "2026-02-04T06:11:19.938Z" }, + { url = "https://files.pythonhosted.org/packages/0a/a7/494a66bb02c7926331cadfff51d5ce81f5abfb1e8d05d7f2459082f31b48/ctranslate2-4.7.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:95f0c1051c180669d2a83a44b44b518b2d1683de125f623bbc81ad5dd6f6141c", size = 16696997, upload-time = "2026-02-04T06:11:22.697Z" }, + { url = "https://files.pythonhosted.org/packages/ed/4e/b48f79fd36e5d3c7e12db383aa49814c340921a618ef7364bd0ced670644/ctranslate2-4.7.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0ed92d9ab0ac6bc7005942be83d68714c80adb0897ab17f98157294ee0374347", size = 38836379, upload-time = "2026-02-04T06:11:26.325Z" }, + { url = "https://files.pythonhosted.org/packages/d2/23/8c01ac52e1f26fc4dbe985a35222ae7cd365bbf7ee5db5fd5545d8926f91/ctranslate2-4.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:67d9ad9b69933fbfeee7dcec899b2cd9341d5dca4fdfb53e8ba8c109dc332ee1", size = 18843315, upload-time = "2026-02-04T06:11:29.441Z" }, + { url = "https://files.pythonhosted.org/packages/fc/0f/581de94b64c5f2327a736270bc7e7a5f8fe5cf1ed56a2203b52de4d8986a/ctranslate2-4.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4c0cbd46a23b8dc37ccdbd9b447cb5f7fadc361c90e9df17d82ca84b1f019986", size = 1257089, upload-time = "2026-02-04T06:11:32.442Z" }, + { url = "https://files.pythonhosted.org/packages/3d/e9/d55b0e436362f9fe26bd98fefd2dd5d81926121f1d7f799c805e6035bb26/ctranslate2-4.7.1-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:5b141ddad1da5f84cf3c2a569a56227a37de649a555d376cbd9b80e8f0373dd8", size = 11918502, upload-time = "2026-02-04T06:11:33.986Z" }, + { url = "https://files.pythonhosted.org/packages/ec/ce/9f29f0b0bb4280c2ebafb3ddb6cdff8ef1c2e185ee020c0ec0ecba7dc934/ctranslate2-4.7.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d00a62544db4a3caaa58a3c50d39b25613c042b430053ae32384d94eb1d40990", size = 16859601, upload-time = "2026-02-04T06:11:36.227Z" }, + { url = "https://files.pythonhosted.org/packages/b3/86/428d270fd72117d19fb48ed3211aa8a3c8bd7577373252962cb634e0fd01/ctranslate2-4.7.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:722b93a89647974cbd182b4c7f87fefc7794fff7fc9cbd0303b6447905cc157e", size = 38995338, upload-time = "2026-02-04T06:11:42.789Z" }, + { url = "https://files.pythonhosted.org/packages/4a/f4/d23dbfb9c62cb642c114a30f05d753ba61d6ffbfd8a3a4012fe85a073bcb/ctranslate2-4.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:d0f734dc3757118094663bdaaf713f5090c55c1927fb330a76bb8b84173940e8", size = 18844949, upload-time = "2026-02-04T06:11:45.436Z" }, + { url = "https://files.pythonhosted.org/packages/34/6d/eb49ba05db286b4ea9d5d3fcf5f5cd0a9a5e218d46349618d5041001e303/ctranslate2-4.7.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6b2abf2929756e3ec6246057b56df379995661560a2d776af05f9d97f63afcf5", size = 1256960, upload-time = "2026-02-04T06:11:47.487Z" }, + { url = "https://files.pythonhosted.org/packages/45/5a/b9cce7b00d89fc6fdeaf27587aa52d0597b465058563e93ff50910553bdd/ctranslate2-4.7.1-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:857ef3959d6b1c40dc227c715a36db33db2d097164996d6c75b6db8e30828f52", size = 11918645, upload-time = "2026-02-04T06:11:49.599Z" }, + { url = "https://files.pythonhosted.org/packages/ea/03/c0db0a5276599fb44ceafa2f2cb1afd5628808ec406fe036060a39693680/ctranslate2-4.7.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:393a9e7e989034660526a2c0e8bb65d1924f43d9a5c77d336494a353d16ba2a4", size = 16860452, upload-time = "2026-02-04T06:11:52.276Z" }, + { url = "https://files.pythonhosted.org/packages/0b/03/4e3728ce29d192ee75ed9a2d8589bf4f19edafe5bed3845187de51b179a3/ctranslate2-4.7.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a3d0682f2b9082e31c73d75b45f16cde77355ab76d7e8356a24c3cb2480a6d3", size = 38995174, upload-time = "2026-02-04T06:11:55.477Z" }, + { url = "https://files.pythonhosted.org/packages/9b/15/6e8e87c6a201d69803a79ac2e29623ce7c2cc9cd1df9db99810cca714373/ctranslate2-4.7.1-cp313-cp313-win_amd64.whl", hash = "sha256:baa6d2b10f57933d8c11791e8522659217918722d07bbef2389a443801125fe7", size = 18844953, upload-time = "2026-02-04T06:11:58.519Z" }, + { url = "https://files.pythonhosted.org/packages/fd/73/8a6b7ba18cad0c8667ee221ddab8c361cb70926440e5b8dd0e81924c28ac/ctranslate2-4.7.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:d5dfb076566551f4959dfd0706f94c923c1931def9b7bb249a2caa6ab23353a0", size = 1257560, upload-time = "2026-02-04T06:12:00.926Z" }, + { url = "https://files.pythonhosted.org/packages/70/c2/8817ca5d6c1b175b23a12f7c8b91484652f8718a76353317e5919b038733/ctranslate2-4.7.1-cp314-cp314-macosx_11_0_x86_64.whl", hash = "sha256:eecdb4ed934b384f16e8c01b185b082d6b5ffc7dcbb0b6a6eb48cd465282d957", size = 11918995, upload-time = "2026-02-04T06:12:02.875Z" }, + { url = "https://files.pythonhosted.org/packages/ac/33/b8eb3acc67bbca4d9872fc9ff94db78e6167a7ba5cd932f585d1560effc7/ctranslate2-4.7.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1aa6796edcc3c8d163c9e39c429d50076d266d68980fed9d1b2443f617c67e9e", size = 16844162, upload-time = "2026-02-04T06:12:05.099Z" }, + { url = "https://files.pythonhosted.org/packages/80/11/6474893b07121057035069a0a483fe1cd8c47878213f282afb4c0c6fc275/ctranslate2-4.7.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:24c0482c51726430fb83724451921c0e539d769c8618dcfd46b1645e7f75960d", size = 38966728, upload-time = "2026-02-04T06:12:07.923Z" }, + { url = "https://files.pythonhosted.org/packages/94/88/8fc7ff435c5e783e5fad9586d839d463e023988dbbbad949d442092d01f1/ctranslate2-4.7.1-cp314-cp314-win_amd64.whl", hash = "sha256:76db234c0446a23d20dd8eeaa7a789cc87d1d05283f48bf3152bae9fa0a69844", size = 19100788, upload-time = "2026-02-04T06:12:10.592Z" }, + { url = "https://files.pythonhosted.org/packages/d9/b3/f100013a76a98d64e67c721bd4559ea4eeb54be3e4ac45f4d801769899af/ctranslate2-4.7.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:058c9db2277dc8b19ecc86c7937628f69022f341844b9081d2ab642965d88fc6", size = 1280179, upload-time = "2026-02-04T06:12:12.596Z" }, + { url = "https://files.pythonhosted.org/packages/39/22/b77f748015667a5e2ca54a5ee080d7016fce34314f0e8cf904784549305a/ctranslate2-4.7.1-cp314-cp314t-macosx_11_0_x86_64.whl", hash = "sha256:5abcf885062c7f28a3f9a46be8d185795e8706ac6230ad086cae0bc82917df31", size = 11940166, upload-time = "2026-02-04T06:12:14.054Z" }, + { url = "https://files.pythonhosted.org/packages/7d/78/6d7fd52f646c6ba3343f71277a9bbef33734632949d1651231948b0f0359/ctranslate2-4.7.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9950acb04a002d5c60ae90a1ddceead1a803af1f00cadd9b1a1dc76e1f017481", size = 16849483, upload-time = "2026-02-04T06:12:17.082Z" }, + { url = "https://files.pythonhosted.org/packages/40/27/58769ff15ac31b44205bd7a8aeca80cf7357c657ea5df1b94ce0f5c83771/ctranslate2-4.7.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1dcc734e92e3f1ceeaa0c42bbfd009352857be179ecd4a7ed6cccc086a202f58", size = 38949393, upload-time = "2026-02-04T06:12:21.302Z" }, + { url = "https://files.pythonhosted.org/packages/0e/5c/9fa0ad6462b62efd0fb5ac1100eee47bc96ecc198ff4e237c731e5473616/ctranslate2-4.7.1-cp314-cp314t-win_amd64.whl", hash = "sha256:dfb7657bdb7b8211c8f9ecb6f3b70bc0db0e0384d01a8b1808cb66fe7199df59", size = 19123451, upload-time = "2026-02-04T06:12:24.115Z" }, +] + [[package]] name = "cuda-bindings" version = "12.9.4" @@ -1995,6 +2038,21 @@ whisper = [ { name = "openai-whisper" }, { name = "sounddevice" }, ] +whisper-cpu = [ + { name = "anthropic" }, + { name = "bitsandbytes", marker = "sys_platform == 'linux'" }, + { name = "faster-whisper" }, + { name = "langchain" }, + { name = "langchain-chroma" }, + { name = "langchain-core" }, + { name = "langchain-huggingface" }, + { name = "langchain-ollama" }, + { name = "langchain-openai" }, + { name = "langchain-text-splitters" }, + { name = "ollama" }, + { name = "openai" }, + { name = "sounddevice" }, +] [package.metadata] requires-dist = [ @@ -2010,6 +2068,7 @@ requires-dist = [ { name = "cupy-cuda12x", marker = "platform_machine == 'x86_64' and extra == 'cuda'", specifier = "==13.6.0" }, { name = "cyclonedds", marker = "extra == 'dds'", specifier = ">=0.10.5" }, { name = "dimos", extras = ["agents"], marker = "extra == 'whisper'" }, + { name = "dimos", extras = ["agents"], marker = "extra == 'whisper-cpu'" }, { name = "dimos", extras = ["agents", "web", "perception", "visualization", "sim"], marker = "extra == 'base'" }, { name = "dimos", extras = ["base"], marker = "extra == 'unitree'" }, { name = "dimos", extras = ["dev"], marker = "extra == 'dds'" }, @@ -2023,6 +2082,7 @@ requires-dist = [ { name = "einops", marker = "extra == 'misc'", specifier = "==0.8.1" }, { name = "empy", marker = "extra == 'misc'", specifier = "==3.3.4" }, { name = "fastapi", marker = "extra == 'web'", specifier = ">=0.115.6" }, + { name = "faster-whisper", marker = "extra == 'whisper-cpu'" }, { name = "ffmpeg-python", marker = "extra == 'web'" }, { name = "filterpy", marker = "extra == 'perception'", specifier = ">=1.4.5" }, { name = "gdown", marker = "extra == 'misc'", specifier = "==5.2.0" }, @@ -2165,7 +2225,7 @@ requires-dist = [ { name = "xformers", marker = "platform_machine == 'x86_64' and extra == 'cuda'", specifier = ">=0.0.20" }, { name = "yapf", marker = "extra == 'misc'", specifier = "==0.40.2" }, ] -provides-extras = ["misc", "visualization", "agents", "whisper", "web", "perception", "unitree", "manipulation", "cpu", "cuda", "dev", "psql", "sim", "drone", "dds", "docker", "base"] +provides-extras = ["misc", "visualization", "agents", "whisper", "whisper-cpu", "web", "perception", "unitree", "manipulation", "cpu", "cuda", "dev", "psql", "sim", "drone", "dds", "docker", "base"] [[package]] name = "dimos-lcm" @@ -2572,6 +2632,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2b/c0/d5417aa573f502b7aa037a46e1279b4906511d2ad6bb93b0a531a454f393/fastcrc-0.3.5-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:fa17dbea2c0984f204318d64da0c5109e8afc0f3fa218d836b42a6c4a6f6a27e", size = 491214, upload-time = "2025-12-31T18:23:07.131Z" }, ] +[[package]] +name = "faster-whisper" +version = "1.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "av" }, + { name = "ctranslate2" }, + { name = "huggingface-hub" }, + { name = "onnxruntime" }, + { name = "tokenizers" }, + { name = "tqdm" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/99/49ee85903dee060d9f08297b4a342e5e0bcfca2f027a07b4ee0a38ab13f9/faster_whisper-1.2.1-py3-none-any.whl", hash = "sha256:79a66ad50688c0b794dd501dc340a736992a6342f7f95e5811be60b5224a26a7", size = 1118909, upload-time = "2025-10-31T11:35:47.794Z" }, +] + [[package]] name = "fastjsonschema" version = "2.21.2" From 98bb8199ba9f356973804bc861cfa70b24256d39 Mon Sep 17 00:00:00 2001 From: Sam Bull Date: Fri, 17 Apr 2026 16:27:24 +0100 Subject: [PATCH 03/11] Include by default --- pyproject.toml | 6 +----- uv.lock | 24 ++++++------------------ 2 files changed, 7 insertions(+), 23 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d2aadbc10a..22a55efd85 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -170,6 +170,7 @@ agents = [ # Audio "openai", "sounddevice", + "faster-whisper", ] whisper = [ @@ -177,11 +178,6 @@ whisper = [ "openai-whisper", ] -whisper-cpu = [ - "dimos[agents]", - "faster-whisper", -] - web = [ "fastapi>=0.115.6", "sse-starlette>=2.2.1", diff --git a/uv.lock b/uv.lock index 9e180ff406..02c4aa7c53 100644 --- a/uv.lock +++ b/uv.lock @@ -1763,6 +1763,7 @@ dependencies = [ agents = [ { name = "anthropic" }, { name = "bitsandbytes", marker = "sys_platform == 'linux'" }, + { name = "faster-whisper" }, { name = "langchain" }, { name = "langchain-chroma" }, { name = "langchain-core" }, @@ -1779,6 +1780,7 @@ base = [ { name = "bitsandbytes", marker = "sys_platform == 'linux'" }, { name = "dimos-viewer" }, { name = "fastapi" }, + { name = "faster-whisper" }, { name = "ffmpeg-python" }, { name = "filterpy" }, { name = "hydra-core" }, @@ -1984,6 +1986,7 @@ unitree = [ { name = "bitsandbytes", marker = "sys_platform == 'linux'" }, { name = "dimos-viewer" }, { name = "fastapi" }, + { name = "faster-whisper" }, { name = "ffmpeg-python" }, { name = "filterpy" }, { name = "hydra-core" }, @@ -2024,21 +2027,6 @@ web = [ { name = "uvicorn" }, ] whisper = [ - { name = "anthropic" }, - { name = "bitsandbytes", marker = "sys_platform == 'linux'" }, - { name = "langchain" }, - { name = "langchain-chroma" }, - { name = "langchain-core" }, - { name = "langchain-huggingface" }, - { name = "langchain-ollama" }, - { name = "langchain-openai" }, - { name = "langchain-text-splitters" }, - { name = "ollama" }, - { name = "openai" }, - { name = "openai-whisper" }, - { name = "sounddevice" }, -] -whisper-cpu = [ { name = "anthropic" }, { name = "bitsandbytes", marker = "sys_platform == 'linux'" }, { name = "faster-whisper" }, @@ -2051,6 +2039,7 @@ whisper-cpu = [ { name = "langchain-text-splitters" }, { name = "ollama" }, { name = "openai" }, + { name = "openai-whisper" }, { name = "sounddevice" }, ] @@ -2068,7 +2057,6 @@ requires-dist = [ { name = "cupy-cuda12x", marker = "platform_machine == 'x86_64' and extra == 'cuda'", specifier = "==13.6.0" }, { name = "cyclonedds", marker = "extra == 'dds'", specifier = ">=0.10.5" }, { name = "dimos", extras = ["agents"], marker = "extra == 'whisper'" }, - { name = "dimos", extras = ["agents"], marker = "extra == 'whisper-cpu'" }, { name = "dimos", extras = ["agents", "web", "perception", "visualization", "sim"], marker = "extra == 'base'" }, { name = "dimos", extras = ["base"], marker = "extra == 'unitree'" }, { name = "dimos", extras = ["dev"], marker = "extra == 'dds'" }, @@ -2082,7 +2070,7 @@ requires-dist = [ { name = "einops", marker = "extra == 'misc'", specifier = "==0.8.1" }, { name = "empy", marker = "extra == 'misc'", specifier = "==3.3.4" }, { name = "fastapi", marker = "extra == 'web'", specifier = ">=0.115.6" }, - { name = "faster-whisper", marker = "extra == 'whisper-cpu'" }, + { name = "faster-whisper", marker = "extra == 'agents'" }, { name = "ffmpeg-python", marker = "extra == 'web'" }, { name = "filterpy", marker = "extra == 'perception'", specifier = ">=1.4.5" }, { name = "gdown", marker = "extra == 'misc'", specifier = "==5.2.0" }, @@ -2225,7 +2213,7 @@ requires-dist = [ { name = "xformers", marker = "platform_machine == 'x86_64' and extra == 'cuda'", specifier = ">=0.0.20" }, { name = "yapf", marker = "extra == 'misc'", specifier = "==0.40.2" }, ] -provides-extras = ["misc", "visualization", "agents", "whisper", "whisper-cpu", "web", "perception", "unitree", "manipulation", "cpu", "cuda", "dev", "psql", "sim", "drone", "dds", "docker", "base"] +provides-extras = ["misc", "visualization", "agents", "whisper", "web", "perception", "unitree", "manipulation", "cpu", "cuda", "dev", "psql", "sim", "drone", "dds", "docker", "base"] [[package]] name = "dimos-lcm" From 7a332a5cf709dfdc1a823d675557db90f9f2d193 Mon Sep 17 00:00:00 2001 From: Sam Bull Date: Fri, 17 Apr 2026 16:51:37 +0100 Subject: [PATCH 04/11] Add warning --- dimos/stream/audio/stt/node_whisper.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/dimos/stream/audio/stt/node_whisper.py b/dimos/stream/audio/stt/node_whisper.py index 911d81b8f1..f0c3d3c0db 100644 --- a/dimos/stream/audio/stt/node_whisper.py +++ b/dimos/stream/audio/stt/node_whisper.py @@ -14,6 +14,7 @@ # limitations under the License. from typing import Any +import warnings from reactivex import Observable, create, disposable @@ -32,12 +33,17 @@ try: from faster_whisper import WhisperModel # type: ignore[import-untyped] + warnings.warn( + "openai-whisper not installed, falling back to faster-whisper. " + "Install openai-whisper for the full backend: pip install dimos[whisper]", + stacklevel=2, + ) _USE_FASTER_WHISPER = True except ImportError: raise ImportError( "No whisper backend found. " - "Install openai-whisper (pip install dimos[whisper]) " - "or faster-whisper (pip install dimos[whisper-cpu])." + "Install faster-whisper (pip install faster-whisper) " + "or openai-whisper (pip install dimos[whisper])." ) logger = setup_logger() From 980d82540b0ae83e0174a63be5cb85b1e2982595 Mon Sep 17 00:00:00 2001 From: Sam Bull Date: Fri, 17 Apr 2026 17:01:48 +0100 Subject: [PATCH 05/11] Minimum version --- pyproject.toml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 22a55efd85..4f305bb4ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -170,7 +170,7 @@ agents = [ # Audio "openai", "sounddevice", - "faster-whisper", + "faster-whisper>=1.0.0", ] whisper = [ diff --git a/uv.lock b/uv.lock index 02c4aa7c53..e57b78107e 100644 --- a/uv.lock +++ b/uv.lock @@ -2070,7 +2070,7 @@ requires-dist = [ { name = "einops", marker = "extra == 'misc'", specifier = "==0.8.1" }, { name = "empy", marker = "extra == 'misc'", specifier = "==3.3.4" }, { name = "fastapi", marker = "extra == 'web'", specifier = ">=0.115.6" }, - { name = "faster-whisper", marker = "extra == 'agents'" }, + { name = "faster-whisper", marker = "extra == 'agents'", specifier = ">=1.0.0" }, { name = "ffmpeg-python", marker = "extra == 'web'" }, { name = "filterpy", marker = "extra == 'perception'", specifier = ">=1.4.5" }, { name = "gdown", marker = "extra == 'misc'", specifier = "==5.2.0" }, From c85b1d4d935fd4d65d8d60316c0b55f9de15a958 Mon Sep 17 00:00:00 2001 From: Sam Bull Date: Fri, 17 Apr 2026 17:06:03 +0100 Subject: [PATCH 06/11] Avoid mutating original dict --- dimos/stream/audio/stt/node_whisper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dimos/stream/audio/stt/node_whisper.py b/dimos/stream/audio/stt/node_whisper.py index f0c3d3c0db..2661045f0b 100644 --- a/dimos/stream/audio/stt/node_whisper.py +++ b/dimos/stream/audio/stt/node_whisper.py @@ -65,8 +65,8 @@ def __init__( self.audio_observable = None if _USE_FASTER_WHISPER: - fp16 = modelopts.pop("fp16", False) - compute_type = "float16" if fp16 else "int8" + compute_type = "float16" if modelopts.get("fp16", False) else "int8" + modelopts = {k: v for k, v in modelopts.items() if k != "fp16"} self.modelopts = modelopts self.model = WhisperModel(model, device="auto", compute_type=compute_type) else: From 500b30e444c4b6d97c18d26daeae0a62c6204f7d Mon Sep 17 00:00:00 2001 From: Sam Bull Date: Tue, 21 Apr 2026 15:49:36 +0100 Subject: [PATCH 07/11] Update node_whisper.py --- dimos/stream/audio/stt/node_whisper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dimos/stream/audio/stt/node_whisper.py b/dimos/stream/audio/stt/node_whisper.py index 2661045f0b..fccb9318a3 100644 --- a/dimos/stream/audio/stt/node_whisper.py +++ b/dimos/stream/audio/stt/node_whisper.py @@ -25,6 +25,8 @@ from dimos.stream.audio.text.base import AbstractTextEmitter from dimos.utils.logging_config import setup_logger +logger = setup_logger() + try: import whisper # type: ignore[import-untyped] @@ -33,9 +35,9 @@ try: from faster_whisper import WhisperModel # type: ignore[import-untyped] - warnings.warn( + logger.warn( "openai-whisper not installed, falling back to faster-whisper. " - "Install openai-whisper for the full backend: pip install dimos[whisper]", + "Install openai-whisper for the full backend: pip install openai-whisper", stacklevel=2, ) _USE_FASTER_WHISPER = True @@ -46,8 +48,6 @@ "or openai-whisper (pip install dimos[whisper])." ) -logger = setup_logger() - class WhisperNode(AbstractAudioConsumer, AbstractTextEmitter): """ From 1693d6512993471050dd4efb7ee1d6eca13511f1 Mon Sep 17 00:00:00 2001 From: Sam Bull Date: Tue, 21 Apr 2026 15:49:47 +0100 Subject: [PATCH 08/11] Apply suggestion from @Dreamsorcerer --- pyproject.toml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4f305bb4ac..c6a4ce44ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -173,11 +173,6 @@ agents = [ "faster-whisper>=1.0.0", ] -whisper = [ - "dimos[agents]", - "openai-whisper", -] - web = [ "fastapi>=0.115.6", "sse-starlette>=2.2.1", From c78c2c1a16541edf29659b1ec3877e001be6e3f7 Mon Sep 17 00:00:00 2001 From: Sam Bull Date: Tue, 21 Apr 2026 15:53:10 +0100 Subject: [PATCH 09/11] Apply suggestion from @Dreamsorcerer --- dimos/stream/audio/stt/node_whisper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dimos/stream/audio/stt/node_whisper.py b/dimos/stream/audio/stt/node_whisper.py index fccb9318a3..4b71701580 100644 --- a/dimos/stream/audio/stt/node_whisper.py +++ b/dimos/stream/audio/stt/node_whisper.py @@ -38,7 +38,6 @@ logger.warn( "openai-whisper not installed, falling back to faster-whisper. " "Install openai-whisper for the full backend: pip install openai-whisper", - stacklevel=2, ) _USE_FASTER_WHISPER = True except ImportError: From a32dc4182f3aba20bb476510e7ac83e3f13ae45b Mon Sep 17 00:00:00 2001 From: Sam Bull Date: Tue, 21 Apr 2026 17:28:16 +0100 Subject: [PATCH 10/11] Install whisper in CI --- pyproject.toml | 1 + uv.lock | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 5044cc643f..950494b177 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -246,6 +246,7 @@ dev = [ "requests-mock==1.12.1", "terminaltexteffects==0.12.2", "watchdog>=3.0.0", + "openai-whisper", # docs "md-babel-py==1.1.1", diff --git a/uv.lock b/uv.lock index 44d0e7c79e..84119dd4f7 100644 --- a/uv.lock +++ b/uv.lock @@ -1823,6 +1823,7 @@ dds = [ { name = "lxml-stubs" }, { name = "md-babel-py" }, { name = "mypy" }, + { name = "openai-whisper" }, { name = "pandas-stubs" }, { name = "pre-commit" }, { name = "py-spy" }, @@ -1863,6 +1864,7 @@ dev = [ { name = "lxml-stubs" }, { name = "md-babel-py" }, { name = "mypy" }, + { name = "openai-whisper" }, { name = "pandas-stubs" }, { name = "pre-commit" }, { name = "py-spy" }, @@ -2090,6 +2092,7 @@ requires-dist = [ { name = "open3d-unofficial-arm", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, { name = "open3d-unofficial-arm", marker = "platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'docker'" }, { name = "openai", marker = "extra == 'agents'" }, + { name = "openai-whisper", marker = "extra == 'dev'" }, { name = "opencv-contrib-python", marker = "extra == 'misc'", specifier = "==4.10.0.84" }, { name = "opencv-python" }, { name = "opencv-python-headless", marker = "extra == 'docker'" }, @@ -5104,6 +5107,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f2/cf/369278487161c8d8eadd1a6cee8b0bd629936a1b263bbeccf71342b24dc8/moondream-0.2.0-py3-none-any.whl", hash = "sha256:ca722763bddcce7c13faf87fa3e6b834f86f7bea22bc8794fc1fe15f2d826d93", size = 96169, upload-time = "2025-11-25T18:22:03.465Z" }, ] +[[package]] +name = "more-itertools" +version = "11.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/f7/139d22fef48ac78127d18e01d80cf1be40236ae489769d17f35c3d425293/more_itertools-11.0.2.tar.gz", hash = "sha256:392a9e1e362cbc106a2457d37cabf9b36e5e12efd4ebff1654630e76597df804", size = 144659, upload-time = "2026-04-09T15:01:33.297Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/98/6af411189d9413534c3eb691182bff1f5c6d44ed2f93f2edfe52a1bbceb8/more_itertools-11.0.2-py3-none-any.whl", hash = "sha256:6e35b35f818b01f691643c6c611bc0902f2e92b46c18fffa77ae1e7c46e912e4", size = 71939, upload-time = "2026-04-09T15:01:32.21Z" }, +] + [[package]] name = "mosek" version = "11.0.24" @@ -6107,6 +6119,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cc/56/0a89092a453bb2c676d66abee44f863e742b2110d4dbb1dbcca3f7e5fc33/openai-2.21.0-py3-none-any.whl", hash = "sha256:0bc1c775e5b1536c294eded39ee08f8407656537ccc71b1004104fe1602e267c", size = 1103065, upload-time = "2026-02-14T00:11:59.603Z" }, ] +[[package]] +name = "openai-whisper" +version = "20250625" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "more-itertools" }, + { name = "numba" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "tiktoken" }, + { name = "torch" }, + { name = "tqdm" }, + { name = "triton", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or sys_platform == 'linux2'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/35/8e/d36f8880bcf18ec026a55807d02fe4c7357da9f25aebd92f85178000c0dc/openai_whisper-20250625.tar.gz", hash = "sha256:37a91a3921809d9f44748ffc73c0a55c9f366c85a3ef5c2ae0cc09540432eb96", size = 803191, upload-time = "2025-06-26T01:06:13.34Z" } + [[package]] name = "opencv-contrib-python" version = "4.10.0.84" From 388b0570e7059d25e7091ecdf6ef6f34e59c6bcf Mon Sep 17 00:00:00 2001 From: Sam Bull Date: Tue, 21 Apr 2026 17:33:36 +0100 Subject: [PATCH 11/11] Apply suggestion from @Dreamsorcerer --- dimos/stream/audio/stt/node_whisper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dimos/stream/audio/stt/node_whisper.py b/dimos/stream/audio/stt/node_whisper.py index 4b71701580..2c6cc1b29a 100644 --- a/dimos/stream/audio/stt/node_whisper.py +++ b/dimos/stream/audio/stt/node_whisper.py @@ -14,7 +14,6 @@ # limitations under the License. from typing import Any -import warnings from reactivex import Observable, create, disposable