From c82ff71555c795a4d16668ca47ec89a2aa90f51d Mon Sep 17 00:00:00 2001 From: Andrew Lauer Date: Fri, 17 Apr 2026 19:21:28 -0700 Subject: [PATCH 1/5] Initial subprocess display --- dimos/core/resource_monitor/monitor.py | 8 +++- dimos/core/resource_monitor/stats.py | 28 ++++++++++++ dimos/utils/cli/dtop.py | 59 +++++++++++++++++++------- 3 files changed, 79 insertions(+), 16 deletions(-) diff --git a/dimos/core/resource_monitor/monitor.py b/dimos/core/resource_monitor/monitor.py index 49079ed98e..692c597d43 100644 --- a/dimos/core/resource_monitor/monitor.py +++ b/dimos/core/resource_monitor/monitor.py @@ -22,6 +22,7 @@ from dimos.core.resource import Resource from dimos.core.resource_monitor.stats import ( WorkerStats, + collect_children_stats, collect_process_stats, ) from dimos.utils.logging_config import setup_logger @@ -110,8 +111,13 @@ def _collect_and_log(self) -> None: pid = w.pid if pid is not None: ps = collect_process_stats(pid) + children = collect_children_stats(pid) + ps_dict = asdict(ps) + ps_dict["cpu_percent"] += sum(c.cpu_percent for c in children) worker_stats.append( - WorkerStats(**asdict(ps), worker_id=w.worker_id, modules=w.module_names) + WorkerStats( + **ps_dict, worker_id=w.worker_id, modules=w.module_names, children=children + ) ) else: worker_stats.append( diff --git a/dimos/core/resource_monitor/stats.py b/dimos/core/resource_monitor/stats.py index 6264d5c7f9..458b22519f 100644 --- a/dimos/core/resource_monitor/stats.py +++ b/dimos/core/resource_monitor/stats.py @@ -115,6 +115,33 @@ def _collect_proc(proc: psutil.Process) -> ProcStats: ) +@dataclass(frozen=True) +class ChildProcessStats: + """CPU stats for a single child process.""" + + pid: int + name: str + cpu_percent: float + + +def collect_children_stats(pid: int) -> list[ChildProcessStats]: + """Return per-child CPU stats for all direct children of pid.""" + result = [] + try: + proc = _get_process(pid) + for child in proc.children(recursive=False): + child_proc = _get_process(child.pid) + try: + name = child_proc.name() + cpu = child_proc.cpu_percent(interval=None) + result.append(ChildProcessStats(pid=child.pid, name=name, cpu_percent=cpu)) + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + return result + + def collect_process_stats(pid: int) -> ProcessStats: """Collect resource stats for a single process by PID.""" try: @@ -137,3 +164,4 @@ class WorkerStats(ProcessStats): worker_id: int = -1 modules: list[str] = field(default_factory=list) + children: list[ChildProcessStats] = field(default_factory=list) diff --git a/dimos/utils/cli/dtop.py b/dimos/utils/cli/dtop.py index 64529a6bc3..b2b2c9310f 100644 --- a/dimos/utils/cli/dtop.py +++ b/dimos/utils/cli/dtop.py @@ -119,6 +119,20 @@ def _fmt_io(v: float) -> str: return f"{v / 1048576:.0f} MB" +def _cpu_metric(line: Text, cpu: float, stale: bool, cpu_hist: deque[float] | None = None) -> None: + """Append a CPU label + value + sparkline/bar to an existing Text line.""" + dim = "#606060" + line.append("CPU ", style=dim if stale else _LABEL_COLOR) + line.append(_fmt_pct(cpu), style=dim if stale else _heat(min(cpu / 100.0, 1.0))) + line.append(" ") + if stale: + line.append("░" * _SPARK_WIDTH, style=dim) + elif cpu_hist is not None and len(cpu_hist) > 0: + line.append_text(_spark(cpu_hist)) + else: + line.append_text(_bar(cpu, 100)) + + _LINE1: list[tuple[str, str, Callable[[float], str]]] = [ ("CPU", "cpu_percent", _fmt_pct), ("PSS", "pss", _fmt_mem), @@ -191,6 +205,7 @@ def __init__(self, topic_name: str = "/dimos/resource_stats") -> None: self._latest: dict[str, Any] | None = None self._last_msg_time: float = 0.0 self._cpu_history: dict[str, deque[float]] = {} + self._child_cpu_history: dict[int, deque[float]] = {} def compose(self) -> ComposeResult: with VerticalScroll(): @@ -266,6 +281,13 @@ def _refresh(self) -> None: title.append(" ") parts.append(Rule(title=title, style=border_style)) parts.extend(self._make_lines(d, stale, ranges, self._cpu_history[role])) + for child in d.get("children", []): + pid = child.get("pid", 0) + if pid not in self._child_cpu_history: + self._child_cpu_history[pid] = deque(maxlen=_SPARK_WIDTH * 2) + if not stale: + self._child_cpu_history[pid].append(child.get("cpu_percent", 0.0)) + parts.append(self._make_child_line(child, stale, self._child_cpu_history[pid])) # First entry title goes on the Panel itself first_role, first_rs, _, first_mods, first_pid = entries[0] @@ -285,6 +307,24 @@ def _refresh(self) -> None: ) self.query_one("#panels", Static).update(panel) + @staticmethod + def _make_child_line( + child: dict[str, Any], stale: bool, cpu_hist: deque[float] | None = None + ) -> Text: + dim = "#606060" + sep = " · " + sep_style = dim if stale else "#555555" + cpu = child.get("cpu_percent", 0.0) + pid = child.get("pid", "") + name = child.get("name", "?") + line = Text() + line.append(" ↳ ", style=sep_style) + line.append(f"{name}", style=dim if stale else _LABEL_COLOR) + line.append(f" [{pid}]", style=dim if stale else "#777777") + line.append(sep, style=sep_style) + _cpu_metric(line, cpu, stale, cpu_hist) + return line + @staticmethod def _make_lines( d: dict[str, Any], @@ -304,24 +344,13 @@ def _make_lines( for idx, (label, key, fmt) in enumerate(_LINE1): val = d.get(key, 0) lo, hi = ranges[key] - # CPU% uses absolute 0-100 scale; everything else is relative - if key == "cpu_percent": - val_style = dim if stale else _heat(min(val / 100.0, 1.0)) - else: - val_style = dim if stale else _rel_style(val, lo, hi) if idx > 0: line1.append(sep, style=sep_style) - line1.append(f"{label} ", style=label1_style) - line1.append(fmt(val), style=val_style) - # CPU bar right after CPU% if key == "cpu_percent": - line1.append(" ") - if stale: - line1.append("░" * _SPARK_WIDTH, style=dim) - elif cpu_hist is not None and len(cpu_hist) > 0: - line1.append_text(_spark(cpu_hist)) - else: - line1.append_text(_bar(val, 100)) + _cpu_metric(line1, val, stale, cpu_hist) + else: + line1.append(f"{label} ", style=label1_style) + line1.append(fmt(val), style=dim if stale else _rel_style(val, lo, hi)) # Line 2 line2 = Text() From e340bdfb32f8756249951348112bd5ed0d56f927 Mon Sep 17 00:00:00 2001 From: Andrew Lauer Date: Sat, 18 Apr 2026 12:09:03 -0700 Subject: [PATCH 2/5] Dtop logging and plotting --- dimos/utils/cli/dtop.py | 33 ++++++++-- dimos/utils/cli/dtop_plot.py | 122 +++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + 3 files changed, 151 insertions(+), 5 deletions(-) create mode 100644 dimos/utils/cli/dtop_plot.py diff --git a/dimos/utils/cli/dtop.py b/dimos/utils/cli/dtop.py index b2b2c9310f..543718d213 100644 --- a/dimos/utils/cli/dtop.py +++ b/dimos/utils/cli/dtop.py @@ -21,6 +21,7 @@ from __future__ import annotations from collections import deque +import json import threading import time from typing import TYPE_CHECKING, Any @@ -190,9 +191,12 @@ class ResourceSpyApp(App[None]): BINDINGS = [("q", "quit"), ("ctrl+c", "quit")] - def __init__(self, topic_name: str = "/dimos/resource_stats") -> None: + def __init__( + self, topic_name: str = "/dimos/resource_stats", log_path: str | None = None + ) -> None: super().__init__() self._topic_name = topic_name + self._log_file = open(log_path, "a") if log_path else None # Warn about missing system config before entering TUI raw mode. from dimos.protocol.service.lcmservice import autoconf @@ -216,11 +220,15 @@ def on_mount(self) -> None: async def on_unmount(self) -> None: self._lcm.stop() + if self._log_file: + self._log_file.close() def _on_msg(self, msg: dict[str, Any], _topic: str) -> None: with self._lock: self._latest = msg self._last_msg_time = time.monotonic() + if self._log_file: + self._log_file.write(json.dumps({"ts": time.time(), **msg}) + "\n") def _refresh(self) -> None: with self._lock: @@ -485,17 +493,32 @@ def _preview() -> None: def main() -> None: + import argparse import sys if "--preview" in sys.argv: _preview() return - topic = "/dimos/resource_stats" - if len(sys.argv) > 1 and sys.argv[1] == "--topic" and len(sys.argv) > 2: - topic = sys.argv[2] + parser = argparse.ArgumentParser( + prog="dtop", description="Live TUI for per-worker resource stats." + ) + parser.add_argument( + "--topic", default="/dimos/resource_stats", help="LCM topic to subscribe to." + ) + parser.add_argument( + "--log", + nargs="?", + const=f"dtop_{time.strftime('%Y%m%d_%H%M%S')}.jsonl", + metavar="PATH", + help="Log stats to a JSONL file. Uses a timestamped filename if no path is given.", + ) + args = parser.parse_args() + + if args.log and args.log == parser.get_default("log"): + print(f"Logging to {args.log}") - ResourceSpyApp(topic_name=topic).run() + ResourceSpyApp(topic_name=args.topic, log_path=args.log).run() if __name__ == "__main__": diff --git a/dimos/utils/cli/dtop_plot.py b/dimos/utils/cli/dtop_plot.py new file mode 100644 index 0000000000..c81fbf539e --- /dev/null +++ b/dimos/utils/cli/dtop_plot.py @@ -0,0 +1,122 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""dtop-plot — Plot resource stats from a dtop JSONL log file. + +Usage: + dtop-plot [--metrics cpu_percent,pss] [--out plot.png] +""" + +from __future__ import annotations + +_COORDINATOR = "coordinator" + +_METRIC_LABELS: dict[str, str] = { + "cpu_percent": "CPU %", + "pss": "PSS (MB)", + "num_threads": "Threads", + "num_children": "Children", + "num_fds": "File Descriptors", + "cpu_time_user": "User CPU Time (s)", + "cpu_time_system": "Sys CPU Time (s)", + "cpu_time_iowait": "IO Wait Time (s)", + "io_read_bytes": "IO Read (MB)", + "io_write_bytes": "IO Write (MB)", +} + +_SCALE: dict[str, float] = { + "pss": 1 / 1048576, + "io_read_bytes": 1 / 1048576, + "io_write_bytes": 1 / 1048576, +} + + +def _load(path: str): + import pandas as pd + + raw = pd.read_json(path, lines=True) + + rows = [] + for _, msg in raw.iterrows(): + ts = msg["ts"] + rows.append({"ts": ts, "role": _COORDINATOR, **msg[_COORDINATOR]}) + for w in msg.get("workers", []): + wid = w.get("worker_id", 0) + rows.append({"ts": ts, "role": f"worker_{wid}", **w}) + + df = pd.DataFrame(rows) + df["ts"] = pd.to_datetime(df["ts"], unit="s") + + labels: dict[str, str] = {_COORDINATOR: _COORDINATOR} + for role, group in df.groupby("role"): + if role == _COORDINATOR: + continue + mods_col = group["modules"].dropna() if "modules" in group.columns else None + mods = mods_col.iloc[0] if mods_col is not None and len(mods_col) > 0 else None + labels[role] = ", ".join(mods) if mods else role + + df["label"] = df["role"].map(labels) + return df, labels + + +def _plot(df, labels: dict[str, str], metrics: list[str], out: str | None) -> None: + import matplotlib.pyplot as plt + + fig, axes = plt.subplots(len(metrics), 1, figsize=(12, 3 * len(metrics)), sharex=True) + if len(metrics) == 1: + axes = [axes] + + for ax, metric in zip(axes, metrics, strict=False): + if metric not in df.columns: + ax.set_visible(False) + continue + scale = _SCALE.get(metric, 1.0) + for role, group in df.groupby("role"): + ax.plot(group["ts"], group[metric] * scale, label=labels[role]) + ax.set_ylabel(_METRIC_LABELS.get(metric, metric)) + ax.legend(fontsize=8) + ax.grid(True, alpha=0.3) + + axes[-1].set_xlabel("Time") + fig.tight_layout() + + if out: + fig.savefig(out, dpi=150) + print(f"Saved to {out}") + else: + plt.show() + + +def main() -> None: + import argparse + + parser = argparse.ArgumentParser( + prog="dtop-plot", description="Plot resource stats from a dtop JSONL log file." + ) + parser.add_argument("log", metavar="LOG", help="Path to a dtop JSONL log file.") + parser.add_argument( + "--metrics", + default="cpu_percent,pss,num_threads", + help="Comma-separated list of metrics to plot (default: cpu_percent,pss,num_threads).", + ) + parser.add_argument("--out", metavar="PATH", help="Save plot to file instead of displaying it.") + args = parser.parse_args() + + metrics = [m.strip() for m in args.metrics.split(",")] + df, labels = _load(args.log) + _plot(df, labels, metrics, args.out) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 364fc6d22b..5bd41963b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,6 +100,7 @@ dimos = "dimos.robot.cli.dimos:main" rerun-bridge = "dimos.visualization.rerun.bridge:app" doclinks = "dimos.utils.docs.doclinks:main" dtop = "dimos.utils.cli.dtop:main" +dtop-plot = "dimos.utils.cli.dtop_plot:main" [project.urls] Homepage = "https://dimensionalos.com" From 68beb7ed8e62689f1cfcfc5804d953f200206587 Mon Sep 17 00:00:00 2001 From: Andrew Lauer Date: Sat, 18 Apr 2026 12:19:18 -0700 Subject: [PATCH 3/5] Move legend off of plot --- dimos/utils/cli/dtop_plot.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dimos/utils/cli/dtop_plot.py b/dimos/utils/cli/dtop_plot.py index c81fbf539e..916529f066 100644 --- a/dimos/utils/cli/dtop_plot.py +++ b/dimos/utils/cli/dtop_plot.py @@ -62,8 +62,7 @@ def _load(path: str): for role, group in df.groupby("role"): if role == _COORDINATOR: continue - mods_col = group["modules"].dropna() if "modules" in group.columns else None - mods = mods_col.iloc[0] if mods_col is not None and len(mods_col) > 0 else None + mods = next((m for m in group.get("modules", []) if m), None) labels[role] = ", ".join(mods) if mods else role df["label"] = df["role"].map(labels) @@ -85,14 +84,14 @@ def _plot(df, labels: dict[str, str], metrics: list[str], out: str | None) -> No for role, group in df.groupby("role"): ax.plot(group["ts"], group[metric] * scale, label=labels[role]) ax.set_ylabel(_METRIC_LABELS.get(metric, metric)) - ax.legend(fontsize=8) + ax.legend(fontsize=8, loc="center left", bbox_to_anchor=(1.01, 0.5), borderaxespad=0) ax.grid(True, alpha=0.3) axes[-1].set_xlabel("Time") fig.tight_layout() if out: - fig.savefig(out, dpi=150) + fig.savefig(out, dpi=150, bbox_inches="tight") print(f"Saved to {out}") else: plt.show() From 28beba3537b01ee38b9d9b01f3c443ed592dc162 Mon Sep 17 00:00:00 2001 From: Andrew Lauer Date: Fri, 24 Apr 2026 14:11:59 -0700 Subject: [PATCH 4/5] Fixes --- dimos/core/resource_monitor/stats.py | 18 +++++++++--------- dimos/utils/cli/dtop.py | 4 ++-- dimos/utils/cli/dtop_plot.py | 24 +++++++++++++++++------- 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/dimos/core/resource_monitor/stats.py b/dimos/core/resource_monitor/stats.py index 458b22519f..aa39192ce3 100644 --- a/dimos/core/resource_monitor/stats.py +++ b/dimos/core/resource_monitor/stats.py @@ -129,16 +129,16 @@ def collect_children_stats(pid: int) -> list[ChildProcessStats]: result = [] try: proc = _get_process(pid) - for child in proc.children(recursive=False): - child_proc = _get_process(child.pid) - try: - name = child_proc.name() - cpu = child_proc.cpu_percent(interval=None) - result.append(ChildProcessStats(pid=child.pid, name=name, cpu_percent=cpu)) - except (psutil.NoSuchProcess, psutil.AccessDenied): - pass except (psutil.NoSuchProcess, psutil.AccessDenied): - pass + return result + for child in proc.children(recursive=False): + try: + child_proc = _get_process(child.pid) + name = child_proc.name() + cpu = child_proc.cpu_percent(interval=None) + result.append(ChildProcessStats(pid=child.pid, name=name, cpu_percent=cpu)) + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass return result diff --git a/dimos/utils/cli/dtop.py b/dimos/utils/cli/dtop.py index 543718d213..c4c0b4e863 100644 --- a/dimos/utils/cli/dtop.py +++ b/dimos/utils/cli/dtop.py @@ -509,13 +509,13 @@ def main() -> None: parser.add_argument( "--log", nargs="?", - const=f"dtop_{time.strftime('%Y%m%d_%H%M%S')}.jsonl", + const=f"dtop_{time.strftime('%Y%m%d_%H%M%S')}.ignore.jsonl", metavar="PATH", help="Log stats to a JSONL file. Uses a timestamped filename if no path is given.", ) args = parser.parse_args() - if args.log and args.log == parser.get_default("log"): + if args.log: print(f"Logging to {args.log}") ResourceSpyApp(topic_name=args.topic, log_path=args.log).run() diff --git a/dimos/utils/cli/dtop_plot.py b/dimos/utils/cli/dtop_plot.py index 916529f066..1e4a826717 100644 --- a/dimos/utils/cli/dtop_plot.py +++ b/dimos/utils/cli/dtop_plot.py @@ -69,7 +69,7 @@ def _load(path: str): return df, labels -def _plot(df, labels: dict[str, str], metrics: list[str], out: str | None) -> None: +def _plot(df, labels: dict[str, str], metrics: list[str], out: str, show: bool = False) -> None: import matplotlib.pyplot as plt fig, axes = plt.subplots(len(metrics), 1, figsize=(12, 3 * len(metrics)), sharex=True) @@ -90,13 +90,17 @@ def _plot(df, labels: dict[str, str], metrics: list[str], out: str | None) -> No axes[-1].set_xlabel("Time") fig.tight_layout() - if out: - fig.savefig(out, dpi=150, bbox_inches="tight") - print(f"Saved to {out}") - else: + fig.savefig(out, dpi=150, bbox_inches="tight") + print(f"Saved to {out}") + if show: plt.show() +def _default_out(log_path: str) -> str: + base = log_path.removesuffix(".ignore.jsonl") + return f"{base}.ignore.png" + + def main() -> None: import argparse @@ -109,12 +113,18 @@ def main() -> None: default="cpu_percent,pss,num_threads", help="Comma-separated list of metrics to plot (default: cpu_percent,pss,num_threads).", ) - parser.add_argument("--out", metavar="PATH", help="Save plot to file instead of displaying it.") + parser.add_argument( + "--out", metavar="PATH", help="Output image path (default: .ignore.png)." + ) + parser.add_argument( + "--show", action="store_true", help="Open the plot interactively after saving." + ) args = parser.parse_args() + out = args.out or _default_out(args.log) metrics = [m.strip() for m in args.metrics.split(",")] df, labels = _load(args.log) - _plot(df, labels, metrics, args.out) + _plot(df, labels, metrics, out, args.show) if __name__ == "__main__": From d030c50957a697c90d4515c3a9a3fb5292124e25 Mon Sep 17 00:00:00 2001 From: Andrew Lauer Date: Mon, 27 Apr 2026 14:35:20 -0700 Subject: [PATCH 5/5] Fix mypy --- dimos/core/resource_monitor/stats.py | 2 +- dimos/utils/cli/dtop_plot.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/dimos/core/resource_monitor/stats.py b/dimos/core/resource_monitor/stats.py index aa39192ce3..bef54f89c3 100644 --- a/dimos/core/resource_monitor/stats.py +++ b/dimos/core/resource_monitor/stats.py @@ -126,7 +126,7 @@ class ChildProcessStats: def collect_children_stats(pid: int) -> list[ChildProcessStats]: """Return per-child CPU stats for all direct children of pid.""" - result = [] + result: list[ChildProcessStats] = [] try: proc = _get_process(pid) except (psutil.NoSuchProcess, psutil.AccessDenied): diff --git a/dimos/utils/cli/dtop_plot.py b/dimos/utils/cli/dtop_plot.py index 1e4a826717..16f1d8f594 100644 --- a/dimos/utils/cli/dtop_plot.py +++ b/dimos/utils/cli/dtop_plot.py @@ -20,6 +20,11 @@ from __future__ import annotations +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import pandas as pd + _COORDINATOR = "coordinator" _METRIC_LABELS: dict[str, str] = { @@ -42,7 +47,7 @@ } -def _load(path: str): +def _load(path: str) -> tuple[pd.DataFrame, dict[str, str]]: import pandas as pd raw = pd.read_json(path, lines=True) @@ -60,6 +65,7 @@ def _load(path: str): labels: dict[str, str] = {_COORDINATOR: _COORDINATOR} for role, group in df.groupby("role"): + role = str(role) if role == _COORDINATOR: continue mods = next((m for m in group.get("modules", []) if m), None) @@ -69,7 +75,9 @@ def _load(path: str): return df, labels -def _plot(df, labels: dict[str, str], metrics: list[str], out: str, show: bool = False) -> None: +def _plot( + df: pd.DataFrame, labels: dict[str, str], metrics: list[str], out: str, show: bool = False +) -> None: import matplotlib.pyplot as plt fig, axes = plt.subplots(len(metrics), 1, figsize=(12, 3 * len(metrics)), sharex=True) @@ -82,7 +90,7 @@ def _plot(df, labels: dict[str, str], metrics: list[str], out: str, show: bool = continue scale = _SCALE.get(metric, 1.0) for role, group in df.groupby("role"): - ax.plot(group["ts"], group[metric] * scale, label=labels[role]) + ax.plot(group["ts"], group[metric] * scale, label=labels[str(role)]) ax.set_ylabel(_METRIC_LABELS.get(metric, metric)) ax.legend(fontsize=8, loc="center left", bbox_to_anchor=(1.01, 0.5), borderaxespad=0) ax.grid(True, alpha=0.3)