Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
256 changes: 256 additions & 0 deletions marimo/_runtime/executor/lifecycles/cached.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
# Copyright 2026 Marimo. All rights reserved.
"""CachedLifecycle — cell-level caching as setup/teardown.

On setup: hash the cell + consult the lazy store.

- HIT → restore defs into globals, return `Skip` with the cached
return value so the Evaluator short-circuits the body.
- MISS → pre-flight `cell.refs`: if any ref in `glbls` is an
`UnhashableStub` (a stale placeholder left over from an
upstream cached producer whose value resisted serialization),
invalidate the producer's manifest and raise
`MarimoCancelCellError(cells_to_rerun=producers ∪ self)`.
`Runner.run_all` catches the signal and hands it to
`Scheduler.requeue_for_rerun`; the body never runs this turn.
Otherwise, fall through and let the body run.

On teardown: backfill on successful miss; drop the attempt on a raised
body. No body-trip handling — the pre-flight at setup catches it
strictly earlier, against the *refs* rather than reacting to a partial
body's runtime access.
"""

from __future__ import annotations

import time
from typing import TYPE_CHECKING, cast

from marimo import _loggers
from marimo._runtime.exceptions import MarimoCancelCellError
from marimo._runtime.executor.lifecycles import Skip
from marimo._runtime.runner.result import RunResult
from marimo._save.cache import Cache
from marimo._save.hash import cache_attempt_from_hash
from marimo._save.loaders import (
PERSISTENT_LOADERS,
BasePersistenceLoader,
LoaderKey,
resolve_loader,
)

if TYPE_CHECKING:
from marimo._ast.cell import CellImpl
from marimo._runtime.dataflow import DirectedGraph
from marimo._types.globals import MutableGlobals
from marimo._types.ids import CellId_t

LOGGER = _loggers.marimo_logger()

# Loader backing cell-level caching, resolved through the persistent-
# loader registry so improvements to the lazy per-def loader arrive
# transparently.
DEFAULT_CELL_LOADER: LoaderKey = "lazy"


def _is_unhashable_stub(value: object) -> bool:
"""Duck-typed check for serialization-failure placeholders.

Loaders that cannot serialize a def restore a stub carrying the
class-level `__marimo_unhashable__` marker. Detecting the protocol
attribute (rather than importing the stub class) keeps the runtime
lifecycle and the serialization toolkit independently mergeable.
"""
return getattr(type(value), "__marimo_unhashable__", False) is True


class CachedLifecycle:
"""Skip cell exec on cache hit; backfill cell results on miss success.

Inner wrap relative to `StrictLifecycle`: when both are configured, the
Evaluator runs Strict.setup → Cached.setup → body → Cached.teardown →
Strict.teardown. Caching sees a sanitized scope (Strict already ran),
and Strict's restore happens after Cached's backfill (so the cache
captures the cell's real defs).
"""

name = "cached"

def __init__(
self,
graph: DirectedGraph,
pin_modules: bool = True,
loader: LoaderKey = DEFAULT_CELL_LOADER,
) -> None:
self._graph = graph
self._pin_modules = pin_modules
# The persistent loaders are all BasePersistenceLoader subclasses
# (which carry `.store`); the registry is typed as the `Loader` base.
self._loader = cast(
BasePersistenceLoader,
resolve_loader(PERSISTENT_LOADERS[loader])(name="lazy"),
)
Comment on lines +84 to +91
# Per-cell state — populated in setup, consumed in teardown.
self._attempts: dict[CellId_t, Cache] = {}
self._exec_starts: dict[CellId_t, float] = {}
# Per-cell manifest path, recorded on hit/save. Consumed when
# this cell's pre-flight invalidates an upstream producer.
self._manifest_keys: dict[CellId_t, str] = {}

def setup(self, cell: CellImpl, glbls: MutableGlobals) -> Skip | None:
cell_id = cell.cell_id

attempt = cache_attempt_from_hash(
cell.mod,
self._graph,
cell_id,
glbls,
loader=self._loader,
pin_modules=self._pin_modules,
)
self._attempts[cell_id] = attempt

if attempt.hit:
try:
attempt.restore(glbls)
except Exception as e:
LOGGER.warning("Cache restore failed for %s: %s", cell_id, e)
self._attempts.pop(cell_id, None)
# Fall through to miss-path execution.
else:
Comment on lines +112 to +119
if self._restored_ui_defs(attempt, glbls):
# A restored UIElement carries a fresh object id while
# the cached output HTML embeds the saving session's —
# the frontend and kernel would disagree and events go
# nowhere. UI construction is cheap and inherently
# session state: run the cell live instead.
LOGGER.debug(
"Cache hit for %s defines UI elements; running "
"live to register them with this session",
cell_id,
)
self._attempts.pop(cell_id, None)
# Fall through to miss-path execution.
else:
self._manifest_keys[cell_id] = str(
self._loader.build_path(attempt.key)
)
return Skip(
result=RunResult(
output=attempt.meta.get("return"), exception=None
)
)

# Pre-flight refs against UnhashableStubs left in scope by upstream
# cached producers. Raises MarimoCancelCellError if any ref is a
# stub — body never runs this turn.
self._preflight_refs(cell, glbls)

self._exec_starts[cell_id] = time.time()
return None

def teardown(
self,
cell: CellImpl,
glbls: MutableGlobals,
run_result: RunResult,
) -> None:
cell_id = cell.cell_id
attempt = self._attempts.pop(cell_id, None)
exec_start = self._exec_starts.pop(cell_id, None)

if attempt is None:
return
if attempt.hit:
return
if run_result.exception is not None:
return

runtime = time.time() - (exec_start if exec_start else time.time())
try:
attempt.update(
{**glbls},
meta={
"return": run_result.output,
"runtime": runtime,
},
preserve_pointers=False,
)
saved = self._loader.save_cache(attempt)
if saved:
self._manifest_keys[cell_id] = str(
self._loader.build_path(attempt.key)
)
except BaseException as e:
# Best-effort: save failures (incl. CacheException, which
# extends BaseException) must never break the teardown chain.
LOGGER.warning("Cache save failed for %s: %s", cell_id, e)

@staticmethod
def _restored_ui_defs(attempt: Cache, glbls: MutableGlobals) -> bool:
"""True if any def restored from the cache is a live UIElement."""
from marimo._plugins.ui._core.ui_element import UIElement

return any(
isinstance(glbls.get(name), UIElement) for name in attempt.defs
)

def _preflight_refs(self, cell: CellImpl, glbls: MutableGlobals) -> None:
"""Detect UnhashableStub residues in refs; requeue producers.

Walks `cell.refs` and checks each name in `glbls` for an
`UnhashableStub` instance — a placeholder left behind by an
upstream cached cell whose def couldn't be serialized. If any
are found, invalidates each producer's recorded manifest, drops
this cell's attempt so teardown won't try to backfill, and
raises `MarimoCancelCellError` with `cells_to_rerun` populated
so `Runner.run_all` can `Scheduler.requeue_for_rerun` the
producers (plus this cell, which retries after they produce real
values).

Cheap top-level scan: only directly-stub refs trip. Stubs
embedded inside other values are not detected here — those would
surface during body execution as a `MarimoUnhashableCacheError`
from the stub's `__call__`.
"""
cell_id = cell.cell_id
stub_vars: list[str] = []
for ref in cell.refs:
value = glbls.get(ref) if ref in glbls else None
if _is_unhashable_stub(value):
stub_vars.append(ref)

if not stub_vars:
return

cells_to_rerun: set[CellId_t] = {cell_id}
for var_name in stub_vars:
try:
cells_to_rerun.update(self._graph.get_defining_cells(var_name))
except KeyError:
pass

for producer_id in cells_to_rerun - {cell_id}:
self._invalidate(producer_id)

# Drop our own attempt — body is being skipped this turn, so
# teardown must not backfill against the partially-restored scope.
self._attempts.pop(cell_id, None)
self._exec_starts.pop(cell_id, None)

LOGGER.info(
"Pre-flight requeue for %s: stub refs %s; producers %s",
cell_id,
stub_vars,
cells_to_rerun - {cell_id},
)
raise MarimoCancelCellError(cells_to_rerun=cells_to_rerun)

def _invalidate(self, cell_id: CellId_t) -> None:
"""Delete the recorded manifest for `cell_id` (if any)."""
key = self._manifest_keys.pop(cell_id, None)
if key is None:
return
try:
self._loader.store.clear(key)
except Exception as e:
LOGGER.warning("Manifest clear failed for %s: %s", key, e)
94 changes: 77 additions & 17 deletions marimo/_runtime/runner/cell_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from marimo._runtime.context.types import safe_get_context
from marimo._runtime.control_flow import MarimoInterrupt, MarimoStopError
from marimo._runtime.exceptions import (
MarimoCancelCellError,
MarimoMissingRefError,
MarimoRuntimeException,
unwrap_user_exception,
Expand Down Expand Up @@ -157,6 +158,28 @@ def __init__(
lifecycles: list[ExecutionLifecycle] = []
if execution_type == "strict":
lifecycles.append(StrictLifecycle(self.graph))
if user_config is not None and user_config.get("runtime", {}).get(
"cache_cells", False
):
# Lazy import: pulls in the cache machinery (and its downstream
# marimo._save chain), which would create a circular import at
# module load.
from marimo._runtime.executor.lifecycles.cached import (
CachedLifecycle,
)

lifecycles.append(
CachedLifecycle(
self.graph,
# Pinning trades staleness protection for key
# portability: a cache exported across environments
# (e.g. into a WASM bundle) only hits when module
# versions are excluded from the key.
pin_modules=bool(
user_config.get("runtime", {}).get("pin_modules", True)
),
)
)
self._evaluator = Evaluator(
executor=resolve_executor(), lifecycles=lifecycles
)
Expand Down Expand Up @@ -417,6 +440,26 @@ async def evaluate_interruptible(self, cell: CellImpl) -> RunResult:
# rather than escaping to the broad except below.
return RunResult(output=None, exception=exc)

@staticmethod
def _log_internal_error() -> None:
"""Defensive: an unexpected escape from the Evaluator or a bug in
`_finalize_run_result` would otherwise tear down the runner loop.
Log a report and let the caller degrade to an empty RunResult."""
LOGGER.error(
"""marimo encountered an internal error.

marimo finished executing a cell, but did not produce
a run result.

Please copy this message and paste it in a GitHub issue:

https://github.com/marimo-team/marimo/issues

Any additional context of what caused this error, such
as sample code to reproduce, will help us debug.
"""
)

async def run(self, cell_id: CellId_t) -> RunResult:
"""Run a cell."""
if self.debugger is not None:
Expand All @@ -430,25 +473,22 @@ async def run(self, cell_id: CellId_t) -> RunResult:
# effects are applied below in `_finalize_run_result`.
try:
raw_result = await self.evaluate_interruptible(cell)
run_result = self._finalize_run_result(raw_result, cell_id)
except BaseException:
# Defensive: an unexpected escape from the Evaluator or a bug
# in `_finalize_run_result` would otherwise tear down the
# runner loop. Degrade gracefully with an empty RunResult.
LOGGER.error(
"""marimo encountered an internal error.

marimo finished executing a cell, but did not produce
a run result.
self._log_internal_error()
raw_result = RunResult(output=None, exception=None)

Please copy this message and paste it in a GitHub issue:
# Soft-cancel control signal from a lifecycle (e.g. CachedLifecycle
# tripping on a stale UnhashableStub ref): propagate to `run_all`
# so it can requeue the producing cells. Lifecycle teardown already
# ran inside the Evaluator; this is not a cell error, so it is not
# classified or recorded here.
if isinstance(raw_result.exception, MarimoCancelCellError):
raise raw_result.exception

https://github.com/marimo-team/marimo/issues

Any additional context of what caused this error, such
as sample code to reproduce, will help us debug.
"""
)
try:
run_result = self._finalize_run_result(raw_result, cell_id)
except BaseException:
self._log_internal_error()
run_result = RunResult(output=None, exception=None)

# Mark as interrupted if the cell raised a MarimoInterrupt
Expand Down Expand Up @@ -807,4 +847,24 @@ async def _dispatch_runnable(
cell.set_run_result_status("cancelled")
cell.set_runtime_state("idle")
continue
await self._run_one(cell_id, pre_exec_ctx, post_exec_ctx)
try:
await self._run_one(cell_id, pre_exec_ctx, post_exec_ctx)
except MarimoCancelCellError as e:
# Soft-cancel: a lifecycle (e.g. CachedLifecycle hitting
# a stale UnhashableStub) asked to re-run producer cells
# so they emit real values. Requeue them at the head of
# the queue; this cell retries after they run. Not a
# cell error — don't classify or record it.
LOGGER.debug(
"Soft-cancel for %s; requeuing %s",
cell_id,
e.cells_to_rerun,
)
# The pre-execution hook set this cell's runtime_state to
# "running"; the soft-cancel raised out of `_run_one`
# before the post-execution hook could reset it. Mark
# every requeued cell "queued" so none lingers in
# "running" while its producers re-execute.
for rerun_id in e.cells_to_rerun:
self.graph.cells[rerun_id].set_runtime_state("queued")
self._scheduler.requeue_for_rerun(e.cells_to_rerun)
Comment thread
dmadisetti marked this conversation as resolved.
Loading
Loading