ml-explore
diff --git a/‎mlx/array.h‎
Lines changed: 1 addition & 0 deletions b/‎mlx/array.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎mlx/backend/common/load.cpp‎
Lines changed: 10 additions & 1 deletion b/‎mlx/backend/common/load.cpp‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎mlx/backend/cuda/event.cu‎
Lines changed: 93 additions & 97 deletions b/‎mlx/backend/cuda/event.cu‎
Lines changed: 93 additions & 97 deletions
diff --git a/‎mlx/backend/cuda/event.h‎
Lines changed: 18 additions & 2 deletions b/‎mlx/backend/cuda/event.h‎
Lines changed: 18 additions & 2 deletions
diff --git a/‎mlx/backend/cuda/fence.cpp‎
Lines changed: 11 additions & 9 deletions b/‎mlx/backend/cuda/fence.cpp‎
Lines changed: 11 additions & 9 deletions
@@ -426,6 +426,7 @@ class MLX_API array {
   }
 
   void detach_event() const {
+    array_desc_->event.check_error(); // Check the old event for an error before it is replaced below.
     array_desc_->event = Event{};
   }
 
 
@@ -3,6 +3,8 @@
 #include <algorithm>
 #include <utility>
 
+#include <fmt/format.h>
+
 #include "mlx/primitives.h"
 #include "mlx/scheduler.h"
 
@@ -51,7 +53,14 @@ void Load::eval_cpu(const std::vector<array>& inputs, array& out) {
     }
   };
   auto fut = io::thread_pool().enqueue(std::move(read_task)).share();
-  scheduler::enqueue(stream(), [fut = std::move(fut)]() { fut.wait(); });
+  auto s = stream();
+  scheduler::enqueue(s, [s, fut = std::move(fut)]() {
+    try {
+      fut.get();
+    } catch (const std::exception& error) {
+      scheduler::set_error(s, fmt::format("[Load::eval_cpu] {}", error.what()));
+    }
+  });
 }
 
 } // namespace mlx::core
@@ -113,10 +113,7 @@ void CudaEvent::init_pool() {
   cuda_event_pool();
 }
 
-// Wraps CudaEvent with a few features:
-// 1. The class can be copied.
-// 2. Make wait/record work with CPU streams.
-// 3. Add checks for waiting on un-recorded event.
+// Wraps CudaEvent so it can be copied.
 class CopyableCudaEvent {
  public:
   explicit CopyableCudaEvent(Device& d)
@@ -126,32 +123,24 @@ class CopyableCudaEvent {
                 cudaEventDisableTiming | cudaEventBlockingSync)) {}
 
   void wait() {
+    check_recorded(); // Same check the removed CPU-stream lambda ran before event_->wait().
     event_->wait();
   }
 
   void wait(Stream s) {
-    if (s.device == mlx::core::Device::cpu) {
-      scheduler::enqueue(s, [*this]() mutable {
-        check_recorded();
-        event_->wait();
-      });
-    } else {
-      check_recorded();
-      auto& encoder = cu::get_command_encoder(s);
-      encoder.commit();
-      event_->wait(encoder.stream());
-    }
+    assert(s.device == mlx::core::Device::gpu); // GPU streams only; Event::wait(Stream) handles CPU streams itself.
+    check_recorded();
+    auto& encoder = cu::get_command_encoder(s);
+    encoder.commit(); // NOTE(review): presumably submits pending work so the wait is ordered after it - confirm.
+    event_->wait(encoder.stream());
   }
 
   void record(Stream s) {
-    if (s.device == mlx::core::Device::cpu) {
-      throw std::runtime_error("CudaEvent can not wait on CPU stream.");
-    } else {
-      auto& encoder = cu::get_command_encoder(s);
-      encoder.commit();
-      event_->record(encoder.stream());
-      recorded_ = true;
-    }
+    assert(s.device == mlx::core::Device::gpu); // The CPU-stream throw now lives in Event::signal(Stream).
+    auto& encoder = cu::get_command_encoder(s);
+    encoder.commit();
+    event_->record(encoder.stream());
+    recorded_ = true; // Set only after the record call has been issued.
   }
 
   bool is_signaled() const {
@@ -213,6 +202,11 @@ auto check_gpu_coherency() {
   return coherency;
 }
 
+const CudaStream& signal_stream() { // Used by Event::signal to signal an AtomicEvent on behalf of a CPU stream.
+  static CudaStream stream(device(0)); // One shared stream on device(0), constructed on first call.
+  return stream;
+}
+
 AtomicEvent::AtomicEvent(Device& d) {
   void* buf;
   cudaError_t (*cuda_free)(void*);
@@ -264,14 +258,11 @@ void AtomicEvent::wait(cudaStream_t stream, uint32_t value) {
 
 void AtomicEvent::wait(Stream s, uint32_t value) {
   nvtx3::scoped_range r("cu::AtomicEvent::wait(s)");
-  if (s.device == mlx::core::Device::cpu) {
-    scheduler::enqueue(s, [*this, value]() mutable { wait(value); });
-  } else {
-    auto& encoder = get_command_encoder(s);
-    encoder.commit();
-    wait(encoder.stream(), value);
-    encoder.add_completed_handler([buf = buf_]() {});
-  }
+  assert(s.device == mlx::core::Device::gpu); // CPU streams are routed through scheduler::wait_event in Event::wait(Stream).
+  auto& encoder = get_command_encoder(s);
+  encoder.commit();
+  wait(encoder.stream(), value);
+  encoder.add_completed_handler([buf = buf_]() {}); // Empty handler whose only content is a captured copy of the shared buf_.
 }
 
 void AtomicEvent::signal(uint32_t value) {
@@ -289,17 +280,11 @@ void AtomicEvent::signal(cudaStream_t stream, uint32_t value) {
 
 void AtomicEvent::signal(Stream s, uint32_t value) {
   nvtx3::scoped_range r("cu::AtomicEvent::signal(s)");
-  if (s.device == mlx::core::Device::cpu) {
-    // Signal through a GPU stream so the atomic is updated in GPU - updating
-    // the atomic in CPU sometimes does not get GPU notified.
-    scheduler::enqueue(
-        s, [*this, value]() mutable { signal(signal_stream(), value); });
-  } else {
-    auto& encoder = get_command_encoder(s);
-    encoder.commit();
-    signal(encoder.stream(), value);
-    encoder.add_completed_handler([buf = buf_]() {});
-  }
+  assert(s.device == mlx::core::Device::gpu); // CPU streams are routed through scheduler::signal_event in Event::signal(Stream).
+  auto& encoder = get_command_encoder(s);
+  encoder.commit();
+  signal(encoder.stream(), value);
+  encoder.add_completed_handler([buf = buf_]() {}); // Empty handler whose only content is a captured copy of the shared buf_.
 }
 
 bool AtomicEvent::is_signaled(uint32_t val) const {
@@ -319,9 +304,21 @@ uint32_t AtomicEvent::value() const {
   }
 }
 
-const CudaStream& AtomicEvent::signal_stream() {
-  static CudaStream stream(device(0));
-  return stream;
+///////////////////////////////////////////////////////////////////////////////
+// EventImpl implementations
+///////////////////////////////////////////////////////////////////////////////
+
+void EventImpl::ensure_created(Stream s, uint64_t signal_value) {
+  if (is_created()) { // A backend event already exists; the choice is made only once.
+    return;
+  }
+  auto& d = cu::device(s.device);
+  if (s.device == mlx::core::Device::cpu || signal_value > 1) { // CPU stream or a value above 1 selects AtomicEvent.
+    nvtx3::mark("Using slow AtomicEvent");
+    atomic = std::make_unique<cu::AtomicEvent>(d);
+  } else {
+    cuda = std::make_unique<cu::CopyableCudaEvent>(d); // GPU stream with signal_value <= 1.
+  }
 }
 
 } // namespace cu
@@ -330,86 +327,85 @@ const CudaStream& AtomicEvent::signal_stream() {
 // Event implementations
 ///////////////////////////////////////////////////////////////////////////////
 
-namespace {
-
-struct EventImpl {
-  // CudaEvent is preferred when possible because it is fast, however we have
-  // to fallback to AtomicEvent in following cases:
-  // 1. the event is used to wait/signal a cpu stream;
-  // 2. signal value other than 1 has been specified.
-  std::unique_ptr<cu::CopyableCudaEvent> cuda;
-  std::unique_ptr<cu::AtomicEvent> atomic;
-
-  bool is_created() const {
-    return cuda || atomic;
-  }
-
-  void ensure_created(Stream s, uint64_t signal_value) {
-    if (is_created()) {
-      return;
-    }
-    auto& d = cu::device(s.device);
-    if (s.device == mlx::core::Device::cpu || signal_value > 1) {
-      nvtx3::mark("Using slow AtomicEvent");
-      atomic = std::make_unique<cu::AtomicEvent>(d);
-    } else {
-      cuda = std::make_unique<cu::CopyableCudaEvent>(d);
-    }
-  }
-};
-
-} // namespace
-
 Event::Event(Stream s) : stream_(s) {
-  event_ = std::shared_ptr<void>(
-      new EventImpl(), [](void* ptr) { delete static_cast<EventImpl*>(ptr); });
+  event_ = std::make_shared<cu::EventImpl>(); // Holds no backend event yet; EventImpl::ensure_created() makes one later.
 }
 
 void Event::wait() {
-  auto* event = static_cast<EventImpl*>(event_.get());
-  assert(event->is_created());
-  if (event->cuda) {
+  check_error(); // Checked once before waiting on the backend event...
+  auto& event = cast<cu::EventImpl>();
+  assert(event.is_created());
+  if (event.cuda) {
     assert(value() == 1);
-    event->cuda->wait();
+    event.cuda->wait();
   } else {
-    event->atomic->wait(value());
+    event.atomic->wait(value());
   }
   CHECK_CUDA_ERROR(cudaPeekAtLastError());
+  check_error(); // ...and once more after the wait has returned.
 }
 
 void Event::wait(Stream s) {
-  auto* event = static_cast<EventImpl*>(event_.get());
-  assert(event->is_created());
-  if (event->cuda) {
+  // Makes stream `s` wait for this event. For a CPU stream the event is
+  // handed to scheduler::wait_event together with a callback that calls the
+  // backend's stream-less wait; for a GPU stream the call is forwarded to the
+  // backend event's wait(Stream).
+  auto& event = cast<cu::EventImpl>();
+  assert(event.is_created());
+  if (event.cuda) {
     assert(value() == 1);
-    event->cuda->wait(s);
+    if (s.device == mlx::core::Device::cpu) {
+      // CopyableCudaEvent::wait() takes no value, so nothing is captured.
+      scheduler::wait_event(s, *this, [](Event& self) {
+        self.cast<cu::EventImpl>().cuda->wait();
+      });
+    } else {
+      event.cuda->wait(s);
+    }
   } else {
-    event->atomic->wait(s, value());
+    if (s.device == mlx::core::Device::cpu) {
+      // Capture the value now so the callback waits for the value that was
+      // current when this wait was requested.
+      scheduler::wait_event(s, *this, [value = value()](Event& self) {
+        self.cast<cu::EventImpl>().atomic->wait(value);
+      });
+    } else {
+      event.atomic->wait(s, value());
+    }
   }
 }
 
 void Event::signal(Stream s) {
-  auto* event = static_cast<EventImpl*>(event_.get());
-  event->ensure_created(s, value());
-  if (event->cuda) {
+  // Signals this event from stream `s`, creating the backend event on first
+  // use. Throws std::runtime_error when the backend is a CudaEvent and `s` is
+  // a CPU stream.
+  auto& event = cast<cu::EventImpl>();
+  event.ensure_created(s, value());
+  if (event.cuda) {
     assert(value() == 1);
-    event->cuda->record(s);
+    if (s.device == mlx::core::Device::cpu) {
+      // Reached only when the backend was created earlier for a GPU stream:
+      // ensure_created() picks AtomicEvent whenever it sees a CPU stream.
+      throw std::runtime_error(
+          "CudaEvent can not be signaled from a CPU stream.");
+    } else {
+      event.cuda->record(s);
+    }
   } else {
-    event->atomic->signal(s, value());
+    if (s.device == mlx::core::Device::cpu) {
+      // Signal through a GPU stream so the atomic is updated in GPU - updating
+      // the atomic in CPU sometimes does not get GPU notified.
+      scheduler::signal_event(s, *this, [value = value()](Event& self) {
+        self.cast<cu::EventImpl>().atomic->signal(cu::signal_stream(), value);
+      });
+    } else {
+      event.atomic->signal(s, value());
+    }
   }
 }
 
 bool Event::is_signaled() const {
-  auto* event = static_cast<EventImpl*>(event_.get());
-  if (!event->is_created()) {
+  auto& event = cast<cu::EventImpl>();
+  if (!event.is_created()) { // No backend event has been created yet, so report "not signaled".
     return false;
   }
-  if (event->cuda) {
+  if (event.cuda) {
     assert(value() == 1);
-    return event->cuda->is_signaled();
+    return event.cuda->is_signaled();
   } else {
-    return event->atomic->is_signaled(value());
+    return event.atomic->is_signaled(value());
   }
 }
 
+Event::Error& Event::error() { // Returns a mutable reference to the error member of the CUDA EventImpl.
+  return cast<cu::EventImpl>().error;
+}
+
 } // namespace mlx::core
@@ -13,6 +13,7 @@
 
 namespace mlx::core::cu {
 
+class CopyableCudaEvent;
 class Device;
 
 // RAII-managed move-only wrapper of cudaEvent_t.
@@ -66,8 +67,6 @@ class AtomicEvent {
   uint32_t value() const;
 
  private:
-  const CudaStream& signal_stream();
-
   uint32_t* ptr() const {
     return static_cast<uint32_t*>(buf_.get());
   }
@@ -76,4 +75,21 @@ class AtomicEvent {
   std::shared_ptr<void> buf_;
 };
 
+struct EventImpl {
+  Event::Error error; // Exposed to callers through Event::error().
+
+  // CudaEvent is preferred when possible because it is fast, however we have
+  // to fallback to AtomicEvent in following cases:
+  // 1. the event is used to wait/signal a cpu stream;
+  // 2. signal value other than 1 has been specified.
+  // NOTE(review): CopyableCudaEvent is only forward-declared in this header;
+  // code that destroys an EventImpl needs its full definition - confirm the
+  // include sites.
+  std::unique_ptr<cu::CopyableCudaEvent> cuda;
+  std::unique_ptr<cu::AtomicEvent> atomic;
+
+  bool is_created() const { // True once either backend pointer has been set.
+    return cuda || atomic;
+  }
+
+  void ensure_created(Stream s, uint64_t signal_value); // Defined in event.cu; creates a backend event if none exists yet.
+};
+
 } // namespace mlx::core::cu
@@ -9,22 +9,23 @@ namespace mlx::core {
 
 struct FenceImpl {
   uint32_t count;
-  cu::AtomicEvent event;
+  Event event; // Replaces the direct cu::AtomicEvent member; Fence's constructor forces the AtomicEvent backend.
+
+  FenceImpl(uint32_t count, Stream s) : count(count), event(s) {} // Needed by std::make_shared<FenceImpl>(0, s) in Fence::Fence.
 };
 
 Fence::Fence(Stream s) {
-  fence_ = std::shared_ptr<void>(
-      new FenceImpl{0, cu::device(s.device)},
-      [](void* ptr) { delete static_cast<FenceImpl*>(ptr); });
+  fence_ = std::make_shared<FenceImpl>(0, s);
+  // Ensure that we use AtomicEvent: ensure_created() selects it whenever the
+  // signal value passed in is greater than 1, hence the literal 2.
+  cast<FenceImpl>().event.cast<cu::EventImpl>().ensure_created(s, 2);
 }
 
 void Fence::wait(Stream s, const array&) {
-  auto* fence = static_cast<FenceImpl*>(fence_.get());
-  fence->event.wait(fence->count);
+  cast<FenceImpl>().event.wait(); // Stream-less Event::wait(); `s` is unused. Presumably waits for the value set in Fence::update - confirm.
 }
 
 void Fence::update(Stream s, const array& a, bool cross_device) {
-  auto* fence = static_cast<FenceImpl*>(fence_.get());
+  auto& f = cast<FenceImpl>();
   if (cross_device) {
     // Move to managed memory if there is a device switch
     auto& cbuf =
@@ -35,8 +36,9 @@ void Fence::update(Stream s, const array& a, bool cross_device) {
       cu::allocator().move_to_unified_memory(cbuf, encoder.stream());
     }
   }
-  fence->count++;
-  fence->event.signal(s, fence->count);
+  f.count++;
+  f.event.set_value(f.count);
+  f.event.signal(s);
 }
 
 } // namespace mlx::core
Original file line number	Diff line number	Diff line change
`@@ -426,6 +426,7 @@ class MLX_API array {`
`426`	`426`	`}`
`427`	`427`
`428`	`428`	`void detach_event() const {`
	`429`	`+ array_desc_->event.check_error();`
`429`	`430`	`array_desc_->event = Event{};`
`430`	`431`	`}`
`431`	`432`