From 9e67f971137f99622f9ee4b0de35109099e51ec0 Mon Sep 17 00:00:00 2001
From: Cheng <git@zcbenz.com>
Date: Tue, 23 Jun 2026 08:28:53 +0900
Subject: [PATCH] Add CI for Windows CUDA build

---
 .github/actions/build-cuda-release/action.yml |  31 ---
 .github/actions/build-docs/action.yml         |  20 +-
 .../actions/build-linux-release/action.yml    |  42 ---
 .github/actions/build-linux/action.yml        |  38 ---
 .../actions/build-macos-release/action.yml    |  63 -----
 .github/actions/build-wheel/action.yml        | 115 ++++++++
 .github/actions/build-windows/action.yml      |  26 --
 .github/actions/build/action.yml              |  44 +++
 .github/actions/setup-linux/action.yml        | 111 --------
 .github/actions/setup-macos/action.yml        |  32 ---
 .github/actions/setup-windows/action.yml      |  42 ---
 .github/actions/setup/action.yml              | 256 ++++++++++++++++++
 .github/actions/test-linux/action.yml         |  37 ++-
 .github/actions/test-wheel/action.yml         |  50 ++++
 .github/actions/test-windows/action.yml       |   8 +-
 .github/workflows/build_and_test.yml          |  76 +++---
 .github/workflows/nightly.yml                 | 108 --------
 .github/workflows/release.yml                 | 254 ++++++++++-------
 .gitignore                                    |   2 +-
 CMakeLists.txt                                |  21 +-
 MANIFEST.in                                   |   2 +-
 mlx/backend/cuda/CMakeLists.txt               |  12 +-
 mlx/backend/cuda/delayload.cpp                |  59 ++--
 mlx/backend/cuda/dirs.cpp                     |  29 ++
 mlx/backend/cuda/jit_module.cpp               |   9 +-
 setup.py                                      |  63 +++--
 26 files changed, 836 insertions(+), 714 deletions(-)
 delete mode 100644 .github/actions/build-cuda-release/action.yml
 delete mode 100644 .github/actions/build-linux-release/action.yml
 delete mode 100644 .github/actions/build-linux/action.yml
 delete mode 100644 .github/actions/build-macos-release/action.yml
 create mode 100644 .github/actions/build-wheel/action.yml
 delete mode 100644 .github/actions/build-windows/action.yml
 create mode 100644 .github/actions/build/action.yml
 delete mode 100644 .github/actions/setup-linux/action.yml
 delete mode 100644 .github/actions/setup-macos/action.yml
 delete mode 100644 .github/actions/setup-windows/action.yml
 create mode 100644 .github/actions/setup/action.yml
 create mode 100644 .github/actions/test-wheel/action.yml
 delete mode 100644 .github/workflows/nightly.yml
 create mode 100644 mlx/backend/cuda/dirs.cpp

diff --git a/.github/actions/build-cuda-release/action.yml b/.github/actions/build-cuda-release/action.yml
deleted file mode 100644
index f78b0c10f8..0000000000
--- a/.github/actions/build-cuda-release/action.yml
+++ /dev/null
@@ -1,31 +0,0 @@
-name: 'Build CUDA wheel'
-description: 'Build CUDA wheel'
-
-inputs:
-  arch:
-    description: 'Platform architecture tag'
-    required: true
-    type: choice
-    options:
-      - x86_64
-      - aarch64
-
-runs:
-  using: "composite"
-  steps:
-    - name: Build package
-      shell: bash
-      env:
-        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON
-      run: |
-        pip install auditwheel "build<=1.4.2" patchelf setuptools
-        python setup.py clean --all
-        MLX_BUILD_STAGE=2 python -m build -w
-
-        auditwheel repair dist/mlx_cuda*.whl \
-          --plat manylinux_2_35_${{ inputs.arch }} \
-          --exclude libcublas* \
-          --exclude libcuda* \
-          --exclude libcudnn* \
-          --exclude libnccl* \
-          --exclude libnvrtc*
diff --git a/.github/actions/build-docs/action.yml b/.github/actions/build-docs/action.yml
index 411f6be8d9..4d4286e3c8 100644
--- a/.github/actions/build-docs/action.yml
+++ b/.github/actions/build-docs/action.yml
@@ -5,24 +5,28 @@ runs:
   using: "composite"
   steps:
     - name: Setup machine
-      uses: ./.github/actions/setup-linux
+      id: setup
+      uses: ./.github/actions/setup
+      with:
+        ccache-key: 'release'
+        ccache-save: false
 
     - name: Install dependencies
       shell: bash
+      env:
+        CMAKE_ARGS: ${{ steps.setup.outputs.cmake-args }}
       run: |
         sudo apt-get install -y doxygen
-        source .venv/bin/activate
-        pip install -r docs/requirements.txt
-        pip install . -v
-  
+        uv pip install -r docs/requirements.txt
+        uv pip install . -v
+
     - name: Build documentation
       shell: bash
       run: |
-        source .venv/bin/activate
         cd docs
         doxygen
         make html O=-W
-    
+
     - name: Create artifact tar
       shell: bash
       run: tar -cf artifact.tar -C docs --dereference build/html index.html
@@ -30,7 +34,7 @@ runs:
     # Do it manually because upload-pages-artifact requires gtar
     - name: Upload artifact
       id: upload-artifact
-      uses: actions/upload-artifact@v5
+      uses: actions/upload-artifact@v7
       with:
         name: github-pages
         path: artifact.tar
diff --git a/.github/actions/build-linux-release/action.yml b/.github/actions/build-linux-release/action.yml
deleted file mode 100644
index 2e938d8592..0000000000
--- a/.github/actions/build-linux-release/action.yml
+++ /dev/null
@@ -1,42 +0,0 @@
-name: 'Build Linux wheel'
-description: 'Build Linux wheel'
-
-inputs:
-  build-backend:
-    description: 'Build the backend mlx-cpu package'
-    type: boolean
-    required: false
-    default: false
-  arch:
-    description: 'Platform architecture tag'
-    required: true
-    type: choice
-    options:
-      - x86_64
-      - aarch64
-
-runs:
-  using: "composite"
-  steps:
-    - name: Build MLX
-      shell: bash
-      run: pip install -e . -v
-
-    - name: Build Python package
-      shell: bash
-      run: |
-        pip install auditwheel patchelf "build<=1.4.2"
-        python setup.py clean --all
-        MLX_BUILD_STAGE=1 python -m build -w
-        auditwheel repair dist/mlx-*.whl \
-          --plat manylinux_2_35_${{ inputs.arch }} \
-          --exclude libmlx.so* \
-          --only-plat
-
-    - name: Build backend package
-      if: ${{ inputs.build-backend }}
-      shell: bash
-      run: |
-        python setup.py clean --all
-        MLX_BUILD_STAGE=2 python -m build -w
-        auditwheel repair dist/mlx_cpu*.whl --plat manylinux_2_35_${{ inputs.arch }}
diff --git a/.github/actions/build-linux/action.yml b/.github/actions/build-linux/action.yml
deleted file mode 100644
index b7a3b07c39..0000000000
--- a/.github/actions/build-linux/action.yml
+++ /dev/null
@@ -1,38 +0,0 @@
-name: 'Build and Test on Linux'
-
-inputs:
-  toolkit:
-    description: 'The toolkit to build with'
-    required: false
-    default: 'cpu'
-
-runs:
-  using: "composite"
-  steps:
-
-    - name: Install Python package
-      id: python_build
-      shell: sh
-      env:
-        DEBUG: 1
-        CMAKE_ARGS: >-
-          -DCMAKE_COMPILE_WARNING_AS_ERROR=ON
-          -DMLX_BUILD_CUDA=${{ startsWith(inputs.toolkit, 'cuda') && 'ON' || 'OFF' }}
-      run: |
-        if ${{ startsWith(inputs.toolkit, 'cuda') && runner.arch == 'arm64' }} ; then
-          # There is no GPU in arm64 runner, use a common arch.
-          CMAKE_ARGS="$CMAKE_ARGS -DMLX_CUDA_ARCHITECTURES=80"
-          # Can not build tests and stubs when the built executables can not run.
-          CMAKE_ARGS="$CMAKE_ARGS -DMLX_BUILD_TESTS=OFF -DMLX_BUILD_PYTHON_STUBS=OFF"
-        fi
-        # Install cpu-only torch to save space
-        pip install torch --index-url https://download.pytorch.org/whl/cpu
-        pip install --no-build-isolation -e ".[dev]" -v
-        # Pass the CMAKE_ARGS to following steps.
-        echo CMAKE_ARGS="$CMAKE_ARGS" >> $GITHUB_OUTPUT
-
-    - name: Build CPP only
-      shell: bash
-      run: |
-        cmake . -B build -DCMAKE_BUILD_TYPE=Debug ${{ steps.python_build.outputs.CMAKE_ARGS }}
-        cmake --build build -j $(nproc)
diff --git a/.github/actions/build-macos-release/action.yml b/.github/actions/build-macos-release/action.yml
deleted file mode 100644
index ec36c46f6f..0000000000
--- a/.github/actions/build-macos-release/action.yml
+++ /dev/null
@@ -1,63 +0,0 @@
-name: 'Build macOS release'
-description: 'Build MLX releases macOS'
-
-inputs:
-  macos-target:
-    description: 'macOS build target'
-    required: false
-    default: '15.0'
-  build-backend:
-    description: 'Build the backend mlx-metal package'
-    required: false
-    default: 'false'
-  python-version:
-    description: 'Python version to use'
-    required: false
-    default: '3.10'
-
-runs:
-  using: "composite"
-  steps:
-    - name: Clear build environment
-      shell: bash
-      run: |
-        echo "::group::Clear build environment"
-        uv venv --clear
-        uv pip install build setuptools
-        python setup.py clean --all
-        source .venv/bin/activate
-        echo "::endgroup::"
-
-    - name: Build Python package
-      shell: bash
-      env:
-        DEVELOPER_DIR: /Applications/Xcode-latest.app
-        MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }}
-      run: |
-        echo "::group::Build Python package"
-        MLX_BUILD_STAGE=1 python -m build -w
-        python setup.py clean --all
-        echo "::endgroup::"
-
-    - name: Build backend package
-      if: ${{ inputs.build-backend == 'true' }}
-      shell: bash
-      env:
-        DEVELOPER_DIR: /Applications/Xcode-latest.app
-        MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }}
-      run: |
-        echo "::group::Build backend package"
-        MLX_BUILD_STAGE=2 python -m build -w
-        python setup.py clean --all
-        echo "::endgroup::"
-
-    - name: Test local packages
-      if: ${{ inputs.build-backend == 'true' && startsWith(inputs.macos-target, '26.') }}
-      shell: bash
-      run: |
-        echo "::group::Test local packages"
-        uv pip install numpy torch
-        uv pip install dist/mlx_metal-*-macosx_26_0_arm64.whl
-        uv pip install dist/mlx-*-macosx_26_0_arm64.whl
-        python -m unittest discover -v python/tests
-        echo "::endgroup::"
diff --git a/.github/actions/build-wheel/action.yml b/.github/actions/build-wheel/action.yml
new file mode 100644
index 0000000000..96cca9eadc
--- /dev/null
+++ b/.github/actions/build-wheel/action.yml
@@ -0,0 +1,115 @@
+name: 'Build wheel'
+description: 'Build the Python wheels for release on all platforms'
+
+inputs:
+  cmake-args:
+    description: 'The args for generating CMake project'
+    required: true
+  build-frontend:
+    description: 'Build the frontend mlx package'
+    required: false
+    default: 'true'
+  build-backend:
+    description: 'Build the backend mlx-cpu/mlx-cuda/mlx-metal packages'
+    required: false
+    default: 'true'
+  macos-target:
+    description: 'The target macOS version to build for'
+    required: false
+    default: '26.2'
+  arch-tag:
+    description: 'Platform architecture tag'
+    required: false
+    default: |-
+      ${{ case(runner.arch == 'x64', 'x86_64',
+               runner.arch == 'x86', 'i686',
+               runner.arch == 'arm', 'armv7l',
+               runner.arch == 'arm64', 'aarch64',
+               'unknown')
+      }}
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Install dependencies
+      shell: bash
+      run: |
+        echo "::group::Install dependencies"
+        uv pip install 'build<=1.4.2' setuptools
+        if ${{ runner.os == 'Linux' }} ; then
+          uv pip install auditwheel patchelf
+        fi
+        mkdir -p wheelhouse
+        echo "::endgroup::"
+
+    - name: Build frontend package
+      if: inputs.build-frontend == 'true'
+      shell: bash
+      env:
+        CMAKE_ARGS: ${{ inputs.cmake-args }}
+        MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }}
+      run: |
+        echo "::group::Build frontend package"
+        python setup.py clean --all
+        MLX_BUILD_STAGE=1 python -m build -w
+        echo "::endgroup::"
+
+    - name: Post-process frontend package
+      if: inputs.build-frontend == 'true'
+      shell: bash
+      run: |
+        echo "::group::Post-process frontend package"
+        if ${{ runner.os == 'Linux' }} ; then
+          auditwheel repair dist/mlx-*.whl \
+            --plat manylinux_2_35_${{ inputs.arch-tag }} \
+            --exclude libmlx.so* \
+            --only-plat
+        else
+          mv dist/mlx-*.whl wheelhouse/
+        fi
+        echo "::endgroup::"
+
+    - name: Build backend package
+      if: inputs.build-backend == 'true'
+      shell: bash
+      env:
+        CMAKE_ARGS: ${{ inputs.cmake-args }}
+        MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }}
+      run: |
+        echo "::group::Build backend package"
+        python setup.py clean --all
+        MLX_BUILD_STAGE=2 python -m build -w
+        echo "::endgroup::"
+
+    - name: Post-process backend package
+      if: inputs.build-backend == 'true'
+      shell: bash
+      run: |
+        echo "::group::Post-process backend package"
+        if ${{ runner.os == 'Linux' }} ; then
+          if [ -f dist/mlx_cpu*.whl ]; then
+            auditwheel repair dist/mlx_cpu*.whl \
+              --plat manylinux_2_35_${{ inputs.arch-tag }}
+          fi
+          if [ -f dist/mlx_cuda*.whl ]; then
+            auditwheel repair dist/mlx_cuda*.whl \
+              --plat manylinux_2_35_${{ inputs.arch-tag }} \
+              --exclude libcublas* \
+              --exclude libcuda* \
+              --exclude libcudnn* \
+              --exclude libcufft* \
+              --exclude libnccl* \
+              --exclude libnvrtc*
+          fi
+        else
+          if [ -f dist/mlx_cpu*.whl ]; then
+            mv dist/mlx_cpu*.whl wheelhouse/
+          fi
+          if [ -f dist/mlx_cuda*.whl ]; then
+            mv dist/mlx_cuda*.whl wheelhouse/
+          fi
+          if [ -f dist/mlx_metal*.whl ]; then
+            mv dist/mlx_metal*.whl wheelhouse/
+          fi
+        fi
+        echo "::endgroup::"
diff --git a/.github/actions/build-windows/action.yml b/.github/actions/build-windows/action.yml
deleted file mode 100644
index 372f848156..0000000000
--- a/.github/actions/build-windows/action.yml
+++ /dev/null
@@ -1,26 +0,0 @@
-name: 'Build on Windows'
-
-runs:
-  using: 'composite'
-  steps:
-    - name: Install Python package
-      id: python-build
-      shell: cmd
-      env:
-        # For MSVC, Ninja/Release is the only config supported by ccache.
-        CMAKE_ARGS: >-
-          -G Ninja
-          -DCMAKE_BUILD_TYPE=Release
-          -DCMAKE_C_COMPILER=cl
-          -DCMAKE_CXX_COMPILER=cl
-          -DCMAKE_RC_COMPILER=rc
-      run: |
-        uv pip install ".[dev]" -v
-        :: Pass the CMAKE_ARGS to following steps.
-        >>%GITHUB_OUTPUT% ECHO CMAKE_ARGS=%CMAKE_ARGS%
-
-    - name: Build CPP only
-      shell: cmd
-      run: |
-        cmake . -B build ${{ steps.python-build.outputs.CMAKE_ARGS }}
-        cmake --build build -j %NUMBER_OF_PROCESSORS%
diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml
new file mode 100644
index 0000000000..95c6f5ff70
--- /dev/null
+++ b/.github/actions/build/action.yml
@@ -0,0 +1,44 @@
+name: 'Build'
+description: 'Build C++ and Python binaries for testing on Linux and Windows'
+
+inputs:
+  cmake-args:
+    description: 'The args for generating CMake project'
+    required: true
+  debug:
+    description: 'Do debug build'
+    required: true
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Install Python package
+      shell: bash
+      env:
+        DEBUG: ${{ inputs.debug == 'true' && 1 || 0 }}
+        CMAKE_ARGS: ${{ inputs.cmake-args }}
+      run: |
+        echo "::group::Install Python package"
+        # Install cpu-only torch to save space
+        uv pip install torch --index-url https://download.pytorch.org/whl/cpu
+        uv pip install --no-build-isolation -e ".[dev]" -v
+        echo "::endgroup::"
+
+    - name: Build CPP only
+      shell: bash
+      env:
+        # The cpp build uses some extra settings to reuse the compile cache
+        # generated by the python install:
+        # 1. Use the same ccache options as setup.py.
+        # 2. Build dynamic library.
+        # 3. Put the build dir at the same depth as the python build dir.
+        CCACHE_BASEDIR: ${{ github.workspace }}/build/cpp/mlx
+        CCACHE_NOHASHDIR: true
+      run: |
+        echo "::group::Build CPP only"
+        cmake . -B build/cpp/mlx ${{ inputs.cmake-args }} \
+          -DBUILD_SHARED_LIBS=ON \
+          -DCMAKE_BUILD_TYPE=${{ inputs.debug == 'true' && 'Debug' || 'Release' }}
+        cmake --build build/cpp/mlx \
+          -j ${{ runner.os == 'Windows' && '$NUMBER_OF_PROCESSORS' || '$(nproc)' }}
+        echo "::endgroup::"
diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml
deleted file mode 100644
index 419abfb163..0000000000
--- a/.github/actions/setup-linux/action.yml
+++ /dev/null
@@ -1,111 +0,0 @@
-name: 'Setup Linux Environment'
-description: 'Install dependencies for Linux builds'
-
-inputs:
-  toolkit:
-    description: 'Which toolkit to install'
-    required: false
-    default: 'cpu'
-  python-version:
-    description: 'Version of python to set up'
-    required: false
-    default: '3.14'
-  use-ccache:
-    description: 'Whether to enable ccache'
-    required: false
-    default: 'true'
-  ccache-key:
-    required: false
-    default: 'ccache'
-
-runs:
-  using: "composite"
-  steps:
-    - name: Install common dependencies
-      shell: bash
-      run: |
-        echo "::group::Install common dependencies"
-        sudo apt-get update
-        sudo apt-get install -y --no-install-recommends \
-            gdb zip \
-            libblas-dev liblapack-dev liblapacke-dev \
-            openmpi-bin openmpi-common libopenmpi-dev
-        echo "::endgroup::"
-
-    - name: Use ccache
-      if: ${{ inputs.use-ccache == 'true' }}
-      uses: hendrikmuhs/ccache-action@v1.2
-      with:
-        key: ${{ inputs.ccache-key }}-${{ runner.os }}-${{ runner.arch }}-${{ inputs.toolkit }}
-        max-size: 1GB
-        # ccache-action bug: running "apt-get update" fails on large arm runner.
-        update-package-index: false
-
-    - name: Cache JIT-compiled CUDA kernels
-      if: ${{ startsWith(inputs.toolkit, 'cuda') }}
-      uses: actions/cache@v5
-      with:
-        path: /tmp/mlx-ptx-cache
-        key: >-
-          ptx-${{ runner.os }}-${{ runner.arch }}-${{ inputs.toolkit }}-
-          ${{ hashFiles('mlx/backend/cuda/**') }}
-
-    - uses: actions/setup-python@v6
-      with:
-        python-version: ${{ inputs.python-version }}
-
-    - name: Setup Python venv
-      shell: bash
-      run: |
-        echo "::group::Setup Python venv"
-        python -m venv .venv
-        source .venv/bin/activate
-        pip install setuptools cmake typing_extensions
-        echo PATH=$PATH >> $GITHUB_ENV
-        # Search python packages in .venv
-        echo PYTHONPATH=`python -c 'import sys; print(sys.path[-1])'` >> $GITHUB_ENV
-        echo "::endgroup::"
-
-    - name: Set swap space
-      if: ${{ startsWith(inputs.toolkit, 'cuda') }}
-      uses: pierotofy/set-swap-space@fc79b3f67fa8a838184ce84a674ca12238d2c761
-      with:
-        swap-size-gb: 16
-
-    - name: Install CUDA toolkit
-      if: ${{ startsWith(inputs.toolkit, 'cuda') }}
-      shell: bash
-      env:
-        # Note: the CI machine does not meet CUDA 13's driver requirement.
-        # Compatibility matrix:
-        # https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html
-        PACKAGES: |
-          {
-            "cuda-12.6": "libcudnn9-dev-cuda-12 cuda-compiler-12-6 cuda-libraries-dev-12-6",
-            "cuda-12.9": "libcudnn9-dev-cuda-12 cuda-compiler-12-9 cuda-libraries-dev-12-9",
-            "cuda-13.0": "libcudnn9-dev-cuda-13 cuda-compiler-13-0 cuda-libraries-dev-13-0"
-          }
-      run: |
-        echo "::group::Install CUDA toolkit"
-        # The CUDA binaries are hosted in the "sbsa" repo, the "arm64" repo is
-        # Jetson specific. SBSA means Arm Server Base System Architecture.
-        ARCH=${{ runner.arch == 'arm64' && 'sbsa' || 'x86_64' }}
-        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$ARCH/cuda-keyring_1.1-1_all.deb
-        sudo dpkg -i cuda-keyring_1.1-1_all.deb
-        sudo apt-get update
-        sudo apt-get install -y --no-install-recommends \
-            libnccl2 libnccl-dev \
-            ${{ fromJson(env.PACKAGES)[inputs.toolkit] }}
-        echo "/usr/local/${{ inputs.toolkit }}/bin" >> $GITHUB_PATH
-        echo "::endgroup::"
-
-    - name: CUDA packages and driver report
-      if: ${{ startsWith(inputs.toolkit, 'cuda') }}
-      shell: bash
-      run: |
-        echo "::group::Installed NVIDIA and CUDA packages"
-        dpkg -l | egrep "cuda|nvidia" -i
-        echo "::endgroup::"
-        echo "::group::NVIDIA-SMI Status"
-        nvidia-smi || true
-        echo "::endgroup::"
diff --git a/.github/actions/setup-macos/action.yml b/.github/actions/setup-macos/action.yml
deleted file mode 100644
index 97655daa3a..0000000000
--- a/.github/actions/setup-macos/action.yml
+++ /dev/null
@@ -1,32 +0,0 @@
-name: 'Setup macOS Environment'
-description: 'Install dependencies for macOS builds'
-
-inputs:
-  python-version:
-    description: 'Python version to use'
-    required: false
-    default: '3.10'
-
-runs:
-  using: "composite"
-  steps:
-    - name: Install Homebrew packages
-      shell: sh
-      run: /opt/homebrew/bin/brew install openmpi
-
-    - name: Verify MetalToolchain installed
-      shell: bash
-      run: xcodebuild -showComponent MetalToolchain
-
-    - uses: astral-sh/setup-uv@v7
-
-    - name: Setup Python venv
-      shell: bash
-      run: |
-        echo "::group::Setup Python venv"
-        uv venv --python ${{ inputs.python-version }} --managed-python
-        source .venv/bin/activate
-        echo PATH=$PATH >> $GITHUB_ENV
-        # Search python packages in .venv
-        echo PYTHONPATH=`python -c 'import sys; print(sys.path[-1])'` >> $GITHUB_ENV
-        echo "::endgroup::"
diff --git a/.github/actions/setup-windows/action.yml b/.github/actions/setup-windows/action.yml
deleted file mode 100644
index 83afd4de12..0000000000
--- a/.github/actions/setup-windows/action.yml
+++ /dev/null
@@ -1,42 +0,0 @@
-name: 'Setup Windows environment'
-
-inputs:
-  python-version:
-    description: 'Version of python to set up'
-    required: false
-    default: '3.14'
-  use-ccache:
-    description: 'Whether to enable ccache'
-    required: false
-    default: 'true'
-
-runs:
-  using: 'composite'
-  steps:
-    - name: Use ccache
-      if: ${{ inputs.use-ccache == 'true' }}
-      uses: hendrikmuhs/ccache-action@v1.2
-      with:
-        key: ccache-${{ runner.os }}-${{ runner.arch }}-cpu
-        max-size: 1GB
-
-    - name: Setup Visual Studio cmd
-      shell: cmd
-      run: |
-        :: Find out path to VS.
-        pushd "C:\Program Files (x86)\Microsoft Visual Studio\Installer\"
-        for /f "delims=" %%x in ('.\vswhere.exe -latest -property InstallationPath') do set VSPATH=%%x
-        popd
-        :: Import VS vars.
-        call "%VSPATH%\VC\Auxiliary\Build\vcvarsall.bat" x64
-        :: Export to all steps.
-        >>%GITHUB_ENV% set
-
-    - uses: astral-sh/setup-uv@v7
-
-    - name: Setup Python venv
-      shell: cmd
-      run: |
-        uv venv --python ${{ inputs.python-version }}
-        call ".venv/Scripts/activate.bat"
-        >>%GITHUB_ENV% set
diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml
new file mode 100644
index 0000000000..731968d4d4
--- /dev/null
+++ b/.github/actions/setup/action.yml
@@ -0,0 +1,256 @@
+name: 'Setup environment'
+description: 'Install dependencies for Linux and Windows builds'
+
+inputs:
+  toolkit:
+    description: 'Which toolkit to install'
+    required: false
+    default: 'cpu'
+  python-version:
+    description: 'Version of python to set up'
+    required: false
+    default: '3.12'
+  use-ccache:
+    description: 'Whether to enable ccache'
+    required: false
+    default: 'true'
+  ccache-key:
+    description: 'Extra key to use in the key of ccache'
+    required: false
+    default: 'test'
+  ccache-save:
+    description: 'Whether the ccache in this workflow will be saved'
+    required: false
+    default: 'auto'
+
+outputs:
+  cmake-args:
+    description: 'The args for generating CMake project'
+    value: ${{ steps.cmake-args.outputs.cmakeArgs }}
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Install Linux dependencies
+      if: runner.os == 'Linux'
+      shell: bash
+      run: |
+        echo "::group::Install common dependencies"
+        sudo apt-get update
+        sudo apt-get install -y --no-install-recommends \
+            gdb g++ ninja-build zip \
+            libblas-dev liblapack-dev liblapacke-dev \
+            openmpi-bin openmpi-common libopenmpi-dev
+        echo "::endgroup::"
+
+    - name: Install macOS dependencies
+      if: runner.os == 'macOS'
+      shell: bash
+      run: |
+        echo "::group::Install macOS dependencies"
+        brew install openmpi
+        xcodebuild -showComponent MetalToolchain
+        echo "::endgroup::"
+
+    - name: Setup Windows environment
+      if: runner.os == 'Windows'
+      shell: cmd  # cmd's echo prints quotes literally, so workflow commands stay unquoted.
+      run: |
+        echo ::group::Setup environment
+        :: Find out path to Visual Studio.
+        pushd "C:\Program Files (x86)\Microsoft Visual Studio\Installer\"
+        for /f "delims=" %%x in ('.\vswhere.exe -latest -property InstallationPath') do set VSPATH=%%x
+        popd
+        :: Import Visual Studio vars for the runner's architecture.
+        call "%VSPATH%\VC\Auxiliary\Build\vcvarsall.bat" ${{ runner.arch }}
+        :: Avoid using default TMP which uses short path and causes mismatch of CCACHE_BASEDIR.
+        set TMP=%RUNNER_TEMP%
+        set TEMP=%RUNNER_TEMP%
+        set UV_CACHE_DIR=%RUNNER_TEMP%
+        :: The cuda headers are downloaded: check by content and ignore include ctime/mtime.
+        set CCACHE_COMPILERCHECK=content
+        set CCACHE_SLOPPINESS=include_file_ctime,include_file_mtime
+        :: Export the whole environment (plain "set" lists it) to all following steps.
+        >>%GITHUB_ENV% set
+        echo ::endgroup::
+        echo "::endgroup::"
+
+    - uses: astral-sh/setup-uv@v8.2.0
+      with:
+        enable-cache: false
+        quiet: true
+
+    - name: Use ccache
+      if: inputs.use-ccache == 'true'
+      uses: hendrikmuhs/ccache-action@v1.2.23
+      with:
+        key: v7-${{ inputs.ccache-key }}-${{ runner.os }}-${{ runner.arch }}-${{ inputs.toolkit }}
+        max-size: ${{ inputs.toolkit == 'cpu' && '200MB' || '600MB' }}
+        save: ${{ !startsWith(github.ref, 'refs/pull/') && (inputs.ccache-save != 'false') }}
+        # ccache-action bug: running "apt-get update" fails on large arm runner.
+        update-package-index: false
+
+    - name: Cache JIT-compiled CUDA kernels
+      if: runner.os == 'Linux' && startsWith(inputs.toolkit, 'cuda')
+      uses: actions/cache@v5
+      with:
+        path: /tmp/mlx-ptx-cache
+        key: >-
+          ptx-${{ runner.os }}-${{ runner.arch }}-${{ inputs.toolkit }}-
+          ${{ hashFiles('mlx/backend/cuda/**') }}
+
+    - name: Setup Python venv
+      if: runner.os != 'Windows'
+      shell: bash
+      run: |
+        echo "::group::Setup Python venv"
+        uv venv --python ${{ inputs.python-version }} --managed-python
+        # Make sure all builds use the same cmake binary.
+        uv pip install cmake
+        source .venv/bin/activate
+        echo PATH=$PATH >> $GITHUB_ENV
+        # Search python packages in .venv
+        echo PYTHONPATH=`python -c 'import sys; print(sys.path[-1])'` >> $GITHUB_ENV
+        echo "::endgroup::"
+
+    - name: Setup Python venv (Windows)
+      if: runner.os == 'Windows'
+      shell: cmd  # cmd keeps going after a failed command, hence the explicit "|| exit /b".
+      run: |
+        echo ::group::Setup Python venv
+        uv venv --python ${{ inputs.python-version }}${{ runner.arch == 'arm64' && '-arm64' || ''}} || exit /b
+        uv pip install cmake || exit /b
+        call ".venv/Scripts/activate.bat"
+        >>%GITHUB_ENV% set
+        echo ::endgroup::
+
+    - name: Install CUDA toolkit (Linux)
+      if: runner.os == 'Linux' && startsWith(inputs.toolkit, 'cuda')
+      shell: bash
+      env:
+        PACKAGES: |
+          {
+            "cuda-12.6": "libcudnn9-dev-cuda-12 cuda-compiler-12-6 cuda-libraries-dev-12-6",
+            "cuda-12.9": "libcudnn9-dev-cuda-12 cuda-compiler-12-9 cuda-libraries-dev-12-9",
+            "cuda-13.0": "libcudnn9-dev-cuda-13 cuda-compiler-13-0 cuda-libraries-dev-13-0"
+          }
+      run: |
+        echo "::group::Install CUDA toolkit"
+        # The CUDA binaries are hosted in the "sbsa" repo, the "arm64" repo is
+        # Jetson specific. SBSA means Arm Server Base System Architecture.
+        ARCH=${{ runner.arch == 'arm64' && 'sbsa' || 'x86_64' }}
+        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$ARCH/cuda-keyring_1.1-1_all.deb
+        sudo dpkg -i cuda-keyring_1.1-1_all.deb
+        sudo apt-get update
+        sudo apt-get install -y --no-install-recommends \
+            libnccl2 libnccl-dev \
+            ${{ fromJson(env.PACKAGES)[inputs.toolkit] }}
+        echo "/usr/local/${{ inputs.toolkit }}/bin" >> $GITHUB_PATH
+        echo "::endgroup::"
+
+    - name: Install CUDA Toolkit (Windows)
+      if: runner.os == 'Windows' && startsWith(inputs.toolkit, 'cuda')
+      shell: powershell
+      env:  # JSON maps keyed by the toolkit input; both are read with fromJson() below.
+        INSTALLERS: |
+          {
+            "cuda-12.6": "https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_561.17_windows.exe",
+            "cuda-12.9": "https://developer.download.nvidia.com/compute/cuda/12.9.1/local_installers/cuda_12.9.1_576.57_windows.exe",
+            "cuda-13.0": "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_windows.exe"
+          }
+        PACKAGES: |
+          {
+            "cuda-12.6": ["cudart_12.6", "nvcc_12.6", "cublas_12.6", "cublas_dev_12.6", "cufft_12.6", "cufft_dev_12.6", "nvrtc_12.6", "nvrtc_dev_12.6"],
+            "cuda-12.9": ["cudart_12.9", "nvcc_12.9", "cublas_12.9", "cublas_dev_12.9", "cufft_12.9", "cufft_dev_12.9", "nvrtc_12.9", "nvrtc_dev_12.9"],
+            "cuda-13.0": ["cudart_13.0", "nvcc_13.0", "cublas_13.0", "cublas_dev_13.0", "cufft_13.0", "cufft_dev_13.0", "nvrtc_13.0", "nvrtc_dev_13.0", "crt_13.0", "nvvm_13.0", "nvptxcompiler_13.0"]
+          }
+      run: |
+        echo "::group::Install CUDA toolkit"
+        $ErrorActionPreference = "Stop"
+        $cudaUrl = "${{ fromJson(env.INSTALLERS)[inputs.toolkit] }}"
+        $cudaInstaller = "./install.exe"
+        # Download the installer selected by the toolkit input.
+        echo "Downloading '$cudaUrl'..."
+        $webClient = New-Object System.Net.WebClient
+        $webClient.DownloadFile($cudaUrl, $cudaInstaller)
+        echo "Downloaded: $cudaInstaller ($(([math]::Round((Get-Item $cudaInstaller).Length / 1MB, 2))) MB)"
+        # Silent (-s) install of only the listed components; avoid the automatic $args variable.
+        $installerArgs = "-s ${{ join(fromJson(env.PACKAGES)[inputs.toolkit], ' ') }}"
+        echo "Running '$cudaInstaller $installerArgs'..."
+        Start-Process -FilePath $cudaInstaller -ArgumentList "$installerArgs" -NoNewWindow -Wait
+        $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path  # installed toolkit dir
+        echo "$cudaPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+        echo "::endgroup::"
+
+    - name: Install cuDNN (Windows)
+      if: runner.os == 'Windows' && startsWith(inputs.toolkit, 'cuda')
+      id: cudnn  # The "Generate CMake args" step reads steps.cudnn.outputs.cudnnDir.
+      shell: powershell
+      env:
+        ARCHIVES: |
+          {
+            "cuda-12.6": "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.23.2.1_cuda12-archive.zip",
+            "cuda-12.9": "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.23.2.1_cuda12-archive.zip",
+            "cuda-13.0": "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.23.2.1_cuda13-archive.zip"
+          }
+      run: |
+        echo "::group::Install cuDNN"
+        $ErrorActionPreference = "Stop"
+        $cudnnUrl = "${{ fromJson(env.ARCHIVES)[inputs.toolkit] }}"
+        $cudnnZip = "cudnn.zip"
+        # Download the cuDNN archive mapped to the selected toolkit.
+        echo "Downloading '$cudnnUrl'..."
+        $webClient = New-Object System.Net.WebClient
+        $webClient.DownloadFile($cudnnUrl, $cudnnZip)
+        echo "Downloaded: $cudnnZip ($(([math]::Round((Get-Item $cudnnZip).Length / 1MB, 2))) MB)"
+        # Unpack it and publish the extracted directory (with forward slashes) as cudnnDir.
+        echo "Extracting..."
+        Expand-Archive -Path $cudnnZip -DestinationPath cudnn-extracted
+        $cudnnDir = (Get-ChildItem -Path cudnn-extracted -Directory)[0].FullName
+        echo "cudnnDir=$($cudnnDir -replace '\\', '/')" | Out-File -FilePath $env:GITHUB_OUTPUT -Encoding utf8 -Append
+        echo "::endgroup::"
+
+    - name: Generate CMake args
+      id: cmake-args
+      shell: bash
+      run: |
+        echo "::group::Generate CMake args"
+        cmakeArgs=(
+          "-G Ninja"
+        )
+        if ${{ runner.os == 'Windows' }} ; then
+          cmakeArgs+=(
+            "-DCMAKE_C_COMPILER=cl"
+            "-DCMAKE_CXX_COMPILER=cl"
+            "-DCMAKE_RC_COMPILER=rc"
+          )
+        else
+          cmakeArgs+=(
+            "-DCMAKE_COMPILE_WARNING_AS_ERROR=ON"
+          )
+        fi
+        if ${{ startsWith(inputs.toolkit, 'cuda') }} ; then
+          cmakeArgs+=("-DMLX_BUILD_CUDA=ON")
+          # Some machines have no GPU.
+          if ! __nvcc_device_query ; then
+            # Use a fallback arch for testing.
+            cmakeArgs+=("-DMLX_CUDA_ARCHITECTURES=80")
+            # Can not build tests and stubs when the built executables can not run.
+            cmakeArgs+=("-DMLX_BUILD_TESTS=OFF")
+            cmakeArgs+=("-DMLX_BUILD_PYTHON_STUBS=OFF")
+          fi
+          echo
+          # Set cuDNN paths.
+          if ${{ runner.os == 'Windows' }} ; then
+            cmakeArgs+=(
+              "-DCUDNN_INCLUDE_PATH=${{ steps.cudnn.outputs.cudnnDir }}/include"
+              "-DCUDNN_LIBRARY_PATH=${{ steps.cudnn.outputs.cudnnDir }}/lib/x64"
+            )
+          fi
+        else
+          cmakeArgs+=("-DMLX_BUILD_CUDA=OFF")
+        fi
+        # Pass to following steps.
+        IFS=" "
+        echo ${cmakeArgs[*]}
+        echo "cmakeArgs=${cmakeArgs[*]}" >> $GITHUB_OUTPUT
+        echo "::endgroup::"
diff --git a/.github/actions/test-linux/action.yml b/.github/actions/test-linux/action.yml
index 24cda6c103..33ca15268f 100644
--- a/.github/actions/test-linux/action.yml
+++ b/.github/actions/test-linux/action.yml
@@ -1,15 +1,24 @@
-name: 'Run Linux tests'
-
-inputs:
-  has-gpu:
-    description: 'Run GPU tests'
-    required: false
-    default: false
+name: 'Run tests'
+description: 'Run Python and C++ tests on Linux'
 
 runs:
-  using: "composite"
+  using: 'composite'
   steps:
+    - name: Check GPU support
+      id: gpu-check
+      shell: bash
+      run: |
+        # Report through the `good` output whether the CUDA device query
+        # succeeds; the steps below use it to pick the CPU or the GPU tests.
+        echo "::group::Check GPU support"
+        good=false
+        if __nvcc_device_query ; then good=true ; fi
+        echo "good=$good" >> $GITHUB_OUTPUT
+        echo
+        echo "::endgroup::"
+
     - name: Run MPI tests
+      if: steps.gpu-check.outputs.good == 'false'
       shell: bash
       run: |
         echo "::group::MPI tests"
@@ -17,7 +26,7 @@ runs:
         echo "::endgroup::"
 
     - name: Run distributed tests
-      if: ${{ inputs.has-gpu == 'false' }}
+      if: steps.gpu-check.outputs.good == 'false'
       shell: bash
       run: |
         echo "::group::Distributed tests"
@@ -30,7 +39,7 @@ runs:
         echo "::endgroup::"
 
     - name: Run Python tests - CPU
-      if: ${{ inputs.has-gpu == 'false' }}
+      if: steps.gpu-check.outputs.good == 'false'
       shell: bash
       env:
         DEVICE: cpu
@@ -40,7 +49,7 @@ runs:
         echo "::endgroup::"
 
     - name: Run Python tests - GPU
-      if: ${{ inputs.has-gpu == 'true' }}
+      if: steps.gpu-check.outputs.good == 'true'
       shell: bash
       env:
         DEVICE: gpu
@@ -56,17 +65,17 @@ runs:
         DEVICE: cpu
       run: |
         echo "::group::CPP tests - CPU"
-        ./build/tests/tests
+        ./build/cpp/mlx/tests/tests
         echo "::endgroup::"
 
     - name: Run CPP tests - GPU
-      if: ${{ inputs.has-gpu == 'true' }}
+      if: steps.gpu-check.outputs.good == 'true'
       shell: bash
       env:
         DEVICE: gpu
       run: |
         echo "::group::CPP tests - GPU"
-        ./build/tests/tests -sfe="*linalg_tests.cpp"
+        ./build/cpp/mlx/tests/tests -sfe="*linalg_tests.cpp"
         echo "::endgroup::"
 
     - name: Show stack trace on crash
diff --git a/.github/actions/test-wheel/action.yml b/.github/actions/test-wheel/action.yml
new file mode 100644
index 0000000000..6e282d2691
--- /dev/null
+++ b/.github/actions/test-wheel/action.yml
@@ -0,0 +1,58 @@
+name: 'Test wheel'
+description: 'Run tests with the built wheels'
+
+inputs:
+  toolkit:
+    description: 'Which toolkit to test'
+    required: false
+    default: 'cpu'
+
+runs:
+  using: 'composite'
+  steps:
+    # The frontend artifact name embeds the interpreter's major.minor version.
+    - name: Get Python version
+      id: python
+      shell: bash
+      run: python -c "import sys; print(f'version={sys.version_info.major}.{sys.version_info.minor}')" >> $GITHUB_OUTPUT
+
+    # Both artifacts are requested by exact `name`, which extracts the wheels
+    # directly into `wheelhouse`. A `pattern` download without
+    # `merge-multiple` nests them in a per-artifact sub-directory, where the
+    # `wheelhouse/*.whl` globs of the test step can not find them.
+    - name: Download frontend packages
+      uses: actions/download-artifact@v8
+      with:
+        name: frontend-${{ runner.os }}-${{ runner.arch }}-py${{ steps.python.outputs.version }}
+        path: wheelhouse
+
+    - name: Download backend packages
+      uses: actions/download-artifact@v8
+      with:
+        name: backend-${{ inputs.toolkit }}-${{ runner.os }}-${{ runner.arch }}
+        path: wheelhouse
+
+    - name: Test local packages
+      shell: bash
+      run: |
+        echo "::group::Test local packages"
+        uv pip install torch --index-url https://download.pytorch.org/whl/cpu
+        uv pip install numpy
+        # Each GitHub expression used as a condition is rendered as the shell
+        # command `true` or `false`. The backend wheel is installed first.
+        if ${{ inputs.toolkit == 'cpu' }} ; then
+          uv pip install wheelhouse/mlx_cpu*.whl
+          uv pip install wheelhouse/mlx-*.whl
+        elif ${{ startsWith(inputs.toolkit, 'cuda') }} ; then
+          uv pip install wheelhouse/mlx_cuda*.whl
+          uv pip install wheelhouse/mlx-*.whl
+        elif ${{ inputs.toolkit == 'metal' }} ; then
+          # Only the wheels tagged for macOS 26 are installed.
+          uv pip install wheelhouse/mlx_metal-*-macosx_26_0_arm64.whl
+          uv pip install wheelhouse/mlx-*-macosx_26_0_arm64.whl
+        else
+          echo "No matching backend wheel to install"
+          exit 1
+        fi
+        python -m unittest discover -v python/tests
+        echo "::endgroup::"
diff --git a/.github/actions/test-windows/action.yml b/.github/actions/test-windows/action.yml
index c2714df5a8..ac812d8af9 100644
--- a/.github/actions/test-windows/action.yml
+++ b/.github/actions/test-windows/action.yml
@@ -1,10 +1,13 @@
-name: 'Run tests on Windows'
+name: 'Run tests'
+description: 'Run Python and C++ tests on Windows'
 
 runs:
   using: 'composite'
   steps:
     - name: Run Python tests - CPU
       shell: bash
+      env:
+        DEVICE: cpu
       run: |
         echo "::group::Python tests - CPU"
         python -m unittest discover python/tests -v
@@ -16,6 +19,5 @@ runs:
         DEVICE: cpu
       run: |
         echo "::group::CPP tests - CPU"
-        ./build/tests.exe -tce="*gguf*"
-        ./build/test_teardown.exe
+        ./build/cpp/mlx/tests.exe -tce="*gguf*"
         echo "::endgroup::"
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 69777f3cd4..faec6672a4 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -23,43 +23,54 @@ jobs:
       - uses: actions/checkout@v7
       - uses: pre-commit/action@v3.0.1
 
-  linux_build_and_test:
-    name: Linux (cpu, ${{ matrix.arch }})
-    needs: check_lint
-    strategy:
-      fail-fast: false
-      matrix:
-        arch: ['x86_64', 'aarch64']
-    runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }}
-    steps:
-      - uses: actions/checkout@v7
-      - uses: ./.github/actions/setup-linux
-      - uses: ./.github/actions/build-linux
-      - uses: ./.github/actions/test-linux
-      - run: df -h
-
-  cuda_build_and_test:
-    name: Linux (${{ matrix.toolkit }}, ${{ matrix.arch }})
+  build_and_test:
+    name: ${{ matrix.os }} (${{ matrix.toolkit }}, ${{ matrix.arch }})
     if: github.repository == 'ml-explore/mlx'
     needs: check_lint
     strategy:
       fail-fast: false
       matrix:
+        os: ['Linux', 'Windows']
         arch: ['x86_64', 'aarch64']
-        toolkit: ['cuda-12.6', 'cuda-12.9']
-    runs-on: ${{ matrix.arch == 'x86_64' && 'gpu-t4-4-core' || 'ubuntu-22.04-arm' }}
+        toolkit: ['cpu', 'cuda-12.6', 'cuda-12.9', 'cuda-13.0']
+        exclude:
+          # CUDA does not support Windows on arm.
+          - os: 'Windows'
+            arch: 'aarch64'
+            toolkit: 'cuda-12.6'
+          - os: 'Windows'
+            arch: 'aarch64'
+            toolkit: 'cuda-12.9'
+          - os: 'Windows'
+            arch: 'aarch64'
+            toolkit: 'cuda-13.0'
+          # CUDA 12.6 does not compile with CUTLASS on Windows.
+          - os: 'Windows'
+            arch: 'x86_64'
+            toolkit: 'cuda-12.6'
+    runs-on: |-
+      ${{ case(matrix.os == 'Windows',
+               case(matrix.arch == 'aarch64', 'windows-11-arm',
+                    'windows-2022'),
+               case(matrix.arch == 'x86_64' && startsWith(matrix.toolkit, 'cuda'), 'gpu-t4-4-core',
+                    matrix.arch == 'aarch64', 'ubuntu-22.04-arm',
+                    'ubuntu-22.04'))
+      }}
     steps:
       - uses: actions/checkout@v7
-      - uses: ./.github/actions/setup-linux
+      - uses: ./.github/actions/setup
+        id: setup
         with:
           toolkit: ${{ matrix.toolkit }}
-      - uses: ./.github/actions/build-linux
+      - uses: ./.github/actions/build
         with:
-          toolkit: ${{ matrix.toolkit }}
+          cmake-args: ${{ steps.setup.outputs.cmake-args }}
+          # For MSVC, Ninja/Release is the only config supported by ccache.
+          debug: ${{ matrix.os != 'Windows' }}
       - uses: ./.github/actions/test-linux
-        if: matrix.arch == 'x86_64'
-        with:
-          has-gpu: true
+        if: matrix.os == 'Linux' && (matrix.toolkit == 'cpu' || matrix.arch == 'x86_64')
+      - uses: ./.github/actions/test-windows
+        if: matrix.os == 'Windows' && matrix.toolkit == 'cpu'
 
   mac_build_and_test:
     name: macOS (${{ matrix.macos-target }})
@@ -73,19 +84,12 @@ jobs:
     needs: check_lint
     steps:
       - uses: actions/checkout@v7
-      - uses: ./.github/actions/setup-macos
+      - uses: ./.github/actions/setup
+        with:
+          toolkit: 'metal'
+          ccache-key: 'test-${{ matrix.macos-target }}'
       - uses: ./.github/actions/build-macos
 
-  windows_build_and_test:
-    name: Windows (cpu, x86_64)
-    needs: check_lint
-    runs-on: windows-2025
-    steps:
-      - uses: actions/checkout@v7
-      - uses: ./.github/actions/setup-windows
-      - uses: ./.github/actions/build-windows
-      - uses: ./.github/actions/test-windows
-
   build_documentation:
     name: Build Documentation
     if: github.repository == 'ml-explore/mlx'
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
deleted file mode 100644
index 9e750ff47d..0000000000
--- a/.github/workflows/nightly.yml
+++ /dev/null
@@ -1,108 +0,0 @@
-name: Nightly Build
-
-on:
-  schedule:
-    - cron: 33 6 * * 1-5
-  workflow_dispatch:
-
-permissions:
-  contents: read
-
-jobs:
-  build_linux_release:
-    strategy:
-      fail-fast: false
-      matrix:
-        python_version: ["3.10", "3.14"]
-    runs-on: ubuntu-22.04
-    steps:
-      - uses: actions/checkout@v7
-      - uses: ./.github/actions/setup-linux
-      - uses: ./.github/actions/build-linux-release
-        with:
-          build-backend: ${{ matrix.python-version == '3.10' }}
-          arch: "x86_64"
-      - name: Upload mlx artifacts
-        uses: actions/upload-artifact@v7
-        with:
-          name: linux-wheels-${{ matrix.python_version }}
-          path: wheelhouse/mlx-*.whl
-          retention-days: 7
-      - name: Upload mlx-cpu artifacts
-        if: matrix.python_version == '3.10'
-        uses: actions/upload-artifact@v7
-        with:
-          name: mlx-cpu
-          path: wheelhouse/mlx_cpu-*.whl
-          retention-days: 7
-      - run: df -h
-
-  build_linux_with_tests:
-    strategy:
-      fail-fast: false
-      matrix:
-        python_version: ["3.11", "3.12", "3.13", "3.14"]
-        runner:
-          - ubuntu-22.04
-          - ubuntu-22.04-arm
-    runs-on: ${{ matrix.runner }}
-    steps:
-      - uses: actions/checkout@v7
-      - uses: ./.github/actions/setup-linux
-        with:
-          python-version: ${{ matrix.python_version }}
-      - uses: ./.github/actions/build-linux
-      - uses: ./.github/actions/test-linux
-      - run: df -h
-
-  build_mac_release:
-    if: github.repository == 'ml-explore/mlx'
-    strategy:
-      matrix:
-        python-version: ["3.10", "3.13"]
-    runs-on: [self-hosted, macos]
-    steps:
-      - uses: actions/checkout@v7
-      - uses: ./.github/actions/setup-macos
-        with:
-          python-version: ${{ matrix.python-version }}
-      - uses: ./.github/actions/build-macos
-      - name: Build macOS 14 package
-        uses: ./.github/actions/build-macos-release
-        with:
-          macos-target: '14.0'
-          build-backend: ${{ matrix.python-version == '3.10' }}
-      - name: Build macOS 15 package
-        uses: ./.github/actions/build-macos-release
-        with:
-          macos-target: '15.0'
-          build-backend: ${{ matrix.python-version == '3.10' }}
-      - name: Build macOS 26 package
-        uses: ./.github/actions/build-macos-release
-        with:
-          macos-target: '26.2'
-          build-backend: ${{ matrix.python-version == '3.10' }}
-
-  build_cuda_release:
-    if: github.repository == 'ml-explore/mlx'
-    strategy:
-      matrix:
-        arch: ['x86_64', 'aarch64']
-        toolkit: ['cuda-12.9', 'cuda-13.0']
-    runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22-large' || 'ubuntu-22-large-arm' }}
-    steps:
-      - uses: actions/checkout@v7
-      - uses: ./.github/actions/setup-linux
-        with:
-          toolkit: ${{ matrix.toolkit }}
-          ccache-key: 'ccache-release'
-      - name: Build Python package
-        uses: ./.github/actions/build-cuda-release
-        with:
-          arch: ${{ matrix.arch }}
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v7
-        with:
-          name: mlx-${{ matrix.toolkit }}-${{ matrix.arch }}
-          path: wheelhouse/mlx_cuda_*.whl
-          retention-days: 7
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index ab31ac53d2..d5c8a78b8f 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -1,27 +1,35 @@
-name: PyPI Release
+name: 'Release build'
+# Builds Python wheels for nightly or official releases.
 
 on:
   push:
     tags:
       - 'v*'
-    branches:
-      - 'test-publish/*'
   workflow_dispatch:
     inputs:
-      dry_run:
-        description: 'Dry run (do not publish to PyPi)'
+      publish:
+        description: 'Publish to PyPi'
         required: false
         type: boolean
-      dev_release:
+      dev-release:
         description: 'Development release (DEV_RELEASE=1)'
         required: false
         type: boolean
+  schedule:
+    - cron: 33 6 * * 1-5
+
+# In jobs we must use |*publish| instead of |inputs.publish| because we can not
+# set default value for workflow_dispatch inputs reliably.
+env:
+  publish: &publish ${{ inputs.publish || github.event_name == 'push' }}
+  pypi-env: &pypi-env ${{ (inputs.publish || github.event_name == 'push') && 'pypi' || 'dry-run' }}
 
 permissions:
   contents: read
 
 jobs:
   build_documentation:
+    name: Build documentation
     if: github.repository == 'ml-explore/mlx'
     runs-on: ubuntu-22.04
     steps:
@@ -29,7 +37,8 @@ jobs:
       - uses: ./.github/actions/build-docs
 
   deploy_documentation:
-    if: ${{ !inputs.dry_run }}
+    name: Deploy documentation
+    if: *publish
     needs: build_documentation
     permissions:
       pages: write
@@ -43,207 +52,268 @@ jobs:
         id: deployment
         uses: actions/deploy-pages@v5
 
-  build_linux_release:
-    if: github.repository == 'ml-explore/mlx'
+  build_frontend:
+    name: ${{ matrix.os }} (python-${{ matrix.python-version }}, ${{ matrix.arch }})
+    # Like the other build jobs, only run in the upstream repository.
+    if: github.repository == 'ml-explore/mlx'
     strategy:
       matrix:
-        python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+        os: ['Linux', 'Windows']
         arch: ['x86_64', 'aarch64']
-    runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }}
-    env:
+        python-version: ['3.10', '3.11', '3.12', '3.13', '3.14']
+        # There is no cp310 binary for Windows on arm.
+        exclude:
+          - os: 'Windows'
+            arch: 'aarch64'
+            python-version: '3.10'
+    runs-on: |-
+      ${{ case(matrix.os == 'Windows',
+               case(matrix.arch == 'aarch64', 'windows-11-arm',
+                    'windows-2022'),
+               case(matrix.arch == 'aarch64', 'ubuntu-22.04-arm',
+                    'ubuntu-22.04'))
+      }}
+    env: &build-env
       PYPI_RELEASE: 1
-      DEV_RELEASE: ${{ inputs.dev_release && 1 || 0 }}
+      DEV_RELEASE: ${{ inputs.dev-release && 1 || 0 }}
     steps:
       - uses: actions/checkout@v7
-      - uses: ./.github/actions/setup-linux
+      - uses: ./.github/actions/setup
+        id: setup
         with:
-          python-version: ${{ matrix.python_version }}
-          use-ccache: false
-      - uses: ./.github/actions/build-linux-release
+          python-version: ${{ matrix.python-version }}
+          ccache-key: 'release'
+          ccache-save: false
+      - uses: ./.github/actions/build-wheel
         with:
-          build-backend: ${{ matrix.python_version == '3.10' }}
-          arch: ${{ matrix.arch }}
-      - name: Upload MLX artifacts
-        uses: actions/upload-artifact@v7
+          cmake-args: ${{ steps.setup.outputs.cmake-args }}
+          build-backend: false
+      - uses: actions/upload-artifact@v7
         with:
-          overwrite: true
-          name: linux-wheels-${{ matrix.python_version }}-${{ matrix.arch }}
+          name: frontend-${{ runner.os }}-${{ runner.arch }}-py${{ matrix.python-version }}
           path: wheelhouse/mlx-*.whl
           if-no-files-found: error
-      - name: Upload CPU artifacts
-        if: matrix.python_version == '3.10'
-        uses: actions/upload-artifact@v7
+
+  build_backend:
+    name: ${{ matrix.os }} (${{ matrix.toolkit }}, ${{ matrix.arch }})
+    if: github.repository == 'ml-explore/mlx'
+    strategy:
+      matrix:
+        os: ['Linux', 'Windows']
+        arch: ['x86_64', 'aarch64']
+        toolkit: ['cpu', 'cuda-12.9', 'cuda-13.0']
+        exclude:
+          # CUDA does not support Windows on arm.
+          - os: 'Windows'
+            arch: 'aarch64'
+            toolkit: 'cuda-12.9'
+          - os: 'Windows'
+            arch: 'aarch64'
+            toolkit: 'cuda-13.0'
+    runs-on: |-
+      ${{ case(matrix.os == 'Windows',
+               case(matrix.arch == 'aarch64', 'windows-11-arm',
+                    'windows-2022'),
+               case(matrix.arch == 'aarch64', 'ubuntu-22-large-arm',
+                    'ubuntu-22-large'))
+      }}
+    env: *build-env
+    steps:
+      - uses: actions/checkout@v7
+      - uses: ./.github/actions/setup
+        id: setup
         with:
-          overwrite: true
-          name: mlx-cpu-${{ matrix.arch }}
-          path: wheelhouse/mlx_cpu-*.whl
-          if-no-files-found: error
+          toolkit: ${{ matrix.toolkit }}
+          ccache-key: 'release'
+      - uses: ./.github/actions/build-wheel
+        with:
+          cmake-args: ${{ steps.setup.outputs.cmake-args }}
+          build-frontend: false
+      - uses: actions/upload-artifact@v7
+        with:
+          name: backend-${{ matrix.toolkit }}-${{ runner.os }}-${{ runner.arch }}
+          path: wheelhouse/*.whl
+          if-no-files-found: ignore
 
-  build_mac_release:
+  build_mac_wheels:
+    name: macOS (python-${{ matrix.python-version }})
     if: github.repository == 'ml-explore/mlx'
     strategy:
       matrix:
-        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
-    runs-on: [self-hosted, macos]
-    env:
-      PYPI_RELEASE: 1
-      DEV_RELEASE: ${{ inputs.dev_release && 1 || 0 }}
+        python-version: ['3.10', '3.11', '3.12', '3.13', '3.14']
+    runs-on: 'macos-26'
+    env: *build-env
     steps:
       - uses: actions/checkout@v7
-      - uses: ./.github/actions/setup-macos
+      - uses: ./.github/actions/setup
+        id: setup
         with:
+          toolkit: 'metal'
           python-version: ${{ matrix.python-version }}
+          ccache-key: 'release'
+          ccache-save: ${{ matrix.python-version == '3.10' }}
       - name: Build macOS 14 package
-        uses: ./.github/actions/build-macos-release
+        uses: ./.github/actions/build-wheel
         with:
           macos-target: '14.0'
+          cmake-args: ${{ steps.setup.outputs.cmake-args }}
           build-backend: ${{ matrix.python-version == '3.10' }}
       - name: Build macOS 15 package
-        uses: ./.github/actions/build-macos-release
+        uses: ./.github/actions/build-wheel
         with:
           macos-target: '15.0'
+          cmake-args: ${{ steps.setup.outputs.cmake-args }}
           build-backend: ${{ matrix.python-version == '3.10' }}
       - name: Build macOS 26 package
-        uses: ./.github/actions/build-macos-release
+        uses: ./.github/actions/build-wheel
         with:
           macos-target: '26.2'
+          cmake-args: ${{ steps.setup.outputs.cmake-args }}
           build-backend: ${{ matrix.python-version == '3.10' }}
-      - name: Upload MLX artifacts
+      - name: Upload frontend packages
         uses: actions/upload-artifact@v7
         with:
-          overwrite: true
-          name: mac-wheels-${{ matrix.python-version }}
-          path: dist/mlx-*.whl
+          name: frontend-${{ runner.os }}-${{ runner.arch }}-py${{ matrix.python-version }}
+          path: wheelhouse/mlx-*.whl
           if-no-files-found: error
-      - name: Upload Metal artifacts
+      - name: Upload backend packages
         if: matrix.python-version == '3.10'
         uses: actions/upload-artifact@v7
         with:
-          overwrite: true
-          name: mlx-metal
-          path: dist/mlx_metal-*.whl
+          name: backend-metal-${{ runner.os }}-${{ runner.arch }}
+          path: wheelhouse/mlx_metal-*.whl
           if-no-files-found: error
 
-  build_cuda_release:
+  test_wheel:
+    name: Test (${{ matrix.os }}, ${{ matrix.toolkit }}, ${{ matrix.arch }})
     if: github.repository == 'ml-explore/mlx'
+    needs: [build_frontend, build_backend, build_mac_wheels]
     strategy:
       matrix:
-        arch: ['x86_64', 'aarch64']
-        toolkit: ['cuda-12.9', 'cuda-13.0']
-    runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22-large' || 'ubuntu-22-large-arm' }}
-    env:
-      PYPI_RELEASE: 1
-      DEV_RELEASE: ${{ inputs.dev_release && 1 || 0 }}
+        os: ['Linux', 'Windows']
+        arch: ['aarch64']
+        toolkit: ['cpu']
+        include:
+          - os: 'Linux'
+            arch: 'x86_64'
+            toolkit: 'cpu'
+          - os: 'Linux'
+            arch: 'x86_64'
+            toolkit: 'cuda-12.9'
+          - os: 'Linux'
+            arch: 'x86_64'
+            toolkit: 'cuda-13.0'
+          - os: 'Windows'
+            arch: 'x86_64'
+            toolkit: 'cpu'
+          - os: 'macOS'
+            arch: 'aarch64'
+            toolkit: 'metal'
+    runs-on: |-
+      ${{ case(matrix.os == 'Windows', case(matrix.arch == 'aarch64', 'windows-11-arm',
+                                            'windows-2022'),
+               matrix.os == 'macOS', fromJson('["self-hosted","macos"]'),
+               case(matrix.arch == 'x86_64' && startsWith(matrix.toolkit, 'cuda'), 'gpu-t4-4-core',
+                    matrix.arch == 'aarch64', 'ubuntu-22.04-arm',
+                    'ubuntu-22.04'))
+      }}
     steps:
       - uses: actions/checkout@v7
-      - uses: ./.github/actions/setup-linux
+      - uses: ./.github/actions/setup
         with:
           toolkit: ${{ matrix.toolkit }}
-          ccache-key: 'ccache-release'
-      - name: Build Python package
-        uses: ./.github/actions/build-cuda-release
-        with:
-          arch: ${{ matrix.arch }}
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v7
+          use-ccache: false
+      - uses: ./.github/actions/test-wheel
         with:
-          overwrite: true
-          name: mlx-${{ matrix.toolkit }}-${{ matrix.arch }}
-          path: wheelhouse/mlx_cuda_*.whl
-          if-no-files-found: error
+          toolkit: ${{ matrix.toolkit }}
 
-  pypi-publish:
-    name: Upload release to PyPI
+  pypi-publish-frontend:
+    name: Publish mlx to PyPI
     runs-on: ubuntu-latest
-    needs: [build_linux_release, build_mac_release]
+    needs: [test_wheel]
     permissions:
       id-token: write
     environment:
-      name: ${{ inputs.dry_run && 'dry-run' || 'pypi' }}
+      name: *pypi-env
       url: https://pypi.org/p/mlx
     steps:
       - uses: actions/download-artifact@v8
         with:
-          pattern: linux-wheels-*
-          merge-multiple: true
-          path: dist
-      - uses: actions/download-artifact@v8
-        with:
-          pattern: mac-wheels-*
+          pattern: frontend-*
           merge-multiple: true
           path: dist
       - name: Display structure of downloaded files
         run: du -ah dist
       - name: Publish package distributions to PyPI
-        if: ${{ !inputs.dry_run }}
+        if: *publish
         uses: pypa/gh-action-pypi-publish@release/v1
         with:
           repository-url: https://upload.pypi.org/legacy/
 
   pypi-publish-cuda:
-    name: Upload CUDA release to PyPI
+    name: Publish mlx-cuda to PyPI
     runs-on: ubuntu-latest
-    needs: [build_cuda_release]
+    needs: [test_wheel]
     permissions:
       id-token: write
     environment:
-      name: ${{ inputs.dry_run && 'dry-run' || 'pypi' }}
+      name: *pypi-env
       url: https://pypi.org/p/mlx-cuda
     steps:
       - uses: actions/download-artifact@v8
         with:
-          pattern: mlx-cuda-*
+          pattern: backend-cuda*
           merge-multiple: true
           path: dist
       - name: Display structure of downloaded files
         run: du -ah dist
       - name: Publish package distributions to PyPI
-        if: ${{ !inputs.dry_run }}
+        if: *publish
         uses: pypa/gh-action-pypi-publish@release/v1
         with:
           repository-url: https://upload.pypi.org/legacy/
 
   pypi-publish-cpu:
-    name: Upload CPU release to PyPI
+    name: Publish mlx-cpu to PyPI
     runs-on: ubuntu-latest
-    needs: [build_linux_release]
+    needs: [test_wheel]
     permissions:
       id-token: write
     environment:
-      name: ${{ inputs.dry_run && 'dry-run' || 'pypi' }}
+      name: *pypi-env
       url: https://pypi.org/p/mlx-cpu
     steps:
       - uses: actions/download-artifact@v8
         with:
-          pattern: mlx-cpu-*
+          pattern: backend-cpu-*
           merge-multiple: true
           path: dist
       - name: Display structure of downloaded files
         run: du -ah dist
       - name: Publish package distributions to PyPI
-        if: ${{ !inputs.dry_run }}
+        if: *publish
         uses: pypa/gh-action-pypi-publish@release/v1
         with:
           repository-url: https://upload.pypi.org/legacy/
 
   pypi-publish-metal:
-    name: Upload Metal release to PyPI
+    name: Publish mlx-metal to PyPI
     runs-on: ubuntu-latest
-    needs: [build_mac_release]
+    needs: [test_wheel]
     permissions:
       id-token: write
     environment:
-      name: ${{ inputs.dry_run && 'dry-run' || 'pypi' }}
+      name: *pypi-env
       url: https://pypi.org/p/mlx-metal
     steps:
       - uses: actions/download-artifact@v8
         with:
-          name: mlx-metal
+          name: backend-metal-macOS-ARM64  # Exact name uploaded by build_mac_wheels; extracts straight into dist.
           path: dist
       - name: Display structure of downloaded files
         run: du -ah dist
       - name: Publish package distributions to PyPI
-        if: ${{ !inputs.dry_run }}
+        if: *publish
         uses: pypa/gh-action-pypi-publish@release/v1
         with:
           repository-url: https://upload.pypi.org/legacy/
diff --git a/.gitignore b/.gitignore
index 1daaa46d12..161d8d67eb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,11 +11,11 @@ __pycache__/
 *.metallib
 
 # Distribution / packaging
+/build/
 python/mlx/core
 python/mlx/share
 python/mlx/include
 .Python
-build/
 develop-eggs/
 dist/
 downloads/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3461faea5b..f8ee65ad4b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -209,6 +209,7 @@ if(MLX_BUILD_METAL)
     if(CMAKE_OSX_DEPLOYMENT_TARGET VERSION_LESS 14.0)
       message(FATAL_ERROR "MLX requires macOS >= 14.0")
     endif()
+    message(STATUS "Building for macOS ${CMAKE_OSX_DEPLOYMENT_TARGET}")
 
     # Get Metal version.
     set(XCRUN_FLAGS "-mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET}")
@@ -278,19 +279,29 @@ if(MLX_BUILD_CPU)
     add_compile_definitions(MLX_USE_ACCELERATE)
     add_compile_definitions(ACCELERATE_NEW_LAPACK)
   elseif(WIN32)
+    if(CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64")
+      set(OPENBLAS_ZIP "OpenBLAS-0.3.33-woa64-dll.zip")
+      set(OPENBLAS_LIB "openblas")
+      set(OPENBLAS_INC "include/openblas")
+    else()
+      set(OPENBLAS_ZIP "OpenBLAS-0.3.33-x64.zip")
+      set(OPENBLAS_LIB "libopenblas")
+      set(OPENBLAS_INC "include")
+    endif()
     # Download and link prebuilt binaries of OpenBLAS. Note that we can only
     # link with the dynamic library, the prebuilt binaries were built with MinGW
     # so static-linking would require linking with MinGW's runtime.
     FetchContent_Declare(
       openblas
-      URL "https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.31/OpenBLAS-0.3.31-x64.zip"
+      URL "https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.33/${OPENBLAS_ZIP}"
     )
     FetchContent_MakeAvailable(openblas)
-    target_link_libraries(mlx
-                          PRIVATE "${openblas_SOURCE_DIR}/lib/libopenblas.lib")
-    target_include_directories(mlx PRIVATE "${openblas_SOURCE_DIR}/include")
+    target_link_libraries(
+      mlx PRIVATE "${openblas_SOURCE_DIR}/lib/${OPENBLAS_LIB}.lib")
+    target_include_directories(mlx
+                               PRIVATE "${openblas_SOURCE_DIR}/${OPENBLAS_INC}")
     # Make sure the DLL file is placed in the same dir with executables.
-    set(OPENBLAS_DLL_FILE "${openblas_SOURCE_DIR}/bin/libopenblas.dll")
+    set(OPENBLAS_DLL_FILE "${openblas_SOURCE_DIR}/bin/${OPENBLAS_LIB}.dll")
     add_custom_command(
       TARGET mlx
       POST_BUILD
diff --git a/MANIFEST.in b/MANIFEST.in
index d0daeb7ae6..632fae78c6 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,6 +1,6 @@
 include CMakeLists.txt
 include mlx.pc.in
-recursive-include mlx/ *
+recursive-include mlx *
 include cmake/*
 include python/src/*
 include python/mlx/py.typed # support type hinting as in PEP-561
diff --git a/mlx/backend/cuda/CMakeLists.txt b/mlx/backend/cuda/CMakeLists.txt
index 10adffc23d..065220e24e 100644
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@@ -72,6 +72,10 @@ target_sources(
           ${CMAKE_CURRENT_SOURCE_DIR}/quantized/convert_fp8.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)
 
+# Put dynamic defines in the dirs.cpp file.
+add_library(mlx_dirs OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/dirs.cpp)
+target_link_libraries(mlx PRIVATE $<BUILD_INTERFACE:mlx_dirs>)
+
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/binary)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unary)
 
@@ -197,8 +201,8 @@ if(WIN32)
   # Pass the locations where CUDA DLLs are placed.
   if(NOT MLX_LOAD_CUDA_LIBS_FROM_PYTHON)
     target_compile_definitions(
-      mlx PUBLIC MLX_CUDA_BIN_DIR="${CUDAToolkit_BIN_DIR}/x64"
-                 MLX_CUDNN_BIN_DIR="${CUDNN_BIN_DIR}")
+      mlx_dirs PRIVATE MLX_CUDA_BIN_DIR="${CUDAToolkit_BIN_DIR}/x64"
+                       MLX_CUDNN_BIN_DIR="${CUDNN_BIN_DIR}")
   endif()
 else()
   # For POSIX we rely on RPATH to search for CUDA libs.
@@ -208,9 +212,11 @@ else()
       APPEND
       PROPERTY INSTALL_RPATH
                # The paths here should match the install_requires in setup.py.
+               "$ORIGIN/../../nvidia/cu13/lib"
                "$ORIGIN/../../nvidia/cublas/lib"
                "$ORIGIN/../../nvidia/cuda_nvrtc/lib"
                "$ORIGIN/../../nvidia/cudnn/lib"
+               "$ORIGIN/../../nvidia/cufft/lib"
                "$ORIGIN/../../nvidia/nccl/lib")
   endif()
 endif()
@@ -233,7 +239,7 @@ install(DIRECTORY ${cccl_SOURCE_DIR}/include/nv
 # The binary of C++ tests will not be installed so it can not find the CCCL
 # headers, and we have to hard-code the path.
 if(MLX_BUILD_TESTS)
-  target_compile_definitions(mlx
+  target_compile_definitions(mlx_dirs
                              PRIVATE MLX_CCCL_DIR="${cccl_SOURCE_DIR}/include")
 endif()
 
diff --git a/mlx/backend/cuda/delayload.cpp b/mlx/backend/cuda/delayload.cpp
index 0ac1cc5e4b..aba7566c5b 100644
--- a/mlx/backend/cuda/delayload.cpp
+++ b/mlx/backend/cuda/delayload.cpp
@@ -7,7 +7,11 @@
 #include <delayimp.h>
 // clang-format on
 
-namespace mlx::core {
+namespace mlx::core::cu {
+
+// Defined in dirs.cpp to avoid invalidating compile cache.
+const char* cuda_bin_dir();
+const char* cudnn_bin_dir();
 
 namespace fs = std::filesystem;
 
@@ -15,35 +19,27 @@ inline fs::path relative_to_current_binary(const char* relative) {
   return fs::absolute(current_binary_dir() / relative);
 }
 
-inline fs::path cublas_bin_dir() {
-#if defined(MLX_CUDA_BIN_DIR)
-  return MLX_CUDA_BIN_DIR;
-#else
-  return relative_to_current_binary("../nvidia/cublas/bin");
-#endif
+inline fs::path cublas_dir() {
+  return cuda_bin_dir() ? fs::path(cuda_bin_dir())
+                        : relative_to_current_binary("../nvidia/cublas/bin");
 }
 
 fs::path load_nvrtc() {
-#if defined(MLX_CUDA_BIN_DIR)
-  fs::path nvrtc_bin_dir = MLX_CUDA_BIN_DIR;
-#else
-  fs::path nvrtc_bin_dir =
-      relative_to_current_binary("../nvidia/cuda_nvrtc/bin");
-#endif
+  fs::path nvrtc_dir = cuda_bin_dir()
+      ? fs::path(cuda_bin_dir())
+      : relative_to_current_binary("../nvidia/cuda_nvrtc/bin");
   // Internally nvrtc loads some libs dynamically, add to search dirs.
-  ::AddDllDirectory(nvrtc_bin_dir.c_str());
-  return nvrtc_bin_dir;
+  ::AddDllDirectory(nvrtc_dir.c_str());
+  return nvrtc_dir;
 }
 
 fs::path load_cudnn() {
-#if defined(MLX_CUDNN_BIN_DIR)
-  fs::path cudnn_bin_dir = MLX_CUDNN_BIN_DIR;
-#else
-  fs::path cudnn_bin_dir = relative_to_current_binary("../nvidia/cudnn/bin");
-#endif
+  fs::path cudnn_dir = cudnn_bin_dir()
+      ? fs::path(cudnn_bin_dir())
+      : relative_to_current_binary("../nvidia/cudnn/bin");
   // Must load cudnn_graph64_9.dll before locating symbols, otherwise We would
   // get errors like "Invalid handle. Cannot load symbol cudnnCreate".
-  for (const auto& dll : fs::directory_iterator(cudnn_bin_dir)) {
+  for (const auto& dll : fs::directory_iterator(cudnn_dir)) {
     if (dll.path().filename().string().starts_with("cudnn_graph") &&
         dll.path().extension() == ".dll") {
       ::LoadLibraryW(dll.path().c_str());
@@ -52,9 +48,9 @@ fs::path load_cudnn() {
   }
   // Internally cuDNN loads some libs dynamically, add to search dirs.
   load_nvrtc();
-  ::AddDllDirectory(cudnn_bin_dir.c_str());
-  ::AddDllDirectory(cublas_bin_dir().c_str());
-  return cudnn_bin_dir;
+  ::AddDllDirectory(cudnn_dir.c_str());
+  ::AddDllDirectory(cublas_dir().c_str());
+  return cudnn_dir;
 }
 
 // Called by system when failed to locate a lazy-loaded DLL.
@@ -63,18 +59,19 @@ FARPROC WINAPI delayload_helper(unsigned dliNotify, PDelayLoadInfo pdli) {
   if (dliNotify == dliNotePreLoadLibrary) {
     std::string dll = pdli->szDll;
     if (dll.starts_with("cudnn")) {
-      static auto cudnn_bin_dir = load_cudnn();
-      mod = ::LoadLibraryW((cudnn_bin_dir / dll).c_str());
+      static auto cudnn_dir = load_cudnn();
+      mod = ::LoadLibraryW((cudnn_dir / dll).c_str());
     } else if (dll.starts_with("cublas")) {
-      mod = ::LoadLibraryW((cublas_bin_dir() / dll).c_str());
+      mod = ::LoadLibraryW((cublas_dir() / dll).c_str());
     } else if (dll.starts_with("nvrtc")) {
-      static auto nvrtc_bin_dir = load_nvrtc();
-      mod = ::LoadLibraryW((nvrtc_bin_dir / dll).c_str());
+      static auto nvrtc_dir = load_nvrtc();
+      mod = ::LoadLibraryW((nvrtc_dir / dll).c_str());
     }
   }
   return reinterpret_cast<FARPROC>(mod);
 }
 
-} // namespace mlx::core
+} // namespace mlx::core::cu
 
-extern "C" const PfnDliHook __pfnDliNotifyHook2 = mlx::core::delayload_helper;
+extern "C" const PfnDliHook __pfnDliNotifyHook2 =
+    mlx::core::cu::delayload_helper;
diff --git a/mlx/backend/cuda/dirs.cpp b/mlx/backend/cuda/dirs.cpp
new file mode 100644
index 0000000000..a9d33b4790
--- /dev/null
+++ b/mlx/backend/cuda/dirs.cpp
@@ -0,0 +1,29 @@
+// Copyright © 2026 Apple Inc.
+
+namespace mlx::core::cu {
+
+const char* cccl_dir() { // CCCL include dir baked in at build time.
+#if defined(MLX_CCCL_DIR)
+  return MLX_CCCL_DIR;
+#else
+  return nullptr; // Macro not set: caller keeps its own lookup path.
+#endif
+}
+
+const char* cuda_bin_dir() { // CUDA DLL dir baked in at build time.
+#if defined(MLX_CUDA_BIN_DIR)
+  return MLX_CUDA_BIN_DIR;
+#else
+  return nullptr; // Macro not set: caller uses paths relative to the binary.
+#endif
+}
+
+const char* cudnn_bin_dir() { // cuDNN DLL dir baked in at build time.
+#if defined(MLX_CUDNN_BIN_DIR)
+  return MLX_CUDNN_BIN_DIR;
+#else
+  return nullptr; // Macro not set: caller uses paths relative to the binary.
+#endif
+}
+
+} // namespace mlx::core::cu
diff --git a/mlx/backend/cuda/jit_module.cpp b/mlx/backend/cuda/jit_module.cpp
index 3a493fd14e..3de1ddb018 100644
--- a/mlx/backend/cuda/jit_module.cpp
+++ b/mlx/backend/cuda/jit_module.cpp
@@ -16,6 +16,9 @@
 
 namespace mlx::core::cu {
 
+// Defined in dirs.cpp so build-specific paths don't invalidate compile cache.
+const char* cccl_dir();
+
 namespace {
 
 #define CHECK_NVRTC_ERROR(cmd) check_nvrtc_error(#cmd, (cmd))
@@ -61,11 +64,9 @@ const std::vector<std::string>& include_path_args() {
     }
     // Add path to CCCL headers.
     path = path / "cccl";
-#if defined(MLX_CCCL_DIR)
-    if (!std::filesystem::exists(path)) {
-      path = MLX_CCCL_DIR;
+    if (!std::filesystem::exists(path) && cccl_dir()) {
+      path = cccl_dir();
     }
-#endif
     if (std::filesystem::exists(path)) {
       args.push_back(fmt::format("--include-path={}", path.string()));
     }
diff --git a/setup.py b/setup.py
index b27abab0c8..3c2f138048 100644
--- a/setup.py
+++ b/setup.py
@@ -68,6 +68,14 @@ def __init__(self, name: str, sourcedir: str = "") -> None:
 
 
 class CMakeBuild(build_ext):
+    def finalize_options(self) -> None:
+        super().finalize_options()
+
+        # On Windows, setuptools adjusts build_temp to look more "native",
+        # which made our life harder; use its parent directory instead.
+        if platform.system() == "Windows":
+            self.build_temp = os.path.dirname(self.build_temp)
+
     def build_extension(self, ext: CMakeExtension) -> None:
         # Must be in this form due to bug in .resolve() only fixed in Python 3.10+
         ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name)  # type: ignore[no-untyped-call]
@@ -99,29 +107,31 @@ def build_extension(self, ext: CMakeExtension) -> None:
             "-DMLX_BUILD_EXAMPLES=OFF",
             "-DBUILD_SHARED_LIBS=ON",
         ]
+
+        # Add CMake arguments passed via the CMAKE_ARGS environment variable
+        # (needed e.g. to build for ARM macOS on conda-forge).
+        if "CMAKE_ARGS" in os.environ:
+            cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item]
+
+        # For release wheels, force building for all supported arches.
         if build_stage == 2 and build_cuda:
             # Last arch is always real and virtual for forward-compatibility
-            cuda_archs = ";".join(
-                (
-                    "75-real",
-                    "80-real",
+            cuda_archs = [
+                "75-real",
+                "80-real",
+                "120a-real",
+                "120-virtual",
+            ]
+            if platform.system() == "Linux":
+                cuda_archs += [
                     "90a-real",
                     "100a-real",
-                    "120a-real",
-                    "120-virtual",
-                )
-            )
-            cmake_args += [f"-DMLX_CUDA_ARCHITECTURES={cuda_archs}"]
+                    "121a-real",
+                ]
+            cmake_args += [f"-DMLX_CUDA_ARCHITECTURES={';'.join(cuda_archs)}"]
             # Search CUDA libs from python packages.
             cmake_args += ["-DMLX_LOAD_CUDA_LIBS_FROM_PYTHON=ON"]
 
-        # Some generators require explcitly passing config when building.
-        build_args = ["--config", cfg]
-        # Adding CMake arguments set as environment variable
-        # (needed e.g. to build for ARM OSx on conda-forge)
-        if "CMAKE_ARGS" in os.environ:
-            cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item]
-
         # Pass version to C++
         cmake_args += [f"-DMLX_VERSION={self.distribution.get_version()}"]  # type: ignore[attr-defined]
 
@@ -131,13 +141,17 @@ def build_extension(self, ext: CMakeExtension) -> None:
             if archs:
                 cmake_args += ["-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))]
 
+        # Some generators require explicitly passing config when building.
+        build_args = ["--config", cfg]
+
         # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
         # across all generators.
         if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
             build_args += [f"-j{os.cpu_count()}"]
 
         # Avoid cache miss when building from temporary dirs.
-        os.environ["CCACHE_BASEDIR"] = os.path.realpath(self.build_temp)
+        os.environ["CCACHE_BASEDIR"] = os.path.realpath(build_temp)
+        os.environ["CCACHE_IGNOREHEADERS"] = os.path.realpath(build_temp)
         os.environ["CCACHE_NOHASHDIR"] = "true"
 
         subprocess.run(
@@ -289,24 +303,29 @@ def get_tag(self) -> tuple[str, str, str]:
             toolkit = cuda_toolkit_major_version()
             name = f"mlx-cuda-{toolkit}"
             # Note: update following files when new dependency is added:
-            # * .github/actions/build-cuda-release/action.yml
+            # * .github/actions/build-wheel/action.yml
             # * mlx/backend/cuda/CMakeLists.txt
+            install_requires += [
+                f"nvidia-cudnn-cu{toolkit}==9.*",
+            ]
+            if platform.system() == "Linux":
+                install_requires += [
+                    f"nvidia-nccl-cu{toolkit}",
+                ]
             if toolkit == 12:
                 install_requires += [
                     "nvidia-cublas-cu12==12.9.*",
+                    "nvidia-cufft-cu12==11.4.*",
                     "nvidia-cuda-nvrtc-cu12==12.9.*",
                 ]
             elif toolkit == 13:
                 install_requires += [
                     "nvidia-cublas",
+                    "nvidia-cufft",
                     "nvidia-cuda-nvrtc",
                 ]
             else:
                 raise ValueError(f"Unknown toolkit {toolkit}")
-            install_requires += [
-                f"nvidia-cudnn-cu{toolkit}==9.*",
-                f"nvidia-nccl-cu{toolkit}",
-            ]
 
         else:
             name = "mlx-cpu"