From 9e67f971137f99622f9ee4b0de35109099e51ec0 Mon Sep 17 00:00:00 2001 From: Cheng Date: Tue, 23 Jun 2026 08:28:53 +0900 Subject: [PATCH] Add CI for Windows CUDA build --- .github/actions/build-cuda-release/action.yml | 31 --- .github/actions/build-docs/action.yml | 20 +- .../actions/build-linux-release/action.yml | 42 --- .github/actions/build-linux/action.yml | 38 --- .../actions/build-macos-release/action.yml | 63 ----- .github/actions/build-wheel/action.yml | 115 ++++++++ .github/actions/build-windows/action.yml | 26 -- .github/actions/build/action.yml | 44 +++ .github/actions/setup-linux/action.yml | 111 -------- .github/actions/setup-macos/action.yml | 32 --- .github/actions/setup-windows/action.yml | 42 --- .github/actions/setup/action.yml | 256 ++++++++++++++++++ .github/actions/test-linux/action.yml | 37 ++- .github/actions/test-wheel/action.yml | 50 ++++ .github/actions/test-windows/action.yml | 8 +- .github/workflows/build_and_test.yml | 76 +++--- .github/workflows/nightly.yml | 108 -------- .github/workflows/release.yml | 254 ++++++++++------- .gitignore | 2 +- CMakeLists.txt | 21 +- MANIFEST.in | 2 +- mlx/backend/cuda/CMakeLists.txt | 12 +- mlx/backend/cuda/delayload.cpp | 59 ++-- mlx/backend/cuda/dirs.cpp | 29 ++ mlx/backend/cuda/jit_module.cpp | 9 +- setup.py | 63 +++-- 26 files changed, 836 insertions(+), 714 deletions(-) delete mode 100644 .github/actions/build-cuda-release/action.yml delete mode 100644 .github/actions/build-linux-release/action.yml delete mode 100644 .github/actions/build-linux/action.yml delete mode 100644 .github/actions/build-macos-release/action.yml create mode 100644 .github/actions/build-wheel/action.yml delete mode 100644 .github/actions/build-windows/action.yml create mode 100644 .github/actions/build/action.yml delete mode 100644 .github/actions/setup-linux/action.yml delete mode 100644 .github/actions/setup-macos/action.yml delete mode 100644 .github/actions/setup-windows/action.yml create mode 100644 
.github/actions/setup/action.yml create mode 100644 .github/actions/test-wheel/action.yml delete mode 100644 .github/workflows/nightly.yml create mode 100644 mlx/backend/cuda/dirs.cpp diff --git a/.github/actions/build-cuda-release/action.yml b/.github/actions/build-cuda-release/action.yml deleted file mode 100644 index f78b0c10f8..0000000000 --- a/.github/actions/build-cuda-release/action.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: 'Build CUDA wheel' -description: 'Build CUDA wheel' - -inputs: - arch: - description: 'Platform architecture tag' - required: true - type: choice - options: - - x86_64 - - aarch64 - -runs: - using: "composite" - steps: - - name: Build package - shell: bash - env: - CMAKE_ARGS: -DMLX_BUILD_CUDA=ON - run: | - pip install auditwheel "build<=1.4.2" patchelf setuptools - python setup.py clean --all - MLX_BUILD_STAGE=2 python -m build -w - - auditwheel repair dist/mlx_cuda*.whl \ - --plat manylinux_2_35_${{ inputs.arch }} \ - --exclude libcublas* \ - --exclude libcuda* \ - --exclude libcudnn* \ - --exclude libnccl* \ - --exclude libnvrtc* diff --git a/.github/actions/build-docs/action.yml b/.github/actions/build-docs/action.yml index 411f6be8d9..4d4286e3c8 100644 --- a/.github/actions/build-docs/action.yml +++ b/.github/actions/build-docs/action.yml @@ -5,24 +5,28 @@ runs: using: "composite" steps: - name: Setup machine - uses: ./.github/actions/setup-linux + id: setup + uses: ./.github/actions/setup + with: + ccache-key: 'release' + ccache-save: false - name: Install dependencies shell: bash + env: + CMAKE_ARGS: ${{ steps.setup.outputs.cmake-args }} run: | sudo apt-get install -y doxygen - source .venv/bin/activate - pip install -r docs/requirements.txt - pip install . -v - + uv pip install -r docs/requirements.txt + uv pip install . 
-v + - name: Build documentation shell: bash run: | - source .venv/bin/activate cd docs doxygen make html O=-W - + - name: Create artifact tar shell: bash run: tar -cf artifact.tar -C docs --dereference build/html index.html @@ -30,7 +34,7 @@ runs: # Do it manually because upload-pages-artifact requires gtar - name: Upload artifact id: upload-artifact - uses: actions/upload-artifact@v5 + uses: actions/upload-artifact@v7 with: name: github-pages path: artifact.tar diff --git a/.github/actions/build-linux-release/action.yml b/.github/actions/build-linux-release/action.yml deleted file mode 100644 index 2e938d8592..0000000000 --- a/.github/actions/build-linux-release/action.yml +++ /dev/null @@ -1,42 +0,0 @@ -name: 'Build Linux wheel' -description: 'Build Linux wheel' - -inputs: - build-backend: - description: 'Build the backend mlx-cpu package' - type: boolean - required: false - default: false - arch: - description: 'Platform architecture tag' - required: true - type: choice - options: - - x86_64 - - aarch64 - -runs: - using: "composite" - steps: - - name: Build MLX - shell: bash - run: pip install -e . 
-v - - - name: Build Python package - shell: bash - run: | - pip install auditwheel patchelf "build<=1.4.2" - python setup.py clean --all - MLX_BUILD_STAGE=1 python -m build -w - auditwheel repair dist/mlx-*.whl \ - --plat manylinux_2_35_${{ inputs.arch }} \ - --exclude libmlx.so* \ - --only-plat - - - name: Build backend package - if: ${{ inputs.build-backend }} - shell: bash - run: | - python setup.py clean --all - MLX_BUILD_STAGE=2 python -m build -w - auditwheel repair dist/mlx_cpu*.whl --plat manylinux_2_35_${{ inputs.arch }} diff --git a/.github/actions/build-linux/action.yml b/.github/actions/build-linux/action.yml deleted file mode 100644 index b7a3b07c39..0000000000 --- a/.github/actions/build-linux/action.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: 'Build and Test on Linux' - -inputs: - toolkit: - description: 'The toolkit to build with' - required: false - default: 'cpu' - -runs: - using: "composite" - steps: - - - name: Install Python package - id: python_build - shell: sh - env: - DEBUG: 1 - CMAKE_ARGS: >- - -DCMAKE_COMPILE_WARNING_AS_ERROR=ON - -DMLX_BUILD_CUDA=${{ startsWith(inputs.toolkit, 'cuda') && 'ON' || 'OFF' }} - run: | - if ${{ startsWith(inputs.toolkit, 'cuda') && runner.arch == 'arm64' }} ; then - # There is no GPU in arm64 runner, use a common arch. - CMAKE_ARGS="$CMAKE_ARGS -DMLX_CUDA_ARCHITECTURES=80" - # Can not build tests and stubs when the built executables can not run. - CMAKE_ARGS="$CMAKE_ARGS -DMLX_BUILD_TESTS=OFF -DMLX_BUILD_PYTHON_STUBS=OFF" - fi - # Install cpu-only torch to save space - pip install torch --index-url https://download.pytorch.org/whl/cpu - pip install --no-build-isolation -e ".[dev]" -v - # Pass the CMAKE_ARGS to following steps. - echo CMAKE_ARGS="$CMAKE_ARGS" >> $GITHUB_OUTPUT - - - name: Build CPP only - shell: bash - run: | - cmake . 
-B build -DCMAKE_BUILD_TYPE=Debug ${{ steps.python_build.outputs.CMAKE_ARGS }} - cmake --build build -j $(nproc) diff --git a/.github/actions/build-macos-release/action.yml b/.github/actions/build-macos-release/action.yml deleted file mode 100644 index ec36c46f6f..0000000000 --- a/.github/actions/build-macos-release/action.yml +++ /dev/null @@ -1,63 +0,0 @@ -name: 'Build macOS release' -description: 'Build MLX releases macOS' - -inputs: - macos-target: - description: 'macOS build target' - required: false - default: '15.0' - build-backend: - description: 'Build the backend mlx-metal package' - required: false - default: 'false' - python-version: - description: 'Python version to use' - required: false - default: '3.10' - -runs: - using: "composite" - steps: - - name: Clear build environment - shell: bash - run: | - echo "::group::Clear build environment" - uv venv --clear - uv pip install build setuptools - python setup.py clean --all - source .venv/bin/activate - echo "::endgroup::" - - - name: Build Python package - shell: bash - env: - DEVELOPER_DIR: /Applications/Xcode-latest.app - MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }} - run: | - echo "::group::Build Python package" - MLX_BUILD_STAGE=1 python -m build -w - python setup.py clean --all - echo "::endgroup::" - - - name: Build backend package - if: ${{ inputs.build-backend == 'true' }} - shell: bash - env: - DEVELOPER_DIR: /Applications/Xcode-latest.app - MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }} - run: | - echo "::group::Build backend package" - MLX_BUILD_STAGE=2 python -m build -w - python setup.py clean --all - echo "::endgroup::" - - - name: Test local packages - if: ${{ inputs.build-backend == 'true' && startsWith(inputs.macos-target, '26.') }} - shell: bash - run: | - echo "::group::Test local packages" - uv pip install numpy torch - uv pip install dist/mlx_metal-*-macosx_26_0_arm64.whl - uv pip install dist/mlx-*-macosx_26_0_arm64.whl - python -m unittest discover -v python/tests - 
echo "::endgroup::" diff --git a/.github/actions/build-wheel/action.yml b/.github/actions/build-wheel/action.yml new file mode 100644 index 0000000000..96cca9eadc --- /dev/null +++ b/.github/actions/build-wheel/action.yml @@ -0,0 +1,115 @@ +name: 'Build wheel' +description: 'Build the Python wheels for release on all platforms' + +inputs: + cmake-args: + description: 'The args for generating CMake project' + required: true + build-frontend: + description: 'Build the frontend mlx package' + required: false + default: 'true' + build-backend: + description: 'Build the backend mlx-cpu/mlx-cuda/mlx-metal packages' + required: false + default: 'true' + macos-target: + description: 'The target macOS version to build for' + required: false + default: '26.2' + arch-tag: + description: 'Platform architecture tag' + required: false + default: |- + ${{ case(runner.arch == 'x64', 'x86_64', + runner.arch == 'x86', 'i686', + runner.arch == 'arm', 'armv7l', + runner.arch == 'arm64', 'aarch64', + 'unknown') + }} + +runs: + using: 'composite' + steps: + - name: Install dependencies + shell: bash + run: | + echo "::group::Install dependencies" + uv pip install 'build<=1.4.2' setuptools + if ${{ runner.os == 'Linux' }} ; then + uv pip install auditwheel patchelf + fi + mkdir -p wheelhouse + echo "::endgroup::" + + - name: Build frontend package + if: inputs.build-frontend == 'true' + shell: bash + env: + CMAKE_ARGS: ${{ inputs.cmake-args }} + MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }} + run: | + echo "::group::Build frontend package" + python setup.py clean --all + MLX_BUILD_STAGE=1 python -m build -w + echo "::endgroup::" + + - name: Post-process frontend package + if: inputs.build-frontend == 'true' + shell: bash + run: | + echo "::group::Post-process frontend package" + if ${{ runner.os == 'Linux' }} ; then + auditwheel repair dist/mlx-*.whl \ + --plat manylinux_2_35_${{ inputs.arch-tag }} \ + --exclude libmlx.so* \ + --only-plat + else + mv dist/mlx-*.whl wheelhouse/ + 
fi + echo "::endgroup::" + + - name: Build backend package + if: inputs.build-backend == 'true' + shell: bash + env: + CMAKE_ARGS: ${{ inputs.cmake-args }} + MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }} + run: | + echo "::group::Build backend package" + python setup.py clean --all + MLX_BUILD_STAGE=2 python -m build -w + echo "::endgroup::" + + - name: Post-process backend package + if: inputs.build-backend == 'true' + shell: bash + run: | + echo "::group::Post-process backend package" + if ${{ runner.os == 'Linux' }} ; then + if [ -f dist/mlx_cpu*.whl ]; then + auditwheel repair dist/mlx_cpu*.whl \ + --plat manylinux_2_35_${{ inputs.arch-tag }} + fi + if [ -f dist/mlx_cuda*.whl ]; then + auditwheel repair dist/mlx_cuda*.whl \ + --plat manylinux_2_35_${{ inputs.arch-tag }} \ + --exclude libcublas* \ + --exclude libcuda* \ + --exclude libcudnn* \ + --exclude libcufft* \ + --exclude libnccl* \ + --exclude libnvrtc* + fi + else + if [ -f dist/mlx_cpu*.whl ]; then + mv dist/mlx_cpu*.whl wheelhouse/ + fi + if [ -f dist/mlx_cuda*.whl ]; then + mv dist/mlx_cuda*.whl wheelhouse/ + fi + if [ -f dist/mlx_metal*.whl ]; then + mv dist/mlx_metal*.whl wheelhouse/ + fi + fi + echo "::endgroup::" diff --git a/.github/actions/build-windows/action.yml b/.github/actions/build-windows/action.yml deleted file mode 100644 index 372f848156..0000000000 --- a/.github/actions/build-windows/action.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: 'Build on Windows' - -runs: - using: 'composite' - steps: - - name: Install Python package - id: python-build - shell: cmd - env: - # For MSVC, Ninja/Release is the only config supported by ccache. - CMAKE_ARGS: >- - -G Ninja - -DCMAKE_BUILD_TYPE=Release - -DCMAKE_C_COMPILER=cl - -DCMAKE_CXX_COMPILER=cl - -DCMAKE_RC_COMPILER=rc - run: | - uv pip install ".[dev]" -v - :: Pass the CMAKE_ARGS to following steps. - >>%GITHUB_OUTPUT% ECHO CMAKE_ARGS=%CMAKE_ARGS% - - - name: Build CPP only - shell: cmd - run: | - cmake . 
-B build ${{ steps.python-build.outputs.CMAKE_ARGS }} - cmake --build build -j %NUMBER_OF_PROCESSORS% diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml new file mode 100644 index 0000000000..95c6f5ff70 --- /dev/null +++ b/.github/actions/build/action.yml @@ -0,0 +1,44 @@ +name: 'Build' +description: 'Build C++ and Python binaries for testing on Linux and Windows' + +inputs: + cmake-args: + description: 'The args for generating CMake project' + required: true + debug: + description: 'Do debug build' + required: true + +runs: + using: 'composite' + steps: + - name: Install Python package + shell: bash + env: + DEBUG: ${{ inputs.debug == 'true' && 1 || 0 }} + CMAKE_ARGS: ${{ inputs.cmake-args }} + run: | + echo "::group::Install Python package" + # Install cpu-only torch to save space + uv pip install torch --index-url https://download.pytorch.org/whl/cpu + uv pip install --no-build-isolation -e ".[dev]" -v + echo "::endgroup::" + + - name: Build CPP only + shell: bash + env: + # The cpp build is using some extra settings to reuse the compile cache + # generated by the python install: + # 1. Use the same ccache options with setup.py. + # 2. Build dynamic library. + # 3. Put the build dir in the same depth with python build dir. + CCACHE_BASEDIR: ${{ github.workspace }}/build/cpp/mlx + CCACHE_NOHASHDIR: true + run: | + echo "::group::Build CPP only" + cmake . 
-B build/cpp/mlx ${{ inputs.cmake-args }} \ + -DBUILD_SHARED_LIBS=ON \ + -DCMAKE_BUILD_TYPE=${{ inputs.debug == 'true' && 'Debug' || 'Release' }} + cmake --build build/cpp/mlx \ + -j ${{ runner.os == 'Windows' && '$NUMBER_OF_PROCESSORS' || '$(nproc)' }} + echo "::endgroup::" diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml deleted file mode 100644 index 419abfb163..0000000000 --- a/.github/actions/setup-linux/action.yml +++ /dev/null @@ -1,111 +0,0 @@ -name: 'Setup Linux Environment' -description: 'Install dependencies for Linux builds' - -inputs: - toolkit: - description: 'Which toolkit to install' - required: false - default: 'cpu' - python-version: - description: 'Version of python to set up' - required: false - default: '3.14' - use-ccache: - description: 'Whether to enable ccache' - required: false - default: 'true' - ccache-key: - required: false - default: 'ccache' - -runs: - using: "composite" - steps: - - name: Install common dependencies - shell: bash - run: | - echo "::group::Install common dependencies" - sudo apt-get update - sudo apt-get install -y --no-install-recommends \ - gdb zip \ - libblas-dev liblapack-dev liblapacke-dev \ - openmpi-bin openmpi-common libopenmpi-dev - echo "::endgroup::" - - - name: Use ccache - if: ${{ inputs.use-ccache == 'true' }} - uses: hendrikmuhs/ccache-action@v1.2 - with: - key: ${{ inputs.ccache-key }}-${{ runner.os }}-${{ runner.arch }}-${{ inputs.toolkit }} - max-size: 1GB - # ccache-action bug: running "apt-get update" fails on large arm runner. 
- update-package-index: false - - - name: Cache JIT-compiled CUDA kernels - if: ${{ startsWith(inputs.toolkit, 'cuda') }} - uses: actions/cache@v5 - with: - path: /tmp/mlx-ptx-cache - key: >- - ptx-${{ runner.os }}-${{ runner.arch }}-${{ inputs.toolkit }}- - ${{ hashFiles('mlx/backend/cuda/**') }} - - - uses: actions/setup-python@v6 - with: - python-version: ${{ inputs.python-version }} - - - name: Setup Python venv - shell: bash - run: | - echo "::group::Setup Python venv" - python -m venv .venv - source .venv/bin/activate - pip install setuptools cmake typing_extensions - echo PATH=$PATH >> $GITHUB_ENV - # Search python packages in .venv - echo PYTHONPATH=`python -c 'import sys; print(sys.path[-1])'` >> $GITHUB_ENV - echo "::endgroup::" - - - name: Set swap space - if: ${{ startsWith(inputs.toolkit, 'cuda') }} - uses: pierotofy/set-swap-space@fc79b3f67fa8a838184ce84a674ca12238d2c761 - with: - swap-size-gb: 16 - - - name: Install CUDA toolkit - if: ${{ startsWith(inputs.toolkit, 'cuda') }} - shell: bash - env: - # Note: the CI machine does not meet CUDA 13's driver requirement. - # Compatibility matrix: - # https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html - PACKAGES: | - { - "cuda-12.6": "libcudnn9-dev-cuda-12 cuda-compiler-12-6 cuda-libraries-dev-12-6", - "cuda-12.9": "libcudnn9-dev-cuda-12 cuda-compiler-12-9 cuda-libraries-dev-12-9", - "cuda-13.0": "libcudnn9-dev-cuda-13 cuda-compiler-13-0 cuda-libraries-dev-13-0" - } - run: | - echo "::group::Install CUDA toolkit" - # The CUDA binaries are hosted in the "sbsa" repo, the "arm64" repo is - # Jetson specific. SBSA means Arm Server Base System Architecture. 
- ARCH=${{ runner.arch == 'arm64' && 'sbsa' || 'x86_64' }} - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$ARCH/cuda-keyring_1.1-1_all.deb - sudo dpkg -i cuda-keyring_1.1-1_all.deb - sudo apt-get update - sudo apt-get install -y --no-install-recommends \ - libnccl2 libnccl-dev \ - ${{ fromJson(env.PACKAGES)[inputs.toolkit] }} - echo "/usr/local/${{ inputs.toolkit }}/bin" >> $GITHUB_PATH - echo "::endgroup::" - - - name: CUDA packages and driver report - if: ${{ startsWith(inputs.toolkit, 'cuda') }} - shell: bash - run: | - echo "::group::Installed NVIDIA and CUDA packages" - dpkg -l | egrep "cuda|nvidia" -i - echo "::endgroup::" - echo "::group::NVIDIA-SMI Status" - nvidia-smi || true - echo "::endgroup::" diff --git a/.github/actions/setup-macos/action.yml b/.github/actions/setup-macos/action.yml deleted file mode 100644 index 97655daa3a..0000000000 --- a/.github/actions/setup-macos/action.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: 'Setup macOS Environment' -description: 'Install dependencies for macOS builds' - -inputs: - python-version: - description: 'Python version to use' - required: false - default: '3.10' - -runs: - using: "composite" - steps: - - name: Install Homebrew packages - shell: sh - run: /opt/homebrew/bin/brew install openmpi - - - name: Verify MetalToolchain installed - shell: bash - run: xcodebuild -showComponent MetalToolchain - - - uses: astral-sh/setup-uv@v7 - - - name: Setup Python venv - shell: bash - run: | - echo "::group::Setup Python venv" - uv venv --python ${{ inputs.python-version }} --managed-python - source .venv/bin/activate - echo PATH=$PATH >> $GITHUB_ENV - # Search python packages in .venv - echo PYTHONPATH=`python -c 'import sys; print(sys.path[-1])'` >> $GITHUB_ENV - echo "::endgroup::" diff --git a/.github/actions/setup-windows/action.yml b/.github/actions/setup-windows/action.yml deleted file mode 100644 index 83afd4de12..0000000000 --- a/.github/actions/setup-windows/action.yml +++ /dev/null @@ 
-1,42 +0,0 @@ -name: 'Setup Windows environment' - -inputs: - python-version: - description: 'Version of python to set up' - required: false - default: '3.14' - use-ccache: - description: 'Whether to enable ccache' - required: false - default: 'true' - -runs: - using: 'composite' - steps: - - name: Use ccache - if: ${{ inputs.use-ccache == 'true' }} - uses: hendrikmuhs/ccache-action@v1.2 - with: - key: ccache-${{ runner.os }}-${{ runner.arch }}-cpu - max-size: 1GB - - - name: Setup Visual Studio cmd - shell: cmd - run: | - :: Find out path to VS. - pushd "C:\Program Files (x86)\Microsoft Visual Studio\Installer\" - for /f "delims=" %%x in ('.\vswhere.exe -latest -property InstallationPath') do set VSPATH=%%x - popd - :: Import VS vars. - call "%VSPATH%\VC\Auxiliary\Build\vcvarsall.bat" x64 - :: Export to all steps. - >>%GITHUB_ENV% set - - - uses: astral-sh/setup-uv@v7 - - - name: Setup Python venv - shell: cmd - run: | - uv venv --python ${{ inputs.python-version }} - call ".venv/Scripts/activate.bat" - >>%GITHUB_ENV% set diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml new file mode 100644 index 0000000000..731968d4d4 --- /dev/null +++ b/.github/actions/setup/action.yml @@ -0,0 +1,256 @@ +name: 'Setup environment' +description: 'Install dependencies for Linux and Windows builds' + +inputs: + toolkit: + description: 'Which toolkit to install' + required: false + default: 'cpu' + python-version: + description: 'Version of python to set up' + required: false + default: '3.12' + use-ccache: + description: 'Whether to enable ccache' + required: false + default: 'true' + ccache-key: + description: 'Extra key to use in the key of ccache' + required: false + default: 'test' + ccache-save: + description: 'Whether the ccache in this workflow will be saved' + required: false + default: 'auto' + +outputs: + cmake-args: + description: 'The args for generating CMake project' + value: ${{ steps.cmake-args.outputs.cmakeArgs }} + +runs: + using: 
'composite' + steps: + - name: Install Linux dependencies + if: runner.os == 'Linux' + shell: bash + run: | + echo "::group::Install common dependencies" + sudo apt-get update + sudo apt-get install -y --no-install-recommends \ + gdb g++ ninja-build zip \ + libblas-dev liblapack-dev liblapacke-dev \ + openmpi-bin openmpi-common libopenmpi-dev + echo "::endgroup::" + + - name: Install macOS dependencies + if: runner.os == 'macOS' + shell: bash + run: | + echo "::group::Install macOS dependencies" + brew install openmpi + xcodebuild -showComponent MetalToolchain + echo "::endgroup::" + + - name: Setup Windows environment + if: runner.os == 'Windows' + shell: cmd + run: | + echo "::group::Setup environment" + :: Find out path to Visual Studio. + pushd "C:\Program Files (x86)\Microsoft Visual Studio\Installer\" + for /f "delims=" %%x in ('.\vswhere.exe -latest -property InstallationPath') do set VSPATH=%%x + popd + :: Import Visual Studio vars. + call "%VSPATH%\VC\Auxiliary\Build\vcvarsall.bat" ${{ runner.arch }} + :: Avoid using default TMP which uses short path and causes mismatch of CCACHE_BASEDIR. + set TMP=%RUNNER_TEMP% + set TEMP=%RUNNER_TEMP% + set UV_CACHE_DIR=%RUNNER_TEMP% + :: The cuda headers are downloaded. + set CCACHE_COMPILERCHECK=content + set CCACHE_SLOPPINESS=include_file_ctime,include_file_mtime + :: Export to all steps. + >>%GITHUB_ENV% set + echo "::endgroup::" + + - uses: astral-sh/setup-uv@v8.2.0 + with: + enable-cache: false + quiet: true + + - name: Use ccache + if: inputs.use-ccache == 'true' + uses: hendrikmuhs/ccache-action@v1.2.23 + with: + key: v7-${{ inputs.ccache-key }}-${{ runner.os }}-${{ runner.arch }}-${{ inputs.toolkit }} + max-size: ${{ inputs.toolkit == 'cpu' && '200MB' || '600MB' }} + save: ${{ !startsWith(github.ref, 'refs/pull/') && (inputs.ccache-save != 'false') }} + # ccache-action bug: running "apt-get update" fails on large arm runner. 
+ update-package-index: false + + - name: Cache JIT-compiled CUDA kernels + if: runner.os == 'Linux' && startsWith(inputs.toolkit, 'cuda') + uses: actions/cache@v5 + with: + path: /tmp/mlx-ptx-cache + key: >- + ptx-${{ runner.os }}-${{ runner.arch }}-${{ inputs.toolkit }}- + ${{ hashFiles('mlx/backend/cuda/**') }} + + - name: Setup Python venv + if: runner.os != 'Windows' + shell: bash + run: | + echo "::group::Setup Python venv" + uv venv --python ${{ inputs.python-version }} --managed-python + # Make sure all builds use the same cmake binary. + uv pip install cmake + source .venv/bin/activate + echo PATH=$PATH >> $GITHUB_ENV + # Search python packages in .venv + echo PYTHONPATH=`python -c 'import sys; print(sys.path[-1])'` >> $GITHUB_ENV + echo "::endgroup::" + + - name: Setup Python venv (Windows) + if: runner.os == 'Windows' + shell: cmd + run: | + echo "::group::Setup Python venv" + uv venv --python ${{ inputs.python-version }}${{ runner.arch == 'arm64' && '-arm64' || ''}} || exit /b + uv pip install cmake + call ".venv/Scripts/activate.bat" + >>%GITHUB_ENV% set + echo "::endgroup::" + + - name: Install CUDA toolkit (Linux) + if: runner.os == 'Linux' && startsWith(inputs.toolkit, 'cuda') + shell: bash + env: + PACKAGES: | + { + "cuda-12.6": "libcudnn9-dev-cuda-12 cuda-compiler-12-6 cuda-libraries-dev-12-6", + "cuda-12.9": "libcudnn9-dev-cuda-12 cuda-compiler-12-9 cuda-libraries-dev-12-9", + "cuda-13.0": "libcudnn9-dev-cuda-13 cuda-compiler-13-0 cuda-libraries-dev-13-0" + } + run: | + echo "::group::Install CUDA toolkit" + # The CUDA binaries are hosted in the "sbsa" repo, the "arm64" repo is + # Jetson specific. SBSA means Arm Server Base System Architecture. 
+ ARCH=${{ runner.arch == 'arm64' && 'sbsa' || 'x86_64' }} + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$ARCH/cuda-keyring_1.1-1_all.deb + sudo dpkg -i cuda-keyring_1.1-1_all.deb + sudo apt-get update + sudo apt-get install -y --no-install-recommends \ + libnccl2 libnccl-dev \ + ${{ fromJson(env.PACKAGES)[inputs.toolkit] }} + echo "/usr/local/${{ inputs.toolkit }}/bin" >> $GITHUB_PATH + echo "::endgroup::" + + - name: Install CUDA Toolkit (Windows) + if: runner.os == 'Windows' && startsWith(inputs.toolkit, 'cuda') + shell: powershell + env: + INSTALLERS: | + { + "cuda-12.6": "https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_561.17_windows.exe", + "cuda-12.9": "https://developer.download.nvidia.com/compute/cuda/12.9.1/local_installers/cuda_12.9.1_576.57_windows.exe", + "cuda-13.0": "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_windows.exe" + } + PACKAGES: | + { + "cuda-12.6": ["cudart_12.6", "nvcc_12.6", "cublas_12.6", "cublas_dev_12.6", "cufft_12.6", "cufft_dev_12.6", "nvrtc_12.6", "nvrtc_dev_12.6"], + "cuda-12.9": ["cudart_12.9", "nvcc_12.9", "cublas_12.9", "cublas_dev_12.9", "cufft_12.9", "cufft_dev_12.9", "nvrtc_12.9", "nvrtc_dev_12.9"], + "cuda-13.0": ["cudart_13.0", "nvcc_13.0", "cublas_13.0", "cublas_dev_13.0", "cufft_13.0", "cufft_dev_13.0", "nvrtc_13.0", "nvrtc_dev_13.0", "crt_13.0", "nvvm_13.0", "nvptxcompiler_13.0"] + } + run: | + echo "::group::Install CUDA toolkit" + $ErrorActionPreference = "Stop" + $cudaUrl = "${{ fromJson(env.INSTALLERS)[inputs.toolkit] }}" + $cudaInstaller = "./install.exe" + + echo "Downloading '$cudaUrl'..." + $webClient = New-Object System.Net.WebClient + $webClient.DownloadFile($cudaUrl, $cudaInstaller) + echo "Downloaded: $cudaInstaller ($(([math]::Round((Get-Item $cudaInstaller).Length / 1MB, 2))) MB)" + + $args = "-s ${{ join(fromJson(env.PACKAGES)[inputs.toolkit], ' ') }}" + echo "Running '$cudaInstaller $args'..." 
+ Start-Process -FilePath $cudaInstaller -ArgumentList "$args" -NoNewWindow -Wait + $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path + echo "$cudaPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "::endgroup::" + + - name: Install cuDNN (Windows) + if: runner.os == 'Windows' && startsWith(inputs.toolkit, 'cuda') + id: cudnn + shell: powershell + env: + ARCHIVES: | + { + "cuda-12.6": "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.23.2.1_cuda12-archive.zip", + "cuda-12.9": "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.23.2.1_cuda12-archive.zip", + "cuda-13.0": "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.23.2.1_cuda13-archive.zip" + } + run: | + echo "::group::Install cuDNN" + $ErrorActionPreference = "Stop" + $cudnnUrl = "${{ fromJson(env.ARCHIVES)[inputs.toolkit] }}" + $cudnnZip = "cudnn.zip" + + echo "Downloading '$cudnnUrl'..." + $webClient = New-Object System.Net.WebClient + $webClient.DownloadFile($cudnnUrl, $cudnnZip) + echo "Downloaded: $cudnnZip ($(([math]::Round((Get-Item $cudnnZip).Length / 1MB, 2))) MB)" + + echo "Extracting..." 
+ Expand-Archive -Path $cudnnZip -DestinationPath cudnn-extracted + $cudnnDir = (Get-ChildItem -Path cudnn-extracted -Directory)[0].FullName + echo "cudnnDir=$($cudnnDir -replace '\\', '/')" | Out-File -FilePath $env:GITHUB_OUTPUT -Encoding utf8 -Append + echo "::endgroup::" + + - name: Generate CMake args + id: cmake-args + shell: bash + run: | + echo "::group::Generate CMake args" + cmakeArgs=( + "-G Ninja" + ) + if ${{ runner.os == 'Windows' }} ; then + cmakeArgs+=( + "-DCMAKE_C_COMPILER=cl" + "-DCMAKE_CXX_COMPILER=cl" + "-DCMAKE_RC_COMPILER=rc" + ) + else + cmakeArgs+=( + "-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" + ) + fi + if ${{ startsWith(inputs.toolkit, 'cuda') }} ; then + cmakeArgs+=("-DMLX_BUILD_CUDA=ON") + # Some machines have no GPU. + if ! __nvcc_device_query ; then + # Use a fallback arch for testing. + cmakeArgs+=("-DMLX_CUDA_ARCHITECTURES=80") + # Can not build tests and stubs when the built executables can not run. + cmakeArgs+=("-DMLX_BUILD_TESTS=OFF") + cmakeArgs+=("-DMLX_BUILD_PYTHON_STUBS=OFF") + fi + echo + # Set cuDNN paths. + if ${{ runner.os == 'Windows' }} ; then + cmakeArgs+=( + "-DCUDNN_INCLUDE_PATH=${{ steps.cudnn.outputs.cudnnDir }}/include" + "-DCUDNN_LIBRARY_PATH=${{ steps.cudnn.outputs.cudnnDir }}/lib/x64" + ) + fi + else + cmakeArgs+=("-DMLX_BUILD_CUDA=OFF") + fi + # Pass to following steps. 
+ IFS=" " + echo ${cmakeArgs[*]} + echo "cmakeArgs=${cmakeArgs[*]}" >> $GITHUB_OUTPUT + echo "::endgroup::" diff --git a/.github/actions/test-linux/action.yml b/.github/actions/test-linux/action.yml index 24cda6c103..33ca15268f 100644 --- a/.github/actions/test-linux/action.yml +++ b/.github/actions/test-linux/action.yml @@ -1,15 +1,24 @@ -name: 'Run Linux tests' - -inputs: - has-gpu: - description: 'Run GPU tests' - required: false - default: false +name: 'Run tests' +description: 'Run Python and C++ tests on Linux' runs: - using: "composite" + using: 'composite' steps: + - name: Check GPU support + id: gpu-check + shell: bash + run: | + echo "::group::Check GPU support" + if __nvcc_device_query ; then + echo "good=true" >> $GITHUB_OUTPUT + else + echo "good=false" >> $GITHUB_OUTPUT + fi + echo + echo "::endgroup::" + - name: Run MPI tests + if: steps.gpu-check.outputs.good == 'false' shell: bash run: | echo "::group::MPI tests" @@ -17,7 +26,7 @@ runs: echo "::endgroup::" - name: Run distributed tests - if: ${{ inputs.has-gpu == 'false' }} + if: steps.gpu-check.outputs.good == 'false' shell: bash run: | echo "::group::Distributed tests" @@ -30,7 +39,7 @@ runs: echo "::endgroup::" - name: Run Python tests - CPU - if: ${{ inputs.has-gpu == 'false' }} + if: steps.gpu-check.outputs.good == 'false' shell: bash env: DEVICE: cpu @@ -40,7 +49,7 @@ runs: echo "::endgroup::" - name: Run Python tests - GPU - if: ${{ inputs.has-gpu == 'true' }} + if: steps.gpu-check.outputs.good == 'true' shell: bash env: DEVICE: gpu @@ -56,17 +65,17 @@ runs: DEVICE: cpu run: | echo "::group::CPP tests - CPU" - ./build/tests/tests + ./build/cpp/mlx/tests/tests echo "::endgroup::" - name: Run CPP tests - GPU - if: ${{ inputs.has-gpu == 'true' }} + if: steps.gpu-check.outputs.good == 'true' shell: bash env: DEVICE: gpu run: | echo "::group::CPP tests - GPU" - ./build/tests/tests -sfe="*linalg_tests.cpp" + ./build/cpp/mlx/tests/tests -sfe="*linalg_tests.cpp" echo "::endgroup::" - name: Show 
stack trace on crash diff --git a/.github/actions/test-wheel/action.yml b/.github/actions/test-wheel/action.yml new file mode 100644 index 0000000000..6e282d2691 --- /dev/null +++ b/.github/actions/test-wheel/action.yml @@ -0,0 +1,50 @@ +name: 'Test wheel' +description: 'Run tests with the built wheels' + +inputs: + toolkit: + description: 'Which toolkit to test' + required: false + default: 'cpu' + +runs: + using: 'composite' + steps: + - name: Get Python version + id: python + shell: bash + run: python -c "import sys; print(f'version={sys.version_info.major}.{sys.version_info.minor}')" >> $GITHUB_OUTPUT + + - name: Download frontend packages + uses: actions/download-artifact@v8 + with: + pattern: frontend-${{ runner.os }}-${{ runner.arch }}-py${{ steps.python.outputs.version }} + path: wheelhouse + + - name: Download backend packages + uses: actions/download-artifact@v8 + with: + pattern: backend-${{ inputs.toolkit}}-${{ runner.os }}-${{ runner.arch }} + path: wheelhouse + + - name: Test local packages + shell: bash + run: | + echo "::group::Test local packages" + uv pip install torch --index-url https://download.pytorch.org/whl/cpu + uv pip install numpy + if ${{ inputs.toolkit == 'cpu' }} ; then + uv pip install wheelhouse/mlx_cpu*.whl + uv pip install wheelhouse/mlx-*.whl + elif ${{ startsWith(inputs.toolkit, 'cuda') }} ; then + uv pip install wheelhouse/mlx_cuda*.whl + uv pip install wheelhouse/mlx-*.whl + elif ${{ inputs.toolkit == 'metal' }} ; then + uv pip install wheelhouse/mlx_metal-*-macosx_26_0_arm64.whl + uv pip install wheelhouse/mlx-*-macosx_26_0_arm64.whl + else + echo "No matching backend wheel to install" + exit 1 + fi + python -m unittest discover -v python/tests + echo "::endgroup::" diff --git a/.github/actions/test-windows/action.yml b/.github/actions/test-windows/action.yml index c2714df5a8..ac812d8af9 100644 --- a/.github/actions/test-windows/action.yml +++ b/.github/actions/test-windows/action.yml @@ -1,10 +1,13 @@ -name: 'Run tests on 
Windows' +name: 'Run tests' +description: 'Run Python and C++ tests on Windows' runs: using: 'composite' steps: - name: Run Python tests - CPU shell: bash + env: + DEVICE: cpu run: | echo "::group::Python tests - CPU" python -m unittest discover python/tests -v @@ -16,6 +19,5 @@ runs: DEVICE: cpu run: | echo "::group::CPP tests - CPU" - ./build/tests.exe -tce="*gguf*" - ./build/test_teardown.exe + ./build/cpp/mlx/tests.exe -tce="*gguf*" echo "::endgroup::" diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 69777f3cd4..faec6672a4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -23,43 +23,54 @@ jobs: - uses: actions/checkout@v7 - uses: pre-commit/action@v3.0.1 - linux_build_and_test: - name: Linux (cpu, ${{ matrix.arch }}) - needs: check_lint - strategy: - fail-fast: false - matrix: - arch: ['x86_64', 'aarch64'] - runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }} - steps: - - uses: actions/checkout@v7 - - uses: ./.github/actions/setup-linux - - uses: ./.github/actions/build-linux - - uses: ./.github/actions/test-linux - - run: df -h - - cuda_build_and_test: - name: Linux (${{ matrix.toolkit }}, ${{ matrix.arch }}) + build_and_test: + name: ${{ matrix.os }} (${{ matrix.toolkit }}, ${{ matrix.arch }}) if: github.repository == 'ml-explore/mlx' needs: check_lint strategy: fail-fast: false matrix: + os: ['Linux', 'Windows'] arch: ['x86_64', 'aarch64'] - toolkit: ['cuda-12.6', 'cuda-12.9'] - runs-on: ${{ matrix.arch == 'x86_64' && 'gpu-t4-4-core' || 'ubuntu-22.04-arm' }} + toolkit: ['cpu', 'cuda-12.6', 'cuda-12.9', 'cuda-13.0'] + exclude: + # CUDA does not support Windows on arm. + - os: 'Windows' + arch: 'aarch64' + toolkit: 'cuda-12.6' + - os: 'Windows' + arch: 'aarch64' + toolkit: 'cuda-12.9' + - os: 'Windows' + arch: 'aarch64' + toolkit: 'cuda-13.0' + # CUDA 12.6 does not compile with CUTLASS on Windows. 
+ - os: 'Windows' + arch: 'x86_64' + toolkit: 'cuda-12.6' + runs-on: |- + ${{ case(matrix.os == 'Windows', + case(matrix.arch == 'aarch64', 'windows-11-arm', + 'windows-2022'), + case(matrix.arch == 'x86_64' && startsWith(matrix.toolkit, 'cuda'), 'gpu-t4-4-core', + matrix.arch == 'aarch64', 'ubuntu-22.04-arm', + 'ubuntu-22.04')) + }} steps: - uses: actions/checkout@v7 - - uses: ./.github/actions/setup-linux + - uses: ./.github/actions/setup + id: setup with: toolkit: ${{ matrix.toolkit }} - - uses: ./.github/actions/build-linux + - uses: ./.github/actions/build with: - toolkit: ${{ matrix.toolkit }} + cmake-args: ${{ steps.setup.outputs.cmake-args }} + # For MSVC, Ninja/Release is the only config supported by ccache. + debug: ${{ matrix.os != 'Windows' }} - uses: ./.github/actions/test-linux - if: matrix.arch == 'x86_64' - with: - has-gpu: true + if: matrix.os == 'Linux' && (matrix.toolkit == 'cpu' || matrix.arch == 'x86_64') + - uses: ./.github/actions/test-windows + if: matrix.os == 'Windows' && matrix.toolkit == 'cpu' mac_build_and_test: name: macOS (${{ matrix.macos-target }}) @@ -73,19 +84,12 @@ jobs: needs: check_lint steps: - uses: actions/checkout@v7 - - uses: ./.github/actions/setup-macos + - uses: ./.github/actions/setup + with: + toolkit: 'metal' + ccache-key: 'test-${{ matrix.macos-target }}' - uses: ./.github/actions/build-macos - windows_build_and_test: - name: Windows (cpu, x86_64) - needs: check_lint - runs-on: windows-2025 - steps: - - uses: actions/checkout@v7 - - uses: ./.github/actions/setup-windows - - uses: ./.github/actions/build-windows - - uses: ./.github/actions/test-windows - build_documentation: name: Build Documentation if: github.repository == 'ml-explore/mlx' diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml deleted file mode 100644 index 9e750ff47d..0000000000 --- a/.github/workflows/nightly.yml +++ /dev/null @@ -1,108 +0,0 @@ -name: Nightly Build - -on: - schedule: - - cron: 33 6 * * 1-5 - workflow_dispatch: 
- -permissions: - contents: read - -jobs: - build_linux_release: - strategy: - fail-fast: false - matrix: - python_version: ["3.10", "3.14"] - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v7 - - uses: ./.github/actions/setup-linux - - uses: ./.github/actions/build-linux-release - with: - build-backend: ${{ matrix.python-version == '3.10' }} - arch: "x86_64" - - name: Upload mlx artifacts - uses: actions/upload-artifact@v7 - with: - name: linux-wheels-${{ matrix.python_version }} - path: wheelhouse/mlx-*.whl - retention-days: 7 - - name: Upload mlx-cpu artifacts - if: matrix.python_version == '3.10' - uses: actions/upload-artifact@v7 - with: - name: mlx-cpu - path: wheelhouse/mlx_cpu-*.whl - retention-days: 7 - - run: df -h - - build_linux_with_tests: - strategy: - fail-fast: false - matrix: - python_version: ["3.11", "3.12", "3.13", "3.14"] - runner: - - ubuntu-22.04 - - ubuntu-22.04-arm - runs-on: ${{ matrix.runner }} - steps: - - uses: actions/checkout@v7 - - uses: ./.github/actions/setup-linux - with: - python-version: ${{ matrix.python_version }} - - uses: ./.github/actions/build-linux - - uses: ./.github/actions/test-linux - - run: df -h - - build_mac_release: - if: github.repository == 'ml-explore/mlx' - strategy: - matrix: - python-version: ["3.10", "3.13"] - runs-on: [self-hosted, macos] - steps: - - uses: actions/checkout@v7 - - uses: ./.github/actions/setup-macos - with: - python-version: ${{ matrix.python-version }} - - uses: ./.github/actions/build-macos - - name: Build macOS 14 package - uses: ./.github/actions/build-macos-release - with: - macos-target: '14.0' - build-backend: ${{ matrix.python-version == '3.10' }} - - name: Build macOS 15 package - uses: ./.github/actions/build-macos-release - with: - macos-target: '15.0' - build-backend: ${{ matrix.python-version == '3.10' }} - - name: Build macOS 26 package - uses: ./.github/actions/build-macos-release - with: - macos-target: '26.2' - build-backend: ${{ matrix.python-version == '3.10' 
}} - - build_cuda_release: - if: github.repository == 'ml-explore/mlx' - strategy: - matrix: - arch: ['x86_64', 'aarch64'] - toolkit: ['cuda-12.9', 'cuda-13.0'] - runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22-large' || 'ubuntu-22-large-arm' }} - steps: - - uses: actions/checkout@v7 - - uses: ./.github/actions/setup-linux - with: - toolkit: ${{ matrix.toolkit }} - ccache-key: 'ccache-release' - - name: Build Python package - uses: ./.github/actions/build-cuda-release - with: - arch: ${{ matrix.arch }} - - name: Upload artifacts - uses: actions/upload-artifact@v7 - with: - name: mlx-${{ matrix.toolkit }}-${{ matrix.arch }} - path: wheelhouse/mlx_cuda_*.whl - retention-days: 7 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ab31ac53d2..d5c8a78b8f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,27 +1,35 @@ -name: PyPI Release +name: 'Release build' +description: 'Build Python wheels for nightly or official releases' on: push: tags: - 'v*' - branches: - - 'test-publish/*' workflow_dispatch: inputs: - dry_run: - description: 'Dry run (do not publish to PyPi)' + publish: + description: 'Publish to PyPi' required: false type: boolean - dev_release: + dev-release: description: 'Development release (DEV_RELEASE=1)' required: false type: boolean + schedule: + - cron: 33 6 * * 1-5 + +# In jobs we must use |*publish| instead of |inputs.publish| because we can not +# set default value for workflow_dispatch inputs reliably. 
+env: + publish: &publish ${{ inputs.publish || github.event_name == 'push' }} + pypi-env: &pypi-env ${{ (inputs.publish || github.event_name == 'push') && 'pypi' || 'dry-run' }} permissions: contents: read jobs: build_documentation: + name: Build documentation if: github.repository == 'ml-explore/mlx' runs-on: ubuntu-22.04 steps: @@ -29,7 +37,8 @@ jobs: - uses: ./.github/actions/build-docs deploy_documentation: - if: ${{ !inputs.dry_run }} + name: Deploy documentation + if: *publish needs: build_documentation permissions: pages: write @@ -43,207 +52,266 @@ jobs: id: deployment uses: actions/deploy-pages@v5 - build_linux_release: - if: github.repository == 'ml-explore/mlx' + build_frontend: + name: ${{ matrix.os }} (python-${{ matrix.python-version }}, ${{ matrix.arch }}) strategy: matrix: - python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"] + os: ['Linux', 'Windows'] arch: ['x86_64', 'aarch64'] - runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }} - env: + python-version: ['3.10', '3.11', '3.12', '3.13', '3.14'] + # There is no cp310 binary for Windows on arm. 
+ exclude: + - os: 'Windows' + arch: 'aarch64' + python-version: '3.10' + runs-on: |- + ${{ case(matrix.os == 'Windows', + case(matrix.arch == 'aarch64', 'windows-11-arm', + 'windows-2022'), + case(matrix.arch == 'aarch64', 'ubuntu-22.04-arm', + 'ubuntu-22.04')) + }} + env: &build-env PYPI_RELEASE: 1 - DEV_RELEASE: ${{ inputs.dev_release && 1 || 0 }} + DEV_RELEASE: ${{ inputs.dev-release && 1 || 0 }} steps: - uses: actions/checkout@v7 - - uses: ./.github/actions/setup-linux + - uses: ./.github/actions/setup + id: setup with: - python-version: ${{ matrix.python_version }} - use-ccache: false - - uses: ./.github/actions/build-linux-release + python-version: ${{ matrix.python-version }} + ccache-key: 'release' + ccache-save: false + - uses: ./.github/actions/build-wheel with: - build-backend: ${{ matrix.python_version == '3.10' }} - arch: ${{ matrix.arch }} - - name: Upload MLX artifacts - uses: actions/upload-artifact@v7 + cmake-args: ${{ steps.setup.outputs.cmake-args }} + build-backend: false + - uses: actions/upload-artifact@v7 with: - overwrite: true - name: linux-wheels-${{ matrix.python_version }}-${{ matrix.arch }} + name: frontend-${{ runner.os }}-${{ runner.arch }}-py${{ matrix.python-version }} path: wheelhouse/mlx-*.whl if-no-files-found: error - - name: Upload CPU artifacts - if: matrix.python_version == '3.10' - uses: actions/upload-artifact@v7 + + build_backend: + name: ${{ matrix.os }} (${{ matrix.toolkit }}, ${{ matrix.arch }}) + if: github.repository == 'ml-explore/mlx' + strategy: + matrix: + os: ['Linux', 'Windows'] + arch: ['x86_64', 'aarch64'] + toolkit: ['cpu', 'cuda-12.9', 'cuda-13.0'] + exclude: + # CUDA does not support Windows on arm. 
+ - os: 'Windows' + arch: 'aarch64' + toolkit: 'cuda-12.9' + - os: 'Windows' + arch: 'aarch64' + toolkit: 'cuda-13.0' + runs-on: |- + ${{ case(matrix.os == 'Windows', + case(matrix.arch == 'aarch64', 'windows-11-arm', + 'windows-2022'), + case(matrix.arch == 'aarch64', 'ubuntu-22-large-arm', + 'ubuntu-22-large')) + }} + env: *build-env + steps: + - uses: actions/checkout@v7 + - uses: ./.github/actions/setup + id: setup with: - overwrite: true - name: mlx-cpu-${{ matrix.arch }} - path: wheelhouse/mlx_cpu-*.whl - if-no-files-found: error + toolkit: ${{ matrix.toolkit }} + ccache-key: 'release' + - uses: ./.github/actions/build-wheel + with: + cmake-args: ${{ steps.setup.outputs.cmake-args }} + build-frontend: false + - uses: actions/upload-artifact@v7 + with: + name: backend-${{ matrix.toolkit }}-${{ runner.os }}-${{ runner.arch }} + path: wheelhouse/*.whl + if-no-files-found: ignore - build_mac_release: + build_mac_wheels: + name: macOS (python-${{ matrix.python-version }}) if: github.repository == 'ml-explore/mlx' strategy: matrix: - python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] - runs-on: [self-hosted, macos] - env: - PYPI_RELEASE: 1 - DEV_RELEASE: ${{ inputs.dev_release && 1 || 0 }} + python-version: ['3.10', '3.11', '3.12', '3.13', '3.14'] + runs-on: 'macos-26' + env: *build-env steps: - uses: actions/checkout@v7 - - uses: ./.github/actions/setup-macos + - uses: ./.github/actions/setup + id: setup with: + toolkit: 'metal' python-version: ${{ matrix.python-version }} + ccache-key: 'release' + ccache-save: ${{ matrix.python-version == '3.10' }} - name: Build macOS 14 package - uses: ./.github/actions/build-macos-release + uses: ./.github/actions/build-wheel with: macos-target: '14.0' + cmake-args: ${{ steps.setup.outputs.cmake-args }} build-backend: ${{ matrix.python-version == '3.10' }} - name: Build macOS 15 package - uses: ./.github/actions/build-macos-release + uses: ./.github/actions/build-wheel with: macos-target: '15.0' + cmake-args: ${{ 
steps.setup.outputs.cmake-args }} build-backend: ${{ matrix.python-version == '3.10' }} - name: Build macOS 26 package - uses: ./.github/actions/build-macos-release + uses: ./.github/actions/build-wheel with: macos-target: '26.2' + cmake-args: ${{ steps.setup.outputs.cmake-args }} build-backend: ${{ matrix.python-version == '3.10' }} - - name: Upload MLX artifacts + - name: Upload frontend packages uses: actions/upload-artifact@v7 with: - overwrite: true - name: mac-wheels-${{ matrix.python-version }} - path: dist/mlx-*.whl + name: frontend-${{ runner.os }}-${{ runner.arch }}-py${{ matrix.python-version }} + path: wheelhouse/mlx-*.whl if-no-files-found: error - - name: Upload Metal artifacts + - name: Upload backend packages if: matrix.python-version == '3.10' uses: actions/upload-artifact@v7 with: - overwrite: true - name: mlx-metal - path: dist/mlx_metal-*.whl + name: backend-metal-${{ runner.os }}-${{ runner.arch }} + path: wheelhouse/mlx_metal-*.whl if-no-files-found: error - build_cuda_release: + test_wheel: + name: Test (${{ matrix.os }}, ${{ matrix.toolkit }}, ${{ matrix.arch }}) if: github.repository == 'ml-explore/mlx' + needs: [build_frontend, build_backend, build_mac_wheels] strategy: matrix: - arch: ['x86_64', 'aarch64'] - toolkit: ['cuda-12.9', 'cuda-13.0'] - runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22-large' || 'ubuntu-22-large-arm' }} - env: - PYPI_RELEASE: 1 - DEV_RELEASE: ${{ inputs.dev_release && 1 || 0 }} + os: ['Linux', 'Windows'] + arch: ['aarch64'] + toolkit: ['cpu'] + include: + - os: 'Linux' + arch: 'x86_64' + toolkit: 'cpu' + - os: 'Linux' + arch: 'x86_64' + toolkit: 'cuda-12.9' + - os: 'Linux' + arch: 'x86_64' + toolkit: 'cuda-13.0' + - os: 'Windows' + arch: 'x86_64' + toolkit: 'cpu' + - os: 'macOS' + arch: 'aarch64' + toolkit: 'metal' + runs-on: |- + ${{ case(matrix.os == 'Windows', case(matrix.arch == 'aarch64', 'windows-11-arm', + 'windows-2022'), + matrix.os == 'macOS', fromJson('["self-hosted","macos"]'), + case(matrix.arch == 
'x86_64' && startsWith(matrix.toolkit, 'cuda'), 'gpu-t4-4-core', + matrix.arch == 'aarch64', 'ubuntu-22.04-arm', + 'ubuntu-22.04')) + }} steps: - uses: actions/checkout@v7 - - uses: ./.github/actions/setup-linux + - uses: ./.github/actions/setup with: toolkit: ${{ matrix.toolkit }} - ccache-key: 'ccache-release' - - name: Build Python package - uses: ./.github/actions/build-cuda-release - with: - arch: ${{ matrix.arch }} - - name: Upload artifacts - uses: actions/upload-artifact@v7 + use-ccache: false + - uses: ./.github/actions/test-wheel with: - overwrite: true - name: mlx-${{ matrix.toolkit }}-${{ matrix.arch }} - path: wheelhouse/mlx_cuda_*.whl - if-no-files-found: error + toolkit: ${{ matrix.toolkit }} - pypi-publish: - name: Upload release to PyPI + pypi-publish-frontend: + name: Publish mlx to PyPI runs-on: ubuntu-latest - needs: [build_linux_release, build_mac_release] + needs: [test_wheel] permissions: id-token: write environment: - name: ${{ inputs.dry_run && 'dry-run' || 'pypi' }} + name: *pypi-env url: https://pypi.org/p/mlx steps: - uses: actions/download-artifact@v8 with: - pattern: linux-wheels-* - merge-multiple: true - path: dist - - uses: actions/download-artifact@v8 - with: - pattern: mac-wheels-* + pattern: frontend-* merge-multiple: true path: dist - name: Display structure of downloaded files run: du -ah dist - name: Publish package distributions to PyPI - if: ${{ !inputs.dry_run }} + if: *publish uses: pypa/gh-action-pypi-publish@release/v1 with: repository-url: https://upload.pypi.org/legacy/ pypi-publish-cuda: - name: Upload CUDA release to PyPI + name: Publish mlx-cuda to PyPI runs-on: ubuntu-latest - needs: [build_cuda_release] + needs: [test_wheel] permissions: id-token: write environment: - name: ${{ inputs.dry_run && 'dry-run' || 'pypi' }} + name: *pypi-env url: https://pypi.org/p/mlx-cuda steps: - uses: actions/download-artifact@v8 with: - pattern: mlx-cuda-* + pattern: backend-cuda* merge-multiple: true path: dist - name: Display 
structure of downloaded files run: du -ah dist - name: Publish package distributions to PyPI - if: ${{ !inputs.dry_run }} + if: *publish uses: pypa/gh-action-pypi-publish@release/v1 with: repository-url: https://upload.pypi.org/legacy/ pypi-publish-cpu: - name: Upload CPU release to PyPI + name: Publish mlx-cpu to PyPI runs-on: ubuntu-latest - needs: [build_linux_release] + needs: [test_wheel] permissions: id-token: write environment: - name: ${{ inputs.dry_run && 'dry-run' || 'pypi' }} + name: *pypi-env url: https://pypi.org/p/mlx-cpu steps: - uses: actions/download-artifact@v8 with: - pattern: mlx-cpu-* + pattern: backend-cpu-* merge-multiple: true path: dist - name: Display structure of downloaded files run: du -ah dist - name: Publish package distributions to PyPI - if: ${{ !inputs.dry_run }} + if: *publish uses: pypa/gh-action-pypi-publish@release/v1 with: repository-url: https://upload.pypi.org/legacy/ pypi-publish-metal: - name: Upload Metal release to PyPI + name: Publish mlx-metal to PyPI runs-on: ubuntu-latest - needs: [build_mac_release] + needs: [test_wheel] permissions: id-token: write environment: - name: ${{ inputs.dry_run && 'dry-run' || 'pypi' }} + name: *pypi-env url: https://pypi.org/p/mlx-metal steps: - uses: actions/download-artifact@v8 with: - name: mlx-metal + pattern: backend-metal-* path: dist - name: Display structure of downloaded files run: du -ah dist - name: Publish package distributions to PyPI - if: ${{ !inputs.dry_run }} + if: *publish uses: pypa/gh-action-pypi-publish@release/v1 with: repository-url: https://upload.pypi.org/legacy/ diff --git a/.gitignore b/.gitignore index 1daaa46d12..161d8d67eb 100644 --- a/.gitignore +++ b/.gitignore @@ -11,11 +11,11 @@ __pycache__/ *.metallib # Distribution / packaging +/build/ python/mlx/core python/mlx/share python/mlx/include .Python -build/ develop-eggs/ dist/ downloads/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 3461faea5b..f8ee65ad4b 100644 --- a/CMakeLists.txt +++ 
b/CMakeLists.txt @@ -209,6 +209,7 @@ if(MLX_BUILD_METAL) if(CMAKE_OSX_DEPLOYMENT_TARGET VERSION_LESS 14.0) message(FATAL_ERROR "MLX requires macOS >= 14.0") endif() + message(STATUS "Building for macOS ${CMAKE_OSX_DEPLOYMENT_TARGET}") # Get Metal version. set(XCRUN_FLAGS "-mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET}") @@ -278,19 +279,29 @@ if(MLX_BUILD_CPU) add_compile_definitions(MLX_USE_ACCELERATE) add_compile_definitions(ACCELERATE_NEW_LAPACK) elseif(WIN32) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64") + set(OPENBLAS_ZIP "OpenBLAS-0.3.33-woa64-dll.zip") + set(OPENBLAS_LIB "openblas") + set(OPENBLAS_INC "include/openblas") + else() + set(OPENBLAS_ZIP "OpenBLAS-0.3.33-x64.zip") + set(OPENBLAS_LIB "libopenblas") + set(OPENBLAS_INC "include") + endif() # Download and link prebuilt binaries of OpenBLAS. Note that we can only # link with the dynamic library, the prebuilt binaries were built with MinGW # so static-linking would require linking with MinGW's runtime. FetchContent_Declare( openblas - URL "https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.31/OpenBLAS-0.3.31-x64.zip" + URL "https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.33/${OPENBLAS_ZIP}" ) FetchContent_MakeAvailable(openblas) - target_link_libraries(mlx - PRIVATE "${openblas_SOURCE_DIR}/lib/libopenblas.lib") - target_include_directories(mlx PRIVATE "${openblas_SOURCE_DIR}/include") + target_link_libraries( + mlx PRIVATE "${openblas_SOURCE_DIR}/lib/${OPENBLAS_LIB}.lib") + target_include_directories(mlx + PRIVATE "${openblas_SOURCE_DIR}/${OPENBLAS_INC}") # Make sure the DLL file is placed in the same dir with executables. 
- set(OPENBLAS_DLL_FILE "${openblas_SOURCE_DIR}/bin/libopenblas.dll") + set(OPENBLAS_DLL_FILE "${openblas_SOURCE_DIR}/bin/${OPENBLAS_LIB}.dll") add_custom_command( TARGET mlx POST_BUILD diff --git a/MANIFEST.in b/MANIFEST.in index d0daeb7ae6..632fae78c6 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,6 @@ include CMakeLists.txt include mlx.pc.in -recursive-include mlx/ * +recursive-include mlx * include cmake/* include python/src/* include python/mlx/py.typed # support type hinting as in PEP-561 diff --git a/mlx/backend/cuda/CMakeLists.txt b/mlx/backend/cuda/CMakeLists.txt index 10adffc23d..065220e24e 100644 --- a/mlx/backend/cuda/CMakeLists.txt +++ b/mlx/backend/cuda/CMakeLists.txt @@ -72,6 +72,10 @@ target_sources( ${CMAKE_CURRENT_SOURCE_DIR}/quantized/convert_fp8.cu ${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp) +# Put dynamic defines in the dirs.cpp file. +add_library(mlx_dirs OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/dirs.cpp) +target_link_libraries(mlx PRIVATE $) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/binary) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unary) @@ -197,8 +201,8 @@ if(WIN32) # Pass the locations where CUDA DLLs are placed. if(NOT MLX_LOAD_CUDA_LIBS_FROM_PYTHON) target_compile_definitions( - mlx PUBLIC MLX_CUDA_BIN_DIR="${CUDAToolkit_BIN_DIR}/x64" - MLX_CUDNN_BIN_DIR="${CUDNN_BIN_DIR}") + mlx_dirs PRIVATE MLX_CUDA_BIN_DIR="${CUDAToolkit_BIN_DIR}/x64" + MLX_CUDNN_BIN_DIR="${CUDNN_BIN_DIR}") endif() else() # For POSIX we rely on RPATH to search for CUDA libs. @@ -208,9 +212,11 @@ else() APPEND PROPERTY INSTALL_RPATH # The paths here should match the install_requires in setup.py. 
+ "$ORIGIN/../../nvidia/cu13/lib" "$ORIGIN/../../nvidia/cublas/lib" "$ORIGIN/../../nvidia/cuda_nvrtc/lib" "$ORIGIN/../../nvidia/cudnn/lib" + "$ORIGIN/../../nvidia/cufft/lib" "$ORIGIN/../../nvidia/nccl/lib") endif() endif() @@ -233,7 +239,7 @@ install(DIRECTORY ${cccl_SOURCE_DIR}/include/nv # The binary of C++ tests will not be installed so it can not find the CCCL # headers, and we have to hard-code the path. if(MLX_BUILD_TESTS) - target_compile_definitions(mlx + target_compile_definitions(mlx_dirs PRIVATE MLX_CCCL_DIR="${cccl_SOURCE_DIR}/include") endif() diff --git a/mlx/backend/cuda/delayload.cpp b/mlx/backend/cuda/delayload.cpp index 0ac1cc5e4b..aba7566c5b 100644 --- a/mlx/backend/cuda/delayload.cpp +++ b/mlx/backend/cuda/delayload.cpp @@ -7,7 +7,11 @@ #include // clang-format on -namespace mlx::core { +namespace mlx::core::cu { + +// Defined in dirs.cpp to avoid invalidating compile cache. +const char* cuda_bin_dir(); +const char* cudnn_bin_dir(); namespace fs = std::filesystem; @@ -15,35 +19,27 @@ inline fs::path relative_to_current_binary(const char* relative) { return fs::absolute(current_binary_dir() / relative); } -inline fs::path cublas_bin_dir() { -#if defined(MLX_CUDA_BIN_DIR) - return MLX_CUDA_BIN_DIR; -#else - return relative_to_current_binary("../nvidia/cublas/bin"); -#endif +inline fs::path cublas_dir() { + return cuda_bin_dir() ? fs::path(cuda_bin_dir()) + : relative_to_current_binary("../nvidia/cublas/bin"); } fs::path load_nvrtc() { -#if defined(MLX_CUDA_BIN_DIR) - fs::path nvrtc_bin_dir = MLX_CUDA_BIN_DIR; -#else - fs::path nvrtc_bin_dir = - relative_to_current_binary("../nvidia/cuda_nvrtc/bin"); -#endif + fs::path nvrtc_dir = cuda_bin_dir() + ? fs::path(cuda_bin_dir()) + : relative_to_current_binary("../nvidia/cuda_nvrtc/bin"); // Internally nvrtc loads some libs dynamically, add to search dirs. 
- ::AddDllDirectory(nvrtc_bin_dir.c_str()); - return nvrtc_bin_dir; + ::AddDllDirectory(nvrtc_dir.c_str()); + return nvrtc_dir; } fs::path load_cudnn() { -#if defined(MLX_CUDNN_BIN_DIR) - fs::path cudnn_bin_dir = MLX_CUDNN_BIN_DIR; -#else - fs::path cudnn_bin_dir = relative_to_current_binary("../nvidia/cudnn/bin"); -#endif + fs::path cudnn_dir = cudnn_bin_dir() + ? fs::path(cudnn_bin_dir()) + : relative_to_current_binary("../nvidia/cudnn/bin"); // Must load cudnn_graph64_9.dll before locating symbols, otherwise We would // get errors like "Invalid handle. Cannot load symbol cudnnCreate". - for (const auto& dll : fs::directory_iterator(cudnn_bin_dir)) { + for (const auto& dll : fs::directory_iterator(cudnn_dir)) { if (dll.path().filename().string().starts_with("cudnn_graph") && dll.path().extension() == ".dll") { ::LoadLibraryW(dll.path().c_str()); @@ -52,9 +48,9 @@ fs::path load_cudnn() { } // Internally cuDNN loads some libs dynamically, add to search dirs. load_nvrtc(); - ::AddDllDirectory(cudnn_bin_dir.c_str()); - ::AddDllDirectory(cublas_bin_dir().c_str()); - return cudnn_bin_dir; + ::AddDllDirectory(cudnn_dir.c_str()); + ::AddDllDirectory(cublas_dir().c_str()); + return cudnn_dir; } // Called by system when failed to locate a lazy-loaded DLL. 
@@ -63,18 +59,19 @@ FARPROC WINAPI delayload_helper(unsigned dliNotify, PDelayLoadInfo pdli) { if (dliNotify == dliNotePreLoadLibrary) { std::string dll = pdli->szDll; if (dll.starts_with("cudnn")) { - static auto cudnn_bin_dir = load_cudnn(); - mod = ::LoadLibraryW((cudnn_bin_dir / dll).c_str()); + static auto cudnn_dir = load_cudnn(); + mod = ::LoadLibraryW((cudnn_dir / dll).c_str()); } else if (dll.starts_with("cublas")) { - mod = ::LoadLibraryW((cublas_bin_dir() / dll).c_str()); + mod = ::LoadLibraryW((cublas_dir() / dll).c_str()); } else if (dll.starts_with("nvrtc")) { - static auto nvrtc_bin_dir = load_nvrtc(); - mod = ::LoadLibraryW((nvrtc_bin_dir / dll).c_str()); + static auto nvrtc_dir = load_nvrtc(); + mod = ::LoadLibraryW((nvrtc_dir / dll).c_str()); } } return reinterpret_cast<FARPROC>(mod); } -} // namespace mlx::core +} // namespace mlx::core::cu -extern "C" const PfnDliHook __pfnDliNotifyHook2 = mlx::core::delayload_helper; +extern "C" const PfnDliHook __pfnDliNotifyHook2 = + mlx::core::cu::delayload_helper; diff --git a/mlx/backend/cuda/dirs.cpp b/mlx/backend/cuda/dirs.cpp new file mode 100644 index 0000000000..a9d33b4790 --- /dev/null +++ b/mlx/backend/cuda/dirs.cpp @@ -0,0 +1,29 @@ +// Copyright © 2026 Apple Inc. + +namespace mlx::core::cu { + +const char* cccl_dir() { +#if defined(MLX_CCCL_DIR) + return MLX_CCCL_DIR; +#else + return nullptr; +#endif +} + +const char* cuda_bin_dir() { +#if defined(MLX_CUDA_BIN_DIR) + return MLX_CUDA_BIN_DIR; +#else + return nullptr; +#endif +} + +const char* cudnn_bin_dir() { +#if defined(MLX_CUDNN_BIN_DIR) + return MLX_CUDNN_BIN_DIR; +#else + return nullptr; +#endif +} + +} // namespace mlx::core::cu diff --git a/mlx/backend/cuda/jit_module.cpp b/mlx/backend/cuda/jit_module.cpp index 3a493fd14e..3de1ddb018 100644 --- a/mlx/backend/cuda/jit_module.cpp +++ b/mlx/backend/cuda/jit_module.cpp @@ -16,6 +16,9 @@ namespace mlx::core::cu { +// Defined in dirs.cpp to avoid invalidating compile cache. 
+const char* cccl_dir(); + namespace { #define CHECK_NVRTC_ERROR(cmd) check_nvrtc_error(#cmd, (cmd)) @@ -61,11 +64,9 @@ const std::vector& include_path_args() { } // Add path to CCCL headers. path = path / "cccl"; -#if defined(MLX_CCCL_DIR) - if (!std::filesystem::exists(path)) { - path = MLX_CCCL_DIR; + if (!std::filesystem::exists(path) && cccl_dir()) { + path = cccl_dir(); } -#endif if (std::filesystem::exists(path)) { args.push_back(fmt::format("--include-path={}", path.string())); } diff --git a/setup.py b/setup.py index b27abab0c8..3c2f138048 100644 --- a/setup.py +++ b/setup.py @@ -68,6 +68,14 @@ def __init__(self, name: str, sourcedir: str = "") -> None: class CMakeBuild(build_ext): + def finalize_options(self) -> None: + super().finalize_options() + + # Setuptools does some clever things for Windows to make it + # more "native" but eventually made our life harder, revert back. + if platform.system() == "Windows": + self.build_temp = os.path.dirname(self.build_temp) + def build_extension(self, ext: CMakeExtension) -> None: # Must be in this form due to bug in .resolve() only fixed in Python 3.10+ ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name) # type: ignore[no-untyped-call] @@ -99,29 +107,31 @@ def build_extension(self, ext: CMakeExtension) -> None: "-DMLX_BUILD_EXAMPLES=OFF", "-DBUILD_SHARED_LIBS=ON", ] + + # Adding CMake arguments set as environment variable + # (needed e.g. to build for ARM OSx on conda-forge) + if "CMAKE_ARGS" in os.environ: + cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item] + + # For release wheel force building for all supported arches. 
if build_stage == 2 and build_cuda: # Last arch is always real and virtual for forward-compatibility - cuda_archs = ";".join( - ( - "75-real", - "80-real", + cuda_archs = [ + "75-real", + "80-real", + "120a-real", + "120-virtual", + ] + if platform.system() == "Linux": + cuda_archs += [ "90a-real", "100a-real", - "120a-real", - "120-virtual", - ) - ) - cmake_args += [f"-DMLX_CUDA_ARCHITECTURES={cuda_archs}"] + "121a-real", + ] + cmake_args += [f"-DMLX_CUDA_ARCHITECTURES={';'.join(cuda_archs)}"] # Search CUDA libs from python packages. cmake_args += ["-DMLX_LOAD_CUDA_LIBS_FROM_PYTHON=ON"] - # Some generators require explcitly passing config when building. - build_args = ["--config", cfg] - # Adding CMake arguments set as environment variable - # (needed e.g. to build for ARM OSx on conda-forge) - if "CMAKE_ARGS" in os.environ: - cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item] - # Pass version to C++ cmake_args += [f"-DMLX_VERSION={self.distribution.get_version()}"] # type: ignore[attr-defined] @@ -131,13 +141,17 @@ def build_extension(self, ext: CMakeExtension) -> None: if archs: cmake_args += ["-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))] + # Some generators require explicitly passing config when building. + build_args = ["--config", cfg] + # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level # across all generators. if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ: build_args += [f"-j{os.cpu_count()}"] # Avoid cache miss when building from temporary dirs. 
- os.environ["CCACHE_BASEDIR"] = os.path.realpath(self.build_temp) + os.environ["CCACHE_BASEDIR"] = os.path.realpath(build_temp) + os.environ["CCACHE_IGNOREHEADERS"] = os.path.realpath(build_temp) os.environ["CCACHE_NOHASHDIR"] = "true" subprocess.run( @@ -289,24 +303,29 @@ def get_tag(self) -> tuple[str, str, str]: toolkit = cuda_toolkit_major_version() name = f"mlx-cuda-{toolkit}" # Note: update following files when new dependency is added: - # * .github/actions/build-cuda-release/action.yml + # * .github/actions/build-wheel/action.yml # * mlx/backend/cuda/CMakeLists.txt + install_requires += [ + f"nvidia-cudnn-cu{toolkit}==9.*", + ] + if platform.system() == "Linux": + install_requires += [ + f"nvidia-nccl-cu{toolkit}", + ] if toolkit == 12: install_requires += [ "nvidia-cublas-cu12==12.9.*", + "nvidia-cufft-cu12==11.4.*", "nvidia-cuda-nvrtc-cu12==12.9.*", ] elif toolkit == 13: install_requires += [ "nvidia-cublas", + "nvidia-cufft", "nvidia-cuda-nvrtc", ] else: raise ValueError(f"Unknown toolkit {toolkit}") - install_requires += [ - f"nvidia-cudnn-cu{toolkit}==9.*", - f"nvidia-nccl-cu{toolkit}", - ] else: name = "mlx-cpu"