diff --git a/.github/workflows/docker_image.yml b/.github/workflows/docker_image.yml
index 20c3b594d8ac..eff5b389a1af 100644
--- a/.github/workflows/docker_image.yml
+++ b/.github/workflows/docker_image.yml
@@ -21,7 +21,8 @@ on:
       - main
     paths:
       - '.github/workflows/docker_image.yml'
-      - '.github/workflows/util/install-resources.sh'
+      - '.github/workflows/util/install-spark-deps.sh'
+      - '.github/workflows/util/install-spark-resources.sh'
       - 'dev/docker/*'
       - 'dev/docker/cudf/*'
       - 'dev/docker/ubuntu/*'
diff --git a/.github/workflows/flink.yml b/.github/workflows/flink.yml
index 5bd953c77cf8..b53a8f62c5ec 100644
--- a/.github/workflows/flink.yml
+++ b/.github/workflows/flink.yml
@@ -60,7 +60,7 @@ jobs:
           source /opt/rh/gcc-toolset-11/enable
           sudo dnf install -y patchelf
           sudo yum install https://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os/Packages/tzdata-2025a-1.el9.noarch.rpm -y
-          sudo .github/workflows/util/install-flink-resources.sh
+          sudo .github/workflows/util/install-flink-deps.sh
           git clone -b gluten-0530 https://github.com/bigo-sg/velox4j.git
           cd velox4j && git reset --hard 889bafcf2fa04e8c31a30edbdf40fe203ef58484
           git apply $GITHUB_WORKSPACE/gluten-flink/patches/fix-velox4j.patch
diff --git a/.github/workflows/util/install-flink-resources.sh b/.github/workflows/util/install-flink-deps.sh
similarity index 100%
rename from .github/workflows/util/install-flink-resources.sh
rename to .github/workflows/util/install-flink-deps.sh
diff --git a/.github/workflows/util/install-resources.sh b/.github/workflows/util/install-spark-deps.sh
similarity index 56%
rename from .github/workflows/util/install-resources.sh
rename to .github/workflows/util/install-spark-deps.sh
index fb54080cce4f..71ddc69e2785 100755
--- a/.github/workflows/util/install-resources.sh
+++ b/.github/workflows/util/install-spark-deps.sh
@@ -18,8 +18,7 @@
 # for spark.test.home in mvn test.
 #
 # This file can be:
-# 1. Executed directly: ./install-resources.sh [install-dir]
-# 2. Sourced to use functions: source install-resources.sh; install_hadoop; setup_hdfs
+# Sourced to use functions: source install-spark-deps.sh; install_hadoop; setup_hdfs
 
 set -e
 
@@ -152,112 +151,3 @@ function setup_minio {
   mc alias set s3local http://localhost:9100 "$MINIO_ROOT_USER" "$MINIO_ROOT_PASSWORD"
   mc mb -p s3local/gluten-it || true
 }
-
-# Installs Spark binary and source releases with:
-# 1 - spark version
-# 2 - hadoop version
-# 3 - scala version
-function install_spark() {
-  local spark_version="$1"
-  local hadoop_version="$2"
-  local scala_version="$3"
-  local spark_version_short=$(echo "${spark_version}" | cut -d '.' -f 1,2 | tr -d '.')
-  local scala_suffix=$([ "${scala_version}" == '2.13' ] && echo '-scala-2.13' || echo '')
-  local scala_suffix_short=$([ "${scala_version}" == '2.13' ] && echo '-scala2.13' || echo '')
-  local mirror_host='https://www.apache.org/dyn/closer.lua/'
-  local mirror_host2='https://mirror.lyrahosting.com/apache/' # Fallback mirror due to closer.lua slowness
-  local url_query='?action=download'
-  local checksum_suffix='sha512'
-  local url_path="spark/spark-${spark_version}/"
-  local local_binary="spark-${spark_version}-bin-hadoop${hadoop_version}${scala_suffix_short}.tgz"
-  local local_binary_checksum="${local_binary}.${checksum_suffix}"
-  local local_source="spark-${spark_version}.tgz"
-  local local_source_checksum="${local_source}.${checksum_suffix}"
-  local remote_binary="${mirror_host2}${url_path}${local_binary}${url_query}"
-  local remote_binary_checksum="${mirror_host}${url_path}${local_binary_checksum}${url_query}"
-  local remote_source="${mirror_host2}${url_path}${local_source}${url_query}"
-  local remote_source_checksum="${mirror_host}${url_path}${local_source_checksum}${url_query}"
-  local wget_opts="--no-verbose --no-check-certificate"
-
-  wget ${wget_opts} -O "${local_binary}" "${remote_binary}"
-  wget ${wget_opts} -O "${local_source}" "${remote_source}"
-
-  # Checksum may not have been specified; don't check if doesn't exist
-  if [ "$(command -v shasum)" ]; then
-    wget ${wget_opts} -O "${local_binary_checksum}" "${remote_binary_checksum}"
-    if ! shasum -a 512 -c "${local_binary_checksum}" > /dev/null ; then
-      echo "Bad checksum from ${remote_binary_checksum}"
-      rm -f "${local_binary_checksum}"
-      exit 2
-    fi
-    rm -f "${local_binary_checksum}"
-
-    wget ${wget_opts} -O "${local_source_checksum}" "${remote_source_checksum}"
-    if ! shasum -a 512 -c "${local_source_checksum}" > /dev/null ; then
-      echo "Bad checksum from ${remote_source_checksum}"
-      rm -f "${local_source_checksum}"
-      exit 2
-    fi
-    rm -f "${local_source_checksum}"
-  else
-    echo "Skipping checksum because shasum is not installed." 1>&2
-  fi
-
-  tar --strip-components=1 -xf "${local_binary}" spark-"${spark_version}"-bin-hadoop"${hadoop_version}""${scala_suffix_short}"/jars/ \
-    spark-"${spark_version}"-bin-hadoop"${hadoop_version}""${scala_suffix_short}"/python \
-    spark-"${spark_version}"-bin-hadoop"${hadoop_version}""${scala_suffix_short}"/bin
-  mkdir -p ${INSTALL_DIR}/shims/spark"${spark_version_short}""${scala_suffix}"/spark_home/assembly/target/scala-"${scala_version}"
-  mv jars ${INSTALL_DIR}/shims/spark"${spark_version_short}""${scala_suffix}"/spark_home/assembly/target/scala-"${scala_version}"
-  mv python ${INSTALL_DIR}/shims/spark"${spark_version_short}""${scala_suffix}"/spark_home
-  mv bin ${INSTALL_DIR}/shims/spark"${spark_version_short}""${scala_suffix}"/spark_home
-
-  tar --strip-components=1 -xf "${local_source}" spark-"${spark_version}"/sql/core/src/test/resources/
-  mkdir -p shims/spark"${spark_version_short}${scala_suffix}"/spark_home/
-  mv sql shims/spark"${spark_version_short}${scala_suffix}"/spark_home/
-
-  rm -rf "${local_binary}"
-  rm -rf "${local_source}"
-}
-
-# Only run install_spark when script is executed directly (not sourced)
-if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
-  INSTALL_DIR=${2:-/opt/}
-  mkdir -p ${INSTALL_DIR}
-
-  case "$1" in
-  3.3)
-    # Spark-3.3
-    cd ${INSTALL_DIR} && \
-    install_spark "3.3.1" "3" "2.12"
-    ;;
-  3.4)
-    # Spark-3.4
-    cd ${INSTALL_DIR} && \
-    install_spark "3.4.4" "3" "2.12"
-    ;;
-  3.5)
-    # Spark-3.5
-    cd ${INSTALL_DIR} && \
-    install_spark "3.5.5" "3" "2.12"
-    ;;
-  3.5-scala2.13)
-    # Spark-3.5, scala 2.13
-    cd ${INSTALL_DIR} && \
-    install_spark "3.5.5" "3" "2.13"
-    ;;
-  4.0)
-    # Spark-4.0, scala 2.12 // using 2.12 as a hack as 4.0 does not have 2.13 suffix
-    cd ${INSTALL_DIR} && \
-    install_spark "4.0.1" "3" "2.12"
-    ;;
-  4.1)
-    # Spark-4.x, scala 2.12 // using 2.12 as a hack as 4.0 does not have 2.13 suffix
-    cd ${INSTALL_DIR} && \
-    install_spark "4.1.1" "3" "2.12"
-    ;;
-  *)
-    echo "Spark version is expected to be specified."
-    exit 1
-    ;;
-  esac
-fi
diff --git a/.github/workflows/util/install-spark-resources.sh b/.github/workflows/util/install-spark-resources.sh
new file mode 100755
index 000000000000..31defd69b845
--- /dev/null
+++ b/.github/workflows/util/install-spark-resources.sh
@@ -0,0 +1,133 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Download Spark resources, required by some Spark UTs. The resource path should be set
+# for spark.test.home in mvn test.
+#
+# This file can be:
+# Executed directly: ./install-spark-resources.sh <spark-version> [install-dir]
+
+set -e
+
+# Installs Spark binary and source releases with:
+# 1 - spark version
+# 2 - hadoop version
+# 3 - scala version
+function install_spark() {
+  local spark_version="$1"
+  local hadoop_version="$2"
+  local scala_version="$3"
+  local spark_version_short=$(echo "${spark_version}" | cut -d '.' -f 1,2 | tr -d '.')
+  local scala_suffix=$([ "${scala_version}" == '2.13' ] && echo '-scala-2.13' || echo '')
+  local scala_suffix_short=$([ "${scala_version}" == '2.13' ] && echo '-scala2.13' || echo '')
+  local mirror_host='https://www.apache.org/dyn/closer.lua/'
+  local mirror_host2='https://mirror.lyrahosting.com/apache/' # Fallback mirror due to closer.lua slowness
+  local url_query='?action=download'
+  local checksum_suffix='sha512'
+  local url_path="spark/spark-${spark_version}/"
+  local local_binary="spark-${spark_version}-bin-hadoop${hadoop_version}${scala_suffix_short}.tgz"
+  local local_binary_checksum="${local_binary}.${checksum_suffix}"
+  local local_source="spark-${spark_version}.tgz"
+  local local_source_checksum="${local_source}.${checksum_suffix}"
+  local remote_binary="${mirror_host2}${url_path}${local_binary}${url_query}"
+  local remote_binary_checksum="${mirror_host}${url_path}${local_binary_checksum}${url_query}"
+  local remote_source="${mirror_host2}${url_path}${local_source}${url_query}"
+  local remote_source_checksum="${mirror_host}${url_path}${local_source_checksum}${url_query}"
+  local wget_opts="--no-verbose --no-check-certificate"
+
+  wget ${wget_opts} -O "${local_binary}" "${remote_binary}"
+  wget ${wget_opts} -O "${local_source}" "${remote_source}"
+
+  # Checksum may not have been specified; don't check if doesn't exist
+  if [ "$(command -v shasum)" ]; then
+    wget ${wget_opts} -O "${local_binary_checksum}" "${remote_binary_checksum}"
+    if ! shasum -a 512 -c "${local_binary_checksum}" > /dev/null ; then
+      echo "Bad checksum from ${remote_binary_checksum}"
+      rm -f "${local_binary_checksum}"
+      exit 2
+    fi
+    rm -f "${local_binary_checksum}"
+
+    wget ${wget_opts} -O "${local_source_checksum}" "${remote_source_checksum}"
+    if ! shasum -a 512 -c "${local_source_checksum}" > /dev/null ; then
+      echo "Bad checksum from ${remote_source_checksum}"
+      rm -f "${local_source_checksum}"
+      exit 2
+    fi
+    rm -f "${local_source_checksum}"
+  else
+    echo "Skipping checksum because shasum is not installed." 1>&2
+  fi
+
+  tar --strip-components=1 -xf "${local_binary}" spark-"${spark_version}"-bin-hadoop"${hadoop_version}""${scala_suffix_short}"/jars/ \
+    spark-"${spark_version}"-bin-hadoop"${hadoop_version}""${scala_suffix_short}"/python \
+    spark-"${spark_version}"-bin-hadoop"${hadoop_version}""${scala_suffix_short}"/bin
+  mkdir -p ${INSTALL_DIR}/shims/spark"${spark_version_short}""${scala_suffix}"/spark_home/assembly/target/scala-"${scala_version}"
+  mv jars ${INSTALL_DIR}/shims/spark"${spark_version_short}""${scala_suffix}"/spark_home/assembly/target/scala-"${scala_version}"
+  mv python ${INSTALL_DIR}/shims/spark"${spark_version_short}""${scala_suffix}"/spark_home
+  mv bin ${INSTALL_DIR}/shims/spark"${spark_version_short}""${scala_suffix}"/spark_home
+
+  tar --strip-components=1 -xf "${local_source}" spark-"${spark_version}"/sql/core/src/test/resources/
+  mkdir -p shims/spark"${spark_version_short}${scala_suffix}"/spark_home/
+  mv sql shims/spark"${spark_version_short}${scala_suffix}"/spark_home/
+
+  rm -rf "${local_binary}"
+  rm -rf "${local_source}"
+}
+
+# Only run install_spark when script is executed directly (not sourced)
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  INSTALL_DIR=${2:-/opt/}
+  mkdir -p ${INSTALL_DIR}
+
+  case "$1" in
+  3.3)
+    # Spark-3.3
+    cd ${INSTALL_DIR} && \
+    install_spark "3.3.1" "3" "2.12"
+    ;;
+  3.4)
+    # Spark-3.4
+    cd ${INSTALL_DIR} && \
+    install_spark "3.4.4" "3" "2.12"
+    ;;
+  3.5)
+    # Spark-3.5
+    cd ${INSTALL_DIR} && \
+    install_spark "3.5.5" "3" "2.12"
+    ;;
+  3.5-scala2.13)
+    # Spark-3.5, scala 2.13
+    cd ${INSTALL_DIR} && \
+    install_spark "3.5.5" "3" "2.13"
+    ;;
+  4.0)
+    # Spark-4.0, scala 2.12 // using 2.12 as a hack as 4.0 does not have 2.13 suffix
+    cd ${INSTALL_DIR} && \
+    install_spark "4.0.1" "3" "2.12"
+    mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13
+    ;;
+  4.1)
+    # Spark-4.1, scala 2.12 // using 2.12 as a hack as 4.1 does not have 2.13 suffix
+    cd ${INSTALL_DIR} && \
+    install_spark "4.1.1" "3" "2.12"
+    ;;
+  *)
+    echo "Spark version is expected to be specified."
+    exit 1
+    ;;
+  esac
+fi
diff --git a/.github/workflows/velox_backend_enhanced.yml b/.github/workflows/velox_backend_enhanced.yml
index f6f0f8588180..288f03d26017 100644
--- a/.github/workflows/velox_backend_enhanced.yml
+++ b/.github/workflows/velox_backend_enhanced.yml
@@ -228,10 +228,6 @@ jobs:
         with:
           name: arrow-jars-enhanced-centos-7-${{github.sha}}
           path: /root/.m2/repository/org/apache/arrow/
-      - name: Prepare Spark Resources for Spark 3.5.5
-        run: |
-          rm -rf /opt/shims/spark35
-          bash .github/workflows/util/install-resources.sh 3.5
       - name: Build and Run unit test for Spark 3.5.5 (slow tests)
         run: |
           cd $GITHUB_WORKSPACE/
@@ -286,8 +282,7 @@
       - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update
         run: |
          rm -rf /opt/shims/spark40
-          bash .github/workflows/util/install-resources.sh 4.0
-          mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13
+          bash .github/workflows/util/install-spark-resources.sh 4.0
       - name: Build and Run unit test for Spark 4.0.0 with scala-2.13 (other tests)
         run: |
           cd $GITHUB_WORKSPACE/
diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml
index a39fc65e87d1..75b700ad9be2 100644
--- a/.github/workflows/velox_backend_x86.yml
+++ b/.github/workflows/velox_backend_x86.yml
@@ -19,7 +19,8 @@ on:
   pull_request:
     paths:
       - '.github/workflows/velox_backend_x86.yml'
-      - '.github/workflows/util/install-resources.sh' #TODO remove after image update
+      - '.github/workflows/util/install-spark-deps.sh' #TODO remove after image update
+      - '.github/workflows/util/install-spark-resources.sh' #TODO remove after image update
       - 'pom.xml'
       - 'backends-velox/**'
       - 'gluten-uniffle/**'
@@ -180,7 +181,7 @@ jobs:
         shell: bash
         run: |
           export JAVA_HOME=/usr/lib/jvm/${{ matrix.java }}-openjdk-amd64
-          source .github/workflows/util/install-resources.sh
+          source .github/workflows/util/install-spark-deps.sh
           install_hadoop
           setup_hdfs
       - name: Install MinIO
@@ -188,7 +189,7 @@
         shell: bash
         run: |
           export JAVA_HOME=/usr/lib/jvm/${{ matrix.java }}-openjdk-amd64
-          source .github/workflows/util/install-resources.sh
+          source .github/workflows/util/install-spark-deps.sh
           install_minio
       - name: Build and run TPC-H / TPC-DS
         shell: bash
@@ -210,7 +211,7 @@
           if [ "${{ matrix.os }}" = "ubuntu:22.04" ] && \
             [ "${{ matrix.spark }}" = "spark-3.5" ] && \
             [ "${{ matrix.java }}" = "java-8" ]; then
-            source $GITHUB_WORKSPACE/.github/workflows/util/install-resources.sh
+            source $GITHUB_WORKSPACE/.github/workflows/util/install-spark-deps.sh
             SPARK_VERSION=$(echo "${{ matrix.spark }}" | sed 's/spark-//')
             setup_minio "$SPARK_VERSION"
           fi
@@ -1259,8 +1260,7 @@
       - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update
         run: |
           rm -rf /opt/shims/spark40
-          bash .github/workflows/util/install-resources.sh 4.0
-          mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13
+          bash .github/workflows/util/install-spark-resources.sh 4.0
       - name: Build and Run unit test for Spark 4.0.0 with scala-2.13 (other tests)
         run: |
           cd $GITHUB_WORKSPACE/
@@ -1309,8 +1309,7 @@
       - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update
         run: |
           rm -rf /opt/shims/spark40
-          bash .github/workflows/util/install-resources.sh 4.0
-          mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13
+          bash .github/workflows/util/install-spark-resources.sh 4.0
       - name: Build and Run unit test for Spark 4.0 (slow tests)
         run: |
           cd $GITHUB_WORKSPACE/
@@ -1367,7 +1366,7 @@
       - name: Prepare Spark Resources for Spark 4.1.0 #TODO remove after image update
         run: |
           rm -rf /opt/shims/spark41
-          bash .github/workflows/util/install-resources.sh 4.1
+          bash .github/workflows/util/install-spark-resources.sh 4.1
           mv /opt/shims/spark41/spark_home/assembly/target/scala-2.12 /opt/shims/spark41/spark_home/assembly/target/scala-2.13
       - name: Build and Run unit test for Spark 4.1.0 with scala-2.13 (other tests)
         run: |
@@ -1417,7 +1416,7 @@
       - name: Prepare Spark Resources for Spark 4.1.0 #TODO remove after image update
         run: |
           rm -rf /opt/shims/spark41
-          bash .github/workflows/util/install-resources.sh 4.1
+          bash .github/workflows/util/install-spark-resources.sh 4.1
           mv /opt/shims/spark41/spark_home/assembly/target/scala-2.12 /opt/shims/spark41/spark_home/assembly/target/scala-2.13
       - name: Build and Run unit test for Spark 4.0 (slow tests)
         run: |
diff --git a/dev/docker/Dockerfile.centos8-dynamic-build b/dev/docker/Dockerfile.centos8-dynamic-build
index c449921634b3..b87e234bab23 100644
--- a/dev/docker/Dockerfile.centos8-dynamic-build
+++ b/dev/docker/Dockerfile.centos8-dynamic-build
@@ -38,11 +38,11 @@ RUN set -ex; \
     wget -nv ${mirror_host}/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz?action=download -O /opt/hadoop-2.8.5.tar.gz; \
     git clone --depth=1 https://github.com/apache/gluten /opt/gluten; \
     cd /opt/gluten/.github/workflows/util/; \
-    ./install-resources.sh 3.3; \
-    ./install-resources.sh 3.4; \
-    ./install-resources.sh 3.5; \
-    ./install-resources.sh 3.5-scala2.13; \
-    ./install-resources.sh 4.0; \
+    ./install-spark-resources.sh 3.3; \
+    ./install-spark-resources.sh 3.4; \
+    ./install-spark-resources.sh 3.5; \
+    ./install-spark-resources.sh 3.5-scala2.13; \
+    ./install-spark-resources.sh 4.0; \
     ARCH=$(uname -m); \
     if [[ "$ARCH" == "aarch64" || "$ARCH" == "ppc64le" ]]; then \
       export CPU_TARGET="$ARCH"; \
diff --git a/dev/docker/Dockerfile.centos9-dynamic-build b/dev/docker/Dockerfile.centos9-dynamic-build
index 55cc4dcc06b1..2034bf01e77f 100644
--- a/dev/docker/Dockerfile.centos9-dynamic-build
+++ b/dev/docker/Dockerfile.centos9-dynamic-build
@@ -36,11 +36,11 @@ RUN set -ex; \
     wget -nv ${mirror_host}/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz?action=download -O /opt/hadoop-2.8.5.tar.gz; \
     git clone --depth=1 https://github.com/apache/gluten /opt/gluten; \
    cd /opt/gluten/.github/workflows/util/; \
-    ./install-resources.sh 3.3; \
-    ./install-resources.sh 3.4; \
-    ./install-resources.sh 3.5; \
-    ./install-resources.sh 3.5-scala2.13; \
-    ./install-resources.sh 4.0; \
+    ./install-spark-resources.sh 3.3; \
+    ./install-spark-resources.sh 3.4; \
+    ./install-spark-resources.sh 3.5; \
+    ./install-spark-resources.sh 3.5-scala2.13; \
+    ./install-spark-resources.sh 4.0; \
     ARCH=$(uname -m); \
     if [[ "$ARCH" == "aarch64" || "$ARCH" == "ppc64le" ]]; then \
       export CPU_TARGET="$ARCH"; \
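
After this split, install-spark-deps.sh is only ever sourced for its helper functions, while install-spark-resources.sh is only executed directly. As an illustration of the sourced mode, here is a minimal sketch that mirrors the workflow steps above (assumptions: it runs from the repository root and JAVA_HOME is already exported, as in the CI jobs):

    # Sourcing defines the helpers (and set -e); nothing is installed yet.
    source .github/workflows/util/install-spark-deps.sh
    install_hadoop      # install Hadoop (companion to setup_hdfs)
    setup_hdfs          # bring up a local HDFS for the unit tests
    install_minio       # install MinIO for the S3-backed tests
    setup_minio "3.5"   # start MinIO on localhost:9100 and create the s3local/gluten-it bucket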
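
For the executed mode, a hedged sketch of a direct invocation and the layout install_spark leaves behind; the paths follow the mkdir/mv calls in the script body, assuming the default /opt/ install dir and the 3.5 case label as an example:

    # Fetch the Spark 3.5.5 binary and source releases, verifying sha512 sums when shasum is available.
    bash .github/workflows/util/install-spark-resources.sh 3.5
    # Expected layout (pointed to by spark.test.home in mvn test):
    #   /opt/shims/spark35/spark_home/bin
    #   /opt/shims/spark35/spark_home/python
    #   /opt/shims/spark35/spark_home/assembly/target/scala-2.12/jars
    #   /opt/shims/spark35/spark_home/sql/core/src/test/resources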