From a0beb85693a3212d44df76eec6c68650d696ef88 Mon Sep 17 00:00:00 2001 From: Yao-MR Date: Wed, 15 Apr 2026 15:38:15 +0800 Subject: [PATCH 1/4] [MINOR][DOCS]: Standardize flink && spark resource set script name to install-spark-deps.sh && install-flink-deps.sh --- .github/workflows/docker_image.yml | 3 +- .github/workflows/flink.yml | 2 +- ...ink-resources.sh => install-flink-deps.sh} | 0 ...all-resources.sh => install-spark-deps.sh} | 112 +-------------- .../workflows/util/install-spark-resources.sh | 132 ++++++++++++++++++ .github/workflows/velox_backend_enhanced.yml | 4 +- .github/workflows/velox_backend_x86.yml | 17 +-- dev/docker/Dockerfile.centos8-dynamic-build | 10 +- dev/docker/Dockerfile.centos9-dynamic-build | 10 +- 9 files changed, 157 insertions(+), 133 deletions(-) rename .github/workflows/util/{install-flink-resources.sh => install-flink-deps.sh} (100%) rename .github/workflows/util/{install-resources.sh => install-spark-deps.sh} (56%) create mode 100755 .github/workflows/util/install-spark-resources.sh diff --git a/.github/workflows/docker_image.yml b/.github/workflows/docker_image.yml index 20c3b594d8ac..eff5b389a1af 100644 --- a/.github/workflows/docker_image.yml +++ b/.github/workflows/docker_image.yml @@ -21,7 +21,8 @@ on: - main paths: - '.github/workflows/docker_image.yml' - - '.github/workflows/util/install-resources.sh' + - '.github/workflows/util/install-spark-deps.sh' + - '.github/workflows/util/install-spark-resources.sh' - 'dev/docker/*' - 'dev/docker/cudf/*' - 'dev/docker/ubuntu/*' diff --git a/.github/workflows/flink.yml b/.github/workflows/flink.yml index 5bd953c77cf8..b53a8f62c5ec 100644 --- a/.github/workflows/flink.yml +++ b/.github/workflows/flink.yml @@ -60,7 +60,7 @@ jobs: source /opt/rh/gcc-toolset-11/enable sudo dnf install -y patchelf sudo yum install https://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os/Packages/tzdata-2025a-1.el9.noarch.rpm -y - sudo .github/workflows/util/install-flink-resources.sh + sudo 
.github/workflows/util/install-flink-deps.sh git clone -b gluten-0530 https://github.com/bigo-sg/velox4j.git cd velox4j && git reset --hard 889bafcf2fa04e8c31a30edbdf40fe203ef58484 git apply $GITHUB_WORKSPACE/gluten-flink/patches/fix-velox4j.patch diff --git a/.github/workflows/util/install-flink-resources.sh b/.github/workflows/util/install-flink-deps.sh similarity index 100% rename from .github/workflows/util/install-flink-resources.sh rename to .github/workflows/util/install-flink-deps.sh diff --git a/.github/workflows/util/install-resources.sh b/.github/workflows/util/install-spark-deps.sh similarity index 56% rename from .github/workflows/util/install-resources.sh rename to .github/workflows/util/install-spark-deps.sh index fb54080cce4f..71ddc69e2785 100755 --- a/.github/workflows/util/install-resources.sh +++ b/.github/workflows/util/install-spark-deps.sh @@ -18,8 +18,7 @@ # for spark.test.home in mvn test. # # This file can be: -# 1. Executed directly: ./install-resources.sh [install-dir] -# 2. Sourced to use functions: source install-resources.sh; install_hadoop; setup_hdfs +# Sourced to use functions: source install-spark-deps.sh; install_hadoop; setup_hdfs set -e @@ -152,112 +151,3 @@ function setup_minio { mc alias set s3local http://localhost:9100 "$MINIO_ROOT_USER" "$MINIO_ROOT_PASSWORD" mc mb -p s3local/gluten-it || true } - -# Installs Spark binary and source releases with: -# 1 - spark version -# 2 - hadoop version -# 3 - scala version -function install_spark() { - local spark_version="$1" - local hadoop_version="$2" - local scala_version="$3" - local spark_version_short=$(echo "${spark_version}" | cut -d '.' 
-f 1,2 | tr -d '.') - local scala_suffix=$([ "${scala_version}" == '2.13' ] && echo '-scala-2.13' || echo '') - local scala_suffix_short=$([ "${scala_version}" == '2.13' ] && echo '-scala2.13' || echo '') - local mirror_host='https://www.apache.org/dyn/closer.lua/' - local mirror_host2='https://mirror.lyrahosting.com/apache/' # Fallback mirror due to closer.lua slowness - local url_query='?action=download' - local checksum_suffix='sha512' - local url_path="spark/spark-${spark_version}/" - local local_binary="spark-${spark_version}-bin-hadoop${hadoop_version}${scala_suffix_short}.tgz" - local local_binary_checksum="${local_binary}.${checksum_suffix}" - local local_source="spark-${spark_version}.tgz" - local local_source_checksum="${local_source}.${checksum_suffix}" - local remote_binary="${mirror_host2}${url_path}${local_binary}${url_query}" - local remote_binary_checksum="${mirror_host}${url_path}${local_binary_checksum}${url_query}" - local remote_source="${mirror_host2}${url_path}${local_source}${url_query}" - local remote_source_checksum="${mirror_host}${url_path}${local_source_checksum}${url_query}" - local wget_opts="--no-verbose --no-check-certificate" - - wget ${wget_opts} -O "${local_binary}" "${remote_binary}" - wget ${wget_opts} -O "${local_source}" "${remote_source}" - - # Checksum may not have been specified; don't check if doesn't exist - if [ "$(command -v shasum)" ]; then - wget ${wget_opts} -O "${local_binary_checksum}" "${remote_binary_checksum}" - if ! shasum -a 512 -c "${local_binary_checksum}" > /dev/null ; then - echo "Bad checksum from ${remote_binary_checksum}" - rm -f "${local_binary_checksum}" - exit 2 - fi - rm -f "${local_binary_checksum}" - - wget ${wget_opts} -O "${local_source_checksum}" "${remote_source_checksum}" - if ! 
shasum -a 512 -c "${local_source_checksum}" > /dev/null ; then - echo "Bad checksum from ${remote_source_checksum}" - rm -f "${local_source_checksum}" - exit 2 - fi - rm -f "${local_source_checksum}" - else - echo "Skipping checksum because shasum is not installed." 1>&2 - fi - - tar --strip-components=1 -xf "${local_binary}" spark-"${spark_version}"-bin-hadoop"${hadoop_version}""${scala_suffix_short}"/jars/ \ - spark-"${spark_version}"-bin-hadoop"${hadoop_version}""${scala_suffix_short}"/python \ - spark-"${spark_version}"-bin-hadoop"${hadoop_version}""${scala_suffix_short}"/bin - mkdir -p ${INSTALL_DIR}/shims/spark"${spark_version_short}""${scala_suffix}"/spark_home/assembly/target/scala-"${scala_version}" - mv jars ${INSTALL_DIR}/shims/spark"${spark_version_short}""${scala_suffix}"/spark_home/assembly/target/scala-"${scala_version}" - mv python ${INSTALL_DIR}/shims/spark"${spark_version_short}""${scala_suffix}"/spark_home - mv bin ${INSTALL_DIR}/shims/spark"${spark_version_short}""${scala_suffix}"/spark_home - - tar --strip-components=1 -xf "${local_source}" spark-"${spark_version}"/sql/core/src/test/resources/ - mkdir -p shims/spark"${spark_version_short}${scala_suffix}"/spark_home/ - mv sql shims/spark"${spark_version_short}${scala_suffix}"/spark_home/ - - rm -rf "${local_binary}" - rm -rf "${local_source}" -} - -# Only run install_spark when script is executed directly (not sourced) -if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then - INSTALL_DIR=${2:-/opt/} - mkdir -p ${INSTALL_DIR} - - case "$1" in - 3.3) - # Spark-3.3 - cd ${INSTALL_DIR} && \ - install_spark "3.3.1" "3" "2.12" - ;; - 3.4) - # Spark-3.4 - cd ${INSTALL_DIR} && \ - install_spark "3.4.4" "3" "2.12" - ;; - 3.5) - # Spark-3.5 - cd ${INSTALL_DIR} && \ - install_spark "3.5.5" "3" "2.12" - ;; - 3.5-scala2.13) - # Spark-3.5, scala 2.13 - cd ${INSTALL_DIR} && \ - install_spark "3.5.5" "3" "2.13" - ;; - 4.0) - # Spark-4.0, scala 2.12 // using 2.12 as a hack as 4.0 does not have 2.13 suffix - cd 
${INSTALL_DIR} && \ - install_spark "4.0.1" "3" "2.12" - ;; - 4.1) - # Spark-4.x, scala 2.12 // using 2.12 as a hack as 4.0 does not have 2.13 suffix - cd ${INSTALL_DIR} && \ - install_spark "4.1.1" "3" "2.12" - ;; - *) - echo "Spark version is expected to be specified." - exit 1 - ;; - esac -fi diff --git a/.github/workflows/util/install-spark-resources.sh b/.github/workflows/util/install-spark-resources.sh new file mode 100755 index 000000000000..43677e4a5af6 --- /dev/null +++ b/.github/workflows/util/install-spark-resources.sh @@ -0,0 +1,132 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Download Spark resources, required by some Spark UTs. The resource path should be set +# for spark.test.home in mvn test. +# +# This file can be: +# Executed directly: ./install-spark-resources.sh [install-dir] + +set -e + +# Installs Spark binary and source releases with: +# 1 - spark version +# 2 - hadoop version +# 3 - scala version +function install_spark() { + local spark_version="$1" + local hadoop_version="$2" + local scala_version="$3" + local spark_version_short=$(echo "${spark_version}" | cut -d '.' 
-f 1,2 | tr -d '.') + local scala_suffix=$([ "${scala_version}" == '2.13' ] && echo '-scala-2.13' || echo '') + local scala_suffix_short=$([ "${scala_version}" == '2.13' ] && echo '-scala2.13' || echo '') + local mirror_host='https://www.apache.org/dyn/closer.lua/' + local mirror_host2='https://mirror.lyrahosting.com/apache/' # Fallback mirror due to closer.lua slowness + local url_query='?action=download' + local checksum_suffix='sha512' + local url_path="spark/spark-${spark_version}/" + local local_binary="spark-${spark_version}-bin-hadoop${hadoop_version}${scala_suffix_short}.tgz" + local local_binary_checksum="${local_binary}.${checksum_suffix}" + local local_source="spark-${spark_version}.tgz" + local local_source_checksum="${local_source}.${checksum_suffix}" + local remote_binary="${mirror_host2}${url_path}${local_binary}${url_query}" + local remote_binary_checksum="${mirror_host}${url_path}${local_binary_checksum}${url_query}" + local remote_source="${mirror_host2}${url_path}${local_source}${url_query}" + local remote_source_checksum="${mirror_host}${url_path}${local_source_checksum}${url_query}" + local wget_opts="--no-verbose --no-check-certificate" + + wget ${wget_opts} -O "${local_binary}" "${remote_binary}" + wget ${wget_opts} -O "${local_source}" "${remote_source}" + + # Checksum may not have been specified; don't check if doesn't exist + if [ "$(command -v shasum)" ]; then + wget ${wget_opts} -O "${local_binary_checksum}" "${remote_binary_checksum}" + if ! shasum -a 512 -c "${local_binary_checksum}" > /dev/null ; then + echo "Bad checksum from ${remote_binary_checksum}" + rm -f "${local_binary_checksum}" + exit 2 + fi + rm -f "${local_binary_checksum}" + + wget ${wget_opts} -O "${local_source_checksum}" "${remote_source_checksum}" + if ! 
shasum -a 512 -c "${local_source_checksum}" > /dev/null ; then + echo "Bad checksum from ${remote_source_checksum}" + rm -f "${local_source_checksum}" + exit 2 + fi + rm -f "${local_source_checksum}" + else + echo "Skipping checksum because shasum is not installed." 1>&2 + fi + + tar --strip-components=1 -xf "${local_binary}" spark-"${spark_version}"-bin-hadoop"${hadoop_version}""${scala_suffix_short}"/jars/ \ + spark-"${spark_version}"-bin-hadoop"${hadoop_version}""${scala_suffix_short}"/python \ + spark-"${spark_version}"-bin-hadoop"${hadoop_version}""${scala_suffix_short}"/bin + mkdir -p ${INSTALL_DIR}/shims/spark"${spark_version_short}""${scala_suffix}"/spark_home/assembly/target/scala-"${scala_version}" + mv jars ${INSTALL_DIR}/shims/spark"${spark_version_short}""${scala_suffix}"/spark_home/assembly/target/scala-"${scala_version}" + mv python ${INSTALL_DIR}/shims/spark"${spark_version_short}""${scala_suffix}"/spark_home + mv bin ${INSTALL_DIR}/shims/spark"${spark_version_short}""${scala_suffix}"/spark_home + + tar --strip-components=1 -xf "${local_source}" spark-"${spark_version}"/sql/core/src/test/resources/ + mkdir -p shims/spark"${spark_version_short}${scala_suffix}"/spark_home/ + mv sql shims/spark"${spark_version_short}${scala_suffix}"/spark_home/ + + rm -rf "${local_binary}" + rm -rf "${local_source}" +} + +# Only run install_spark when script is executed directly (not sourced) +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + INSTALL_DIR=${2:-/opt/} + mkdir -p ${INSTALL_DIR} + + case "$1" in + 3.3) + # Spark-3.3 + cd ${INSTALL_DIR} && \ + install_spark "3.3.1" "3" "2.12" + ;; + 3.4) + # Spark-3.4 + cd ${INSTALL_DIR} && \ + install_spark "3.4.4" "3" "2.12" + ;; + 3.5) + # Spark-3.5 + cd ${INSTALL_DIR} && \ + install_spark "3.5.5" "3" "2.12" + ;; + 3.5-scala2.13) + # Spark-3.5, scala 2.13 + cd ${INSTALL_DIR} && \ + install_spark "3.5.5" "3" "2.13" + ;; + 4.0) + # Spark-4.0, scala 2.12 // using 2.12 as a hack as 4.0 does not have 2.13 suffix + cd 
${INSTALL_DIR} && \ + install_spark "4.0.1" "3" "2.12" + ;; + 4.1) + # Spark-4.x, scala 2.12 // using 2.12 as a hack as 4.0 does not have 2.13 suffix + cd ${INSTALL_DIR} && \ + install_spark "4.1.1" "3" "2.12" + ;; + *) + echo "Spark version is expected to be specified." + exit 1 + ;; + esac +fi diff --git a/.github/workflows/velox_backend_enhanced.yml b/.github/workflows/velox_backend_enhanced.yml index 183cd7a54a66..9e6306c18e5d 100644 --- a/.github/workflows/velox_backend_enhanced.yml +++ b/.github/workflows/velox_backend_enhanced.yml @@ -231,7 +231,7 @@ jobs: - name: Prepare Spark Resources for Spark 3.5.5 run: | rm -rf /opt/shims/spark35 - bash .github/workflows/util/install-resources.sh 3.5 + bash .github/workflows/util/install-spark-resources.sh 3.5 - name: Build and Run unit test for Spark 3.5.5 (slow tests) run: | cd $GITHUB_WORKSPACE/ @@ -286,7 +286,7 @@ jobs: - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update run: | rm -rf /opt/shims/spark40 - bash .github/workflows/util/install-resources.sh 4.0 + bash .github/workflows/util/install-spark-resources.sh 4.0 mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13 - name: Build and Run unit test for Spark 4.0.0 with scala-2.13 (other tests) run: | diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml index af17ac5ddb4c..09a2295587d5 100644 --- a/.github/workflows/velox_backend_x86.yml +++ b/.github/workflows/velox_backend_x86.yml @@ -19,7 +19,8 @@ on: pull_request: paths: - '.github/workflows/velox_backend_x86.yml' - - '.github/workflows/util/install-resources.sh' #TODO remove after image update + - '.github/workflows/util/install-spark-deps.sh' #TODO remove after image update + - '.github/workflows/util/install-spark-resources.sh' #TODO remove after image update - 'pom.xml' - 'backends-velox/**' - 'gluten-uniffle/**' @@ -180,7 +181,7 @@ jobs: shell: bash run: | export 
JAVA_HOME=/usr/lib/jvm/${{ matrix.java }}-openjdk-amd64 - source .github/workflows/util/install-resources.sh + source .github/workflows/util/install-spark-deps.sh install_hadoop setup_hdfs - name: Install MinIO @@ -188,7 +189,7 @@ jobs: shell: bash run: | export JAVA_HOME=/usr/lib/jvm/${{ matrix.java }}-openjdk-amd64 - source .github/workflows/util/install-resources.sh + source .github/workflows/util/install-spark-deps.sh install_minio - name: Build and run TPC-H / TPC-DS shell: bash @@ -210,7 +211,7 @@ jobs: if [ "${{ matrix.os }}" = "ubuntu:22.04" ] && \ [ "${{ matrix.spark }}" = "spark-3.5" ] && \ [ "${{ matrix.java }}" = "java-8" ]; then - source $GITHUB_WORKSPACE/.github/workflows/util/install-resources.sh + source $GITHUB_WORKSPACE/.github/workflows/util/install-spark-deps.sh SPARK_VERSION=$(echo "${{ matrix.spark }}" | sed 's/spark-//') setup_minio "$SPARK_VERSION" fi @@ -1259,7 +1260,7 @@ jobs: - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update run: | rm -rf /opt/shims/spark40 - bash .github/workflows/util/install-resources.sh 4.0 + bash .github/workflows/util/install-spark-resources.sh 4.0 mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13 - name: Build and Run unit test for Spark 4.0.0 with scala-2.13 (other tests) run: | @@ -1309,7 +1310,7 @@ jobs: - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update run: | rm -rf /opt/shims/spark40 - bash .github/workflows/util/install-resources.sh 4.0 + bash .github/workflows/util/install-spark-resources.sh 4.0 mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13 - name: Build and Run unit test for Spark 4.0 (slow tests) run: | @@ -1367,7 +1368,7 @@ jobs: - name: Prepare Spark Resources for Spark 4.1.0 #TODO remove after image update run: | rm -rf /opt/shims/spark41 - bash .github/workflows/util/install-resources.sh 4.1 + bash 
.github/workflows/util/install-spark-resources.sh 4.1 mv /opt/shims/spark41/spark_home/assembly/target/scala-2.12 /opt/shims/spark41/spark_home/assembly/target/scala-2.13 - name: Build and Run unit test for Spark 4.1.0 with scala-2.13 (other tests) run: | @@ -1417,7 +1418,7 @@ jobs: - name: Prepare Spark Resources for Spark 4.1.0 #TODO remove after image update run: | rm -rf /opt/shims/spark41 - bash .github/workflows/util/install-resources.sh 4.1 + bash .github/workflows/util/install-spark-resources.sh 4.1 mv /opt/shims/spark41/spark_home/assembly/target/scala-2.12 /opt/shims/spark41/spark_home/assembly/target/scala-2.13 - name: Build and Run unit test for Spark 4.0 (slow tests) run: | diff --git a/dev/docker/Dockerfile.centos8-dynamic-build b/dev/docker/Dockerfile.centos8-dynamic-build index 785c6c545b85..faef8d3916b4 100644 --- a/dev/docker/Dockerfile.centos8-dynamic-build +++ b/dev/docker/Dockerfile.centos8-dynamic-build @@ -38,11 +38,11 @@ RUN set -ex; \ wget -nv ${mirror_host}/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz?action=download -O /opt/hadoop-2.8.5.tar.gz; \ git clone --depth=1 https://github.com/apache/gluten /opt/gluten; \ cd /opt/gluten/.github/workflows/util/; \ - ./install-resources.sh 3.3; \ - ./install-resources.sh 3.4; \ - ./install-resources.sh 3.5; \ - ./install-resources.sh 3.5-scala2.13; \ - ./install-resources.sh 4.0; \ + ./install-spark-resources.sh 3.3; \ + ./install-spark-resources.sh 3.4; \ + ./install-spark-resources.sh 3.5; \ + ./install-spark-resources.sh 3.5-scala2.13; \ + ./install-spark-resources.sh 4.0; \ if [ "$(uname -m)" = "aarch64" ]; then \ export CPU_TARGET="aarch64"; \ fi; \ diff --git a/dev/docker/Dockerfile.centos9-dynamic-build b/dev/docker/Dockerfile.centos9-dynamic-build index 25866155cc96..125843217052 100644 --- a/dev/docker/Dockerfile.centos9-dynamic-build +++ b/dev/docker/Dockerfile.centos9-dynamic-build @@ -36,11 +36,11 @@ RUN set -ex; \ wget -nv 
${mirror_host}/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz?action=download -O /opt/hadoop-2.8.5.tar.gz; \ git clone --depth=1 https://github.com/apache/gluten /opt/gluten; \ cd /opt/gluten/.github/workflows/util/; \ - ./install-resources.sh 3.3; \ - ./install-resources.sh 3.4; \ - ./install-resources.sh 3.5; \ - ./install-resources.sh 3.5-scala2.13; \ - ./install-resources.sh 4.0; \ + ./install-spark-resources.sh 3.3; \ + ./install-spark-resources.sh 3.4; \ + ./install-spark-resources.sh 3.5; \ + ./install-spark-resources.sh 3.5-scala2.13; \ + ./install-spark-resources.sh 4.0; \ if [ "$(uname -m)" = "aarch64" ]; then \ export CPU_TARGET="aarch64"; \ fi; \ From 2b10ebcc57c07303455277c7688aa52ae668db8f Mon Sep 17 00:00:00 2001 From: Yao-MR Date: Thu, 16 Apr 2026 20:17:15 +0800 Subject: [PATCH 2/4] Refactor: unify architecture variables and simplify Spark resource preparation steps --- .../workflows/util/install-spark-resources.sh | 2 ++ .github/workflows/velox_backend_enhanced.yml | 9 -------- .github/workflows/velox_backend_x86.yml | 22 ------------------- dev/docker/Dockerfile.centos8-dynamic-build | 5 +++-- dev/docker/Dockerfile.centos9-dynamic-build | 5 +++-- 5 files changed, 8 insertions(+), 35 deletions(-) diff --git a/.github/workflows/util/install-spark-resources.sh b/.github/workflows/util/install-spark-resources.sh index 43677e4a5af6..31defd69b845 100755 --- a/.github/workflows/util/install-spark-resources.sh +++ b/.github/workflows/util/install-spark-resources.sh @@ -118,11 +118,13 @@ if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then # Spark-4.0, scala 2.12 // using 2.12 as a hack as 4.0 does not have 2.13 suffix cd ${INSTALL_DIR} && \ install_spark "4.0.1" "3" "2.12" + mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13 ;; 4.1) # Spark-4.x, scala 2.12 // using 2.12 as a hack as 4.0 does not have 2.13 suffix cd ${INSTALL_DIR} && \ install_spark "4.1.1" "3" "2.12" + mv 
/opt/shims/spark41/spark_home/assembly/target/scala-2.12 /opt/shims/spark41/spark_home/assembly/target/scala-2.13 ;; *) echo "Spark version is expected to be specified." diff --git a/.github/workflows/velox_backend_enhanced.yml b/.github/workflows/velox_backend_enhanced.yml index 9e6306c18e5d..fbb112007db7 100644 --- a/.github/workflows/velox_backend_enhanced.yml +++ b/.github/workflows/velox_backend_enhanced.yml @@ -228,10 +228,6 @@ jobs: with: name: arrow-jars-enhanced-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ - - name: Prepare Spark Resources for Spark 3.5.5 - run: | - rm -rf /opt/shims/spark35 - bash .github/workflows/util/install-spark-resources.sh 3.5 - name: Build and Run unit test for Spark 3.5.5 (slow tests) run: | cd $GITHUB_WORKSPACE/ @@ -283,11 +279,6 @@ jobs: pip3 install setuptools==77.0.3 && \ pip3 install pyspark==3.5.5 cython && \ pip3 install pandas==2.2.3 pyarrow==20.0.0 - - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update - run: | - rm -rf /opt/shims/spark40 - bash .github/workflows/util/install-spark-resources.sh 4.0 - mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13 - name: Build and Run unit test for Spark 4.0.0 with scala-2.13 (other tests) run: | cd $GITHUB_WORKSPACE/ diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml index 09a2295587d5..3942fe49d4a9 100644 --- a/.github/workflows/velox_backend_x86.yml +++ b/.github/workflows/velox_backend_x86.yml @@ -19,8 +19,6 @@ on: pull_request: paths: - '.github/workflows/velox_backend_x86.yml' - - '.github/workflows/util/install-spark-deps.sh' #TODO remove after image update - - '.github/workflows/util/install-spark-resources.sh' #TODO remove after image update - 'pom.xml' - 'backends-velox/**' - 'gluten-uniffle/**' @@ -1257,11 +1255,6 @@ jobs: pip3 install setuptools==77.0.3 && \ pip3 install pyspark==3.5.5 cython && \ pip3 install 
pandas==2.2.3 pyarrow==20.0.0 - - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update - run: | - rm -rf /opt/shims/spark40 - bash .github/workflows/util/install-spark-resources.sh 4.0 - mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13 - name: Build and Run unit test for Spark 4.0.0 with scala-2.13 (other tests) run: | cd $GITHUB_WORKSPACE/ @@ -1307,11 +1300,6 @@ jobs: with: name: arrow-jars-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ - - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update - run: | - rm -rf /opt/shims/spark40 - bash .github/workflows/util/install-spark-resources.sh 4.0 - mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13 - name: Build and Run unit test for Spark 4.0 (slow tests) run: | cd $GITHUB_WORKSPACE/ @@ -1365,11 +1353,6 @@ jobs: pip3 install setuptools==77.0.3 && \ pip3 install pyspark==3.5.5 cython && \ pip3 install pandas==2.2.3 pyarrow==20.0.0 - - name: Prepare Spark Resources for Spark 4.1.0 #TODO remove after image update - run: | - rm -rf /opt/shims/spark41 - bash .github/workflows/util/install-spark-resources.sh 4.1 - mv /opt/shims/spark41/spark_home/assembly/target/scala-2.12 /opt/shims/spark41/spark_home/assembly/target/scala-2.13 - name: Build and Run unit test for Spark 4.1.0 with scala-2.13 (other tests) run: | cd $GITHUB_WORKSPACE/ @@ -1415,11 +1398,6 @@ jobs: with: name: arrow-jars-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ - - name: Prepare Spark Resources for Spark 4.1.0 #TODO remove after image update - run: | - rm -rf /opt/shims/spark41 - bash .github/workflows/util/install-spark-resources.sh 4.1 - mv /opt/shims/spark41/spark_home/assembly/target/scala-2.12 /opt/shims/spark41/spark_home/assembly/target/scala-2.13 - name: Build and Run unit test for Spark 4.0 (slow tests) run: | cd 
$GITHUB_WORKSPACE/ diff --git a/dev/docker/Dockerfile.centos8-dynamic-build b/dev/docker/Dockerfile.centos8-dynamic-build index faef8d3916b4..b87e234bab23 100644 --- a/dev/docker/Dockerfile.centos8-dynamic-build +++ b/dev/docker/Dockerfile.centos8-dynamic-build @@ -43,8 +43,9 @@ RUN set -ex; \ ./install-spark-resources.sh 3.5; \ ./install-spark-resources.sh 3.5-scala2.13; \ ./install-spark-resources.sh 4.0; \ - if [ "$(uname -m)" = "aarch64" ]; then \ - export CPU_TARGET="aarch64"; \ + ARCH=$(uname -m); \ + if [[ "$ARCH" == "aarch64" || "$ARCH" == "ppc64le" ]]; then \ + export CPU_TARGET="$ARCH"; \ fi; \ cd /opt/gluten; \ source /opt/rh/gcc-toolset-11/enable; \ diff --git a/dev/docker/Dockerfile.centos9-dynamic-build b/dev/docker/Dockerfile.centos9-dynamic-build index 125843217052..2034bf01e77f 100644 --- a/dev/docker/Dockerfile.centos9-dynamic-build +++ b/dev/docker/Dockerfile.centos9-dynamic-build @@ -41,8 +41,9 @@ RUN set -ex; \ ./install-spark-resources.sh 3.5; \ ./install-spark-resources.sh 3.5-scala2.13; \ ./install-spark-resources.sh 4.0; \ - if [ "$(uname -m)" = "aarch64" ]; then \ - export CPU_TARGET="aarch64"; \ + ARCH=$(uname -m); \ + if [[ "$ARCH" == "aarch64" || "$ARCH" == "ppc64le" ]]; then \ + export CPU_TARGET="$ARCH"; \ fi; \ cd /opt/gluten; \ source /opt/rh/gcc-toolset-12/enable; \ From 8220eff4c1ed05849438c6a9065cb25448164e27 Mon Sep 17 00:00:00 2001 From: Yao-MR Date: Fri, 17 Apr 2026 17:32:45 +0800 Subject: [PATCH 3/4] Refactor: unify architecture variables and simplify Spark resource preparation steps --- .github/workflows/velox_backend_enhanced.yml | 5 +++++ .github/workflows/velox_backend_x86.yml | 22 ++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/.github/workflows/velox_backend_enhanced.yml b/.github/workflows/velox_backend_enhanced.yml index fbb112007db7..7f37ceebc054 100644 --- a/.github/workflows/velox_backend_enhanced.yml +++ b/.github/workflows/velox_backend_enhanced.yml @@ -279,6 +279,11 @@ jobs: pip3 install 
setuptools==77.0.3 && \ pip3 install pyspark==3.5.5 cython && \ pip3 install pandas==2.2.3 pyarrow==20.0.0 + - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update + run: | + rm -rf /opt/shims/spark40 + bash .github/workflows/util/install-spark-resources.sh 4.0 + mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home - name: Build and Run unit test for Spark 4.0.0 with scala-2.13 (other tests) run: | cd $GITHUB_WORKSPACE/ diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml index 3942fe49d4a9..09a2295587d5 100644 --- a/.github/workflows/velox_backend_x86.yml +++ b/.github/workflows/velox_backend_x86.yml @@ -19,6 +19,8 @@ on: pull_request: paths: - '.github/workflows/velox_backend_x86.yml' + - '.github/workflows/util/install-spark-deps.sh' #TODO remove after image update + - '.github/workflows/util/install-spark-resources.sh' #TODO remove after image update - 'pom.xml' - 'backends-velox/**' - 'gluten-uniffle/**' @@ -1255,6 +1257,11 @@ jobs: pip3 install setuptools==77.0.3 && \ pip3 install pyspark==3.5.5 cython && \ pip3 install pandas==2.2.3 pyarrow==20.0.0 + - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update + run: | + rm -rf /opt/shims/spark40 + bash .github/workflows/util/install-spark-resources.sh 4.0 + mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13 - name: Build and Run unit test for Spark 4.0.0 with scala-2.13 (other tests) run: | cd $GITHUB_WORKSPACE/ @@ -1300,6 +1307,11 @@ jobs: with: name: arrow-jars-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ + - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update + run: | + rm -rf /opt/shims/spark40 + bash .github/workflows/util/install-spark-resources.sh 4.0 + mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 
/opt/shims/spark40/spark_home/assembly/target/scala-2.13 - name: Build and Run unit test for Spark 4.0 (slow tests) run: | cd $GITHUB_WORKSPACE/ @@ -1353,6 +1365,11 @@ jobs: pip3 install setuptools==77.0.3 && \ pip3 install pyspark==3.5.5 cython && \ pip3 install pandas==2.2.3 pyarrow==20.0.0 + - name: Prepare Spark Resources for Spark 4.1.0 #TODO remove after image update + run: | + rm -rf /opt/shims/spark41 + bash .github/workflows/util/install-spark-resources.sh 4.1 + mv /opt/shims/spark41/spark_home/assembly/target/scala-2.12 /opt/shims/spark41/spark_home/assembly/target/scala-2.13 - name: Build and Run unit test for Spark 4.1.0 with scala-2.13 (other tests) run: | cd $GITHUB_WORKSPACE/ @@ -1398,6 +1415,11 @@ jobs: with: name: arrow-jars-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ + - name: Prepare Spark Resources for Spark 4.1.0 #TODO remove after image update + run: | + rm -rf /opt/shims/spark41 + bash .github/workflows/util/install-spark-resources.sh 4.1 + mv /opt/shims/spark41/spark_home/assembly/target/scala-2.12 /opt/shims/spark41/spark_home/assembly/target/scala-2.13 - name: Build and Run unit test for Spark 4.0 (slow tests) run: | cd $GITHUB_WORKSPACE/ From 471c1ab1c08c224744186627b76bb0b6e1397a35 Mon Sep 17 00:00:00 2001 From: Yao-MR Date: Mon, 20 Apr 2026 10:21:01 +0800 Subject: [PATCH 4/4] Refactor: remove invalid mv step --- .github/workflows/velox_backend_enhanced.yml | 1 - .github/workflows/velox_backend_x86.yml | 2 -- 2 files changed, 3 deletions(-) diff --git a/.github/workflows/velox_backend_enhanced.yml b/.github/workflows/velox_backend_enhanced.yml index 7f37ceebc054..e6406fa38bda 100644 --- a/.github/workflows/velox_backend_enhanced.yml +++ b/.github/workflows/velox_backend_enhanced.yml @@ -283,7 +283,6 @@ jobs: run: | rm -rf /opt/shims/spark40 bash .github/workflows/util/install-spark-resources.sh 4.0 - mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home - name: Build and Run 
unit test for Spark 4.0.0 with scala-2.13 (other tests) run: | cd $GITHUB_WORKSPACE/ diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml index 09a2295587d5..607cbc7b4f97 100644 --- a/.github/workflows/velox_backend_x86.yml +++ b/.github/workflows/velox_backend_x86.yml @@ -1261,7 +1261,6 @@ jobs: run: | rm -rf /opt/shims/spark40 bash .github/workflows/util/install-spark-resources.sh 4.0 - mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13 - name: Build and Run unit test for Spark 4.0.0 with scala-2.13 (other tests) run: | cd $GITHUB_WORKSPACE/ @@ -1311,7 +1310,6 @@ jobs: run: | rm -rf /opt/shims/spark40 bash .github/workflows/util/install-spark-resources.sh 4.0 - mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13 - name: Build and Run unit test for Spark 4.0 (slow tests) run: | cd $GITHUB_WORKSPACE/