Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
387 changes: 194 additions & 193 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,8 @@ jobs:
echo "> Build Docker Image with tag: ${{ env.IMAGE_TAG }}-ainic"
start_time=$(date +%s)
mkdir -p $GITHUB_WORKSPACE/.github/workflows/docker/ainic
cp /apps/tas/0_public/primus_docker_ci/ainic/ainic_bundle_1.117.5-a-56.tar.gz $GITHUB_WORKSPACE/.github/workflows/docker/ainic/ || { echo "Error: Failed to copy ainic bundle"; exit 1; }
# cp /apps/tas/0_public/primus_docker_ci/ainic/ainic_bundle_1.117.5-a-56.tar.gz $GITHUB_WORKSPACE/.github/workflows/docker/ainic/ || { echo "Error: Failed to copy ainic bundle"; exit 1; }
cp /apps/tas/0_public/primus_docker_ci/ainic/ainic_bundle_1.117.5-a-58.tar.gz $GITHUB_WORKSPACE/.github/workflows/docker/ainic/ || { echo "Error: Failed to copy ainic bundle"; exit 1; }
Comment on lines 116 to +118
Copy link

Copilot AI Mar 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This step now has both a commented-out copy of the previous bundle and a new hardcoded bundle filename. To reduce churn and avoid stale comments, consider defining the bundle name once (e.g., in env:) and using it in the cp command, and remove the commented-out cp line once the migration is confirmed.

Copilot uses AI. Check for mistakes.
docker build -f $GITHUB_WORKSPACE/.github/workflows/docker/Dockerfile.ainic \
--network=host \
-t tasimage/primus:${{env.IMAGE_TAG}}-ainic \
Expand Down Expand Up @@ -194,196 +195,196 @@ jobs:
# docker rmi tasimage/primus:${{env.IMAGE_TAG}}-jax
echo "> build-docker success"

# Torch unit-test job. Runs after code-lint: checks out and installs
# Primus-Turbo, then runs the Primus CLI shell tests, the core unit tests and
# the Megatron-LM / TorchTitan trainer tests.
run-unittest-torch:
  env:
    PRIMUS_WORKDIR: /mnt/apps_proxy/tas/0_public/primus_ci/actions-runner-torch
    # PRIMUS_WORKDIR: /wekafs/primus-data/primus_safe_ci/torch
  needs: [code-lint]
  # runs-on: [primus-lm-cicd-torch-j8knc]
  runs-on: [primus-lm-cicd-torch-tas8n-a16-40]
  steps:
    - run: echo "🎉 Begin Primus-Turbo Checkout."
    - name: Set commit hash to env
      # Re-exports PRIMUS_TURBO_COMMIT through GITHUB_ENV so the `env` context
      # used by the checkout step below can read it.
      # NOTE(review): the variable is defined outside this job — confirm where.
      run: echo "PRIMUS_TURBO_COMMIT=${PRIMUS_TURBO_COMMIT}" >> "$GITHUB_ENV"
    - name: Checkout Repo Primus-Turbo
      uses: actions/checkout@v4
      with:
        repository: AMD-AIG-AIMA/Primus-Turbo
        submodules: "recursive"
        path: Primus-Turbo
        ref: ${{ env.PRIMUS_TURBO_COMMIT }}
    - run: echo "Begin Primus-Turbo Install."
    - name: Install Primus-Turbo
      run: |
        # Drop any copy left behind by an earlier run before moving the fresh checkout in.
        rm -rf /tmp/Primus-Turbo || true
        mv Primus-Turbo /tmp/
        echo "Primus-Turbo dir: /tmp/Primus-Turbo"
        git config --global --add safe.directory /tmp/Primus-Turbo || true
        # Fail the step if the directory is missing instead of silently
        # installing from whatever the current directory happens to be.
        cd /tmp/Primus-Turbo
        start_time=$(date +%s)
        echo "✅ [Pip install requirements] started at: $(date)"
        mkdir -p "${PRIMUS_WORKDIR}/primus-cache"
        MAX_JOBS=128 pip install --cache-dir="${PRIMUS_WORKDIR}/primus-cache" --no-build-isolation --no-clean -r requirements.txt
        end_time=$(date +%s)
        elapsed=$((end_time - start_time))
        echo "✅ [Pip install requirements] ended at: $(date)"
        echo "⏱️ [Pip install requirements] Total elapsed time: ${elapsed} seconds"
        start_time=$(date +%s)
        echo "✅ [build primus-turbo] started at: $(date)"
        pip3 install --no-build-isolation -e . -v
        end_time=$(date +%s)
        elapsed=$((end_time - start_time))
        echo "✅ [build primus-turbo] ended at: $(date)"
        echo "⏱️ [build primus-turbo] Total elapsed time: ${elapsed} seconds"
    - run: echo "🎉 Begin Primus Unit Test."
    - uses: actions/checkout@v4
      with:
        submodules: recursive
    - name: Show Environment Info
      run: |
        echo "Hostname: $(hostname)"
        echo "PWD: $(pwd)"
        echo "HOME: $HOME"
        echo "GITHUB_WORKSPACE: $GITHUB_WORKSPACE"
        echo "Runner Temp Dir: $RUNNER_TEMP"
        echo "Runner Tool Cache: $RUNNER_TOOL_CACHE"
    - name: Install Primus
      run: |
        pip install -r requirements.txt
    - name: Set UT_LOG_PATH
      env:
        # Passed through the environment rather than interpolated into the
        # script text, so event data can never be parsed as shell code.
        PR_NUMBER: ${{ github.event.pull_request.number }}
      run: |
        # Builds <PRIMUS_WORKDIR>/ut_out/<label>-<timestamp>-<short sha>, where the
        # label depends on the triggering event. GITHUB_EVENT_NAME, GITHUB_REF and
        # GITHUB_SHA are default variables provided by the runner.
        ts="$(date +%Y%m%d-%H%M%S)"
        commit_id="${GITHUB_SHA::7}"
        if [[ "${GITHUB_EVENT_NAME}" == "pull_request" ]]; then
          run_label="pr-${PR_NUMBER}"
        elif [[ "${GITHUB_EVENT_NAME}" == "push" && "${GITHUB_REF}" == "refs/heads/main" ]]; then
          run_label="main"
        elif [[ "${GITHUB_EVENT_NAME}" == "release" ]]; then
          # Strip the refs/tags/ prefix to get the bare tag name.
          run_label="${GITHUB_REF#refs/tags/}"
        else
          run_label="others"
        fi
        echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/${run_label}-${ts}-${commit_id}" >> "$GITHUB_ENV"
    - name: Run CLI Shell Tests
      run: |
        echo "Running Primus CLI shell tests..."
        bash ./tests/runner/run_all_tests.sh
    - name: Run Primus Core Tests
      run: |
        echo "Running Primus Core tests..."
        # Note: The tests `test_fp8_te_linear` and `test_te_linear` are temporarily skipped due to intermittent failures.
        # Note HSA_NO_SCRATCH_RECLAIM=1 must be set to avoid RCCL perf hit (TAS-8N Node), rocm ver:70125424
        export HSA_NO_SCRATCH_RECLAIM=1
        pytest --maxfail=1 -s ./tests/unit_tests/ \
          --deselect=tests/unit_tests/megatron/cco/test_tp_overlap.py::TPOverlapTestCase::test_fp8_te_linear \
          --deselect=tests/unit_tests/megatron/cco/test_tp_overlap.py::TPOverlapTestCase::test_te_linear
    - name: Run Primus Model Tests -- Megatron-LM
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        # UT_LOG_PATH was written to GITHUB_ENV by the "Set UT_LOG_PATH" step,
        # so it is available here as an ordinary shell variable.
        echo "Set UT_LOG_PATH: ${UT_LOG_PATH}"
        rm -rf "${UT_LOG_PATH}"
        mkdir -p "${UT_LOG_PATH}"
        # MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \
        MASTER_PORT=10009 DATA_PATH=/mnt/apps_proxy/tas/0_public/data HSA_NO_SCRATCH_RECLAIM=1 \
          pytest --maxfail=1 -s ./tests/trainer/test_megatron_trainer.py
    - name: Run Primus Model Tests -- TorchTitan
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        echo "Set UT_LOG_PATH: ${UT_LOG_PATH}"
        # NOTE(review): this removes the same directory the Megatron-LM step
        # just used, so anything that step wrote there is lost — confirm intended.
        rm -rf "${UT_LOG_PATH}"
        mkdir -p "${UT_LOG_PATH}"
        # MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \
        MASTER_PORT=10009 DATA_PATH=/mnt/apps_proxy/tas/0_public/data HSA_NO_SCRATCH_RECLAIM=1 \
          pytest --maxfail=1 -s ./tests/trainer/test_torchtitan_trainer.py
    - name: Clean
      # Run the cleanup even when an earlier step failed.
      if: always()
      run: |
        rm -rf "${PRIMUS_WORKDIR}/Primus-Turbo"
        rm -rf "${PRIMUS_WORKDIR}/Primus"
# run-unittest-torch:
# env:
# PRIMUS_WORKDIR: /mnt/apps_proxy/tas/0_public/primus_ci/actions-runner-torch
# # PRIMUS_WORKDIR: /wekafs/primus-data/primus_safe_ci/torch
# needs: [code-lint]
Comment on lines +198 to +202
Copy link

Copilot AI Mar 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The run-unittest-torch job definition was removed (now fully commented out), so this workflow no longer runs any unit tests (the only remaining job besides build-docker is code-lint). If the intent is to temporarily disable UTs, please gate it behind a condition/input (e.g., workflow_dispatch input or an env flag) and add an inline rationale + tracking issue; otherwise restore the job so PRs still get test signal.

Copilot uses AI. Check for mistakes.
# # runs-on: [primus-lm-cicd-torch-j8knc]
# runs-on: [primus-lm-cicd-torch-tas8n-a16-40]
# steps:
# - run: echo "🎉 Begin Primus-Turbo Checkout."
# - name: Set commit hash to env
# run: echo "PRIMUS_TURBO_COMMIT=${PRIMUS_TURBO_COMMIT}" >> $GITHUB_ENV
# - name: Checkout Repo Primus-Turbo
# uses: actions/checkout@v4
# with:
# repository: AMD-AIG-AIMA/Primus-Turbo
# submodules: "recursive"
# path: Primus-Turbo
# ref: ${{ env.PRIMUS_TURBO_COMMIT }}
# - run: echo "Begin Primus-Turbo Install."
# - name: Install Primus-Turbo
# run: |
# rm -rf /tmp/Primus-Turbo || true
# mv Primus-Turbo /tmp/
# echo "Primus-Turbo dir: /tmp/Primus-Turbo"
# git config --global --add safe.directory /tmp/Primus-Turbo || true
# cd /tmp/Primus-Turbo || true
# start_time=$(date +%s)
# echo "✅ [Pip install requirements] started at: $(date)"
# mkdir -p ${PRIMUS_WORKDIR}/primus-cache
# MAX_JOBS=128 pip install --cache-dir=${PRIMUS_WORKDIR}/primus-cache --no-build-isolation --no-clean -r requirements.txt
# end_time=$(date +%s)
# elapsed=$((end_time - start_time))
# echo "✅ [Pip install requirements] ended at: $(date)"
# echo "⏱️ [Pip install requirements] Total elapsed time: ${elapsed} seconds"
# start_time=$(date +%s)
# echo "✅ [build primus-turbo] started at: $(date)"
# pip3 install --no-build-isolation -e . -v
# end_time=$(date +%s)
# elapsed=$((end_time - start_time))
# echo "✅ [build primus-turbo] ended at: $(date)"
# echo "⏱️ [build primus-turbo] Total elapsed time: ${elapsed} seconds"
# - run: echo "🎉 Begin Primus Unit Test."
# - uses: actions/checkout@v4
# with:
# submodules: recursive
# - name: Show Environment Info
# run: |
# echo "Hostname: $(hostname)"
# echo "PWD: $(pwd)"
# echo "HOME: $HOME"
# echo "GITHUB_WORKSPACE: $GITHUB_WORKSPACE"
# echo "Runner Temp Dir: $RUNNER_TEMP"
# echo "Runner Tool Cache: $RUNNER_TOOL_CACHE"
# - name: Install Primus
# run: |
# pip install -r requirements.txt
# - name: Set UT_LOG_PATH
# run: |
# ts="$(date +%Y%m%d-%H%M%S)"
# commit_id="${GITHUB_SHA::7}"
# if [[ "${{ github.event_name }}" == "pull_request" ]]; then
# echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/pr-${{ github.event.pull_request.number }}-${ts}-${commit_id}" >> $GITHUB_ENV
# elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]]; then
# echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/main-${ts}-${commit_id}" >> $GITHUB_ENV
# elif [[ "${{ github.event_name }}" == "release" ]]; then
# TAG_NAME="${{ github.ref }}"
# TAG="${TAG_NAME#refs/tags/}"
# echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/${TAG}-${ts}-${commit_id}" >> $GITHUB_ENV
# else
# echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/others-${ts}-${commit_id}" >> $GITHUB_ENV
# fi
# - name: Run CLI Shell Tests
# run: |
# echo "Running Primus CLI shell tests..."
# bash ./tests/runner/run_all_tests.sh
# - name: Run Primus Core Tests
# run: |
# echo "Running Primus Core tests..."
# # Note: The tests `test_fp8_te_linear` and `test_te_linear` are temporarily skipped due to intermittent failures.
# # Note HSA_NO_SCRATCH_RECLAIM=1 must be set to avoid RCCL perf hit (TAS-8N Node), rocm ver:70125424
# export HSA_NO_SCRATCH_RECLAIM=1
# pytest --maxfail=1 -s ./tests/unit_tests/ \
# --deselect=tests/unit_tests/megatron/cco/test_tp_overlap.py::TPOverlapTestCase::test_fp8_te_linear \
# --deselect=tests/unit_tests/megatron/cco/test_tp_overlap.py::TPOverlapTestCase::test_te_linear
# - name: Run Primus Model Tests -- Megatron-LM
# env:
# HF_TOKEN: ${{secrets.HF_TOKEN}}
# run: |
# echo "Set UT_LOG_PATH: ${{ env.UT_LOG_PATH }}"
# rm -rf "${{ env.UT_LOG_PATH }}"
# mkdir -p "${{ env.UT_LOG_PATH }}"
# # MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \
# MASTER_PORT=10009 DATA_PATH=/mnt/apps_proxy/tas/0_public/data HSA_NO_SCRATCH_RECLAIM=1 \
# pytest --maxfail=1 -s ./tests/trainer/test_megatron_trainer.py
# - name: Run Primus Model Tests -- TorchTitan
# env:
# HF_TOKEN: ${{secrets.HF_TOKEN}}
# run: |
# echo "Set UT_LOG_PATH: ${{ env.UT_LOG_PATH }}"
# rm -rf "${{ env.UT_LOG_PATH }}"
# mkdir -p "${{ env.UT_LOG_PATH }}"
# # MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \
# MASTER_PORT=10009 DATA_PATH=/mnt/apps_proxy/tas/0_public/data HSA_NO_SCRATCH_RECLAIM=1 \
# pytest --maxfail=1 -s ./tests/trainer/test_torchtitan_trainer.py
# - name: Clean
# run: |
# rm -rf ${PRIMUS_WORKDIR}/Primus-Turbo
# rm -rf ${PRIMUS_WORKDIR}/Primus

# JAX unit-test job. Runs after code-lint: checks out Primus-Turbo (the build
# itself is skipped, see the install step), then runs the Primus CLI shell
# tests and the JAX unit-test driver.
run-unittest-jax:
  env:
    PRIMUS_WORKDIR: /wekafs/primus-data/primus_safe_ci/jax
  needs: [code-lint]
  runs-on: [primus-lm-cicd-jax-m42vb]
  steps:
    - run: echo "🎉 Begin Primus-Turbo Checkout."
    - name: Set commit hash to env
      # Re-exports PRIMUS_TURBO_COMMIT through GITHUB_ENV so the `env` context
      # used by the checkout step below can read it.
      # NOTE(review): the variable is defined outside this job — confirm where.
      run: echo "PRIMUS_TURBO_COMMIT=${PRIMUS_TURBO_COMMIT}" >> "$GITHUB_ENV"
    - name: Checkout Repo Primus-Turbo
      uses: actions/checkout@v4
      with:
        repository: AMD-AIG-AIMA/Primus-Turbo
        submodules: "recursive"
        path: Primus-Turbo
        ref: ${{ env.PRIMUS_TURBO_COMMIT }}
    - run: echo "Begin Primus-Turbo Install."
    - name: Install Primus-Turbo
      run: |
        # Remove any copy left by a previous run; otherwise the mv below fails
        # when /tmp/Primus-Turbo already exists and is not empty.
        rm -rf /tmp/Primus-Turbo
        mv Primus-Turbo /tmp/
        echo "Primus-Turbo dir: /tmp/Primus-Turbo"
        git config --global --add safe.directory /tmp/Primus-Turbo
        cd /tmp/Primus-Turbo
        start_time=$(date +%s)
        echo "✅ [Pip install requirements] started at: $(date)"
        mkdir -p "${PRIMUS_WORKDIR}/primus-cache"
        python3 -m pip install --upgrade pip setuptools
        end_time=$(date +%s)
        elapsed=$((end_time - start_time))
        echo "✅ [Pip install requirements] ended at: $(date)"
        echo "⏱️ [Pip install requirements] Total elapsed time: ${elapsed} seconds"
        # No build command runs between the two timestamps below: the turbo
        # build is intentionally skipped, as the final message states.
        start_time=$(date +%s)
        echo "✅ [build primus-turbo] started at: $(date)"
        end_time=$(date +%s)
        elapsed=$((end_time - start_time))
        echo "✅ [build primus-turbo] ended at: $(date)"
        echo "⏱️ [build primus-turbo] Torch installation causes segfault, so we skip it and actually not install turbo. Total elapsed time: ${elapsed} seconds"
    - run: echo "🎉 Begin Primus Unit Test."
    - uses: actions/checkout@v4
      with:
        submodules: recursive
    - name: Show Environment Info
      run: |
        echo "Hostname: $(hostname)"
        echo "PWD: $(pwd)"
        echo "HOME: $HOME"
        echo "GITHUB_WORKSPACE: $GITHUB_WORKSPACE"
        echo "Runner Temp Dir: $RUNNER_TEMP"
        echo "Runner Tool Cache: $RUNNER_TOOL_CACHE"
    - name: Install Primus
      run: |
        pip install -r requirements-jax.txt
    - name: Set UT_LOG_PATH
      env:
        # Passed through the environment rather than interpolated into the
        # script text, so event data can never be parsed as shell code.
        PR_NUMBER: ${{ github.event.pull_request.number }}
      run: |
        # Builds <PRIMUS_WORKDIR>/ut_out/<label>-<timestamp>-<short sha>, where the
        # label depends on the triggering event. GITHUB_EVENT_NAME, GITHUB_REF and
        # GITHUB_SHA are default variables provided by the runner.
        ts="$(date +%Y%m%d-%H%M%S)"
        commit_id="${GITHUB_SHA::7}"
        if [[ "${GITHUB_EVENT_NAME}" == "pull_request" ]]; then
          run_label="pr-${PR_NUMBER}"
        elif [[ "${GITHUB_EVENT_NAME}" == "push" && "${GITHUB_REF}" == "refs/heads/main" ]]; then
          run_label="main"
        elif [[ "${GITHUB_EVENT_NAME}" == "release" ]]; then
          # Strip the refs/tags/ prefix to get the bare tag name.
          run_label="${GITHUB_REF#refs/tags/}"
        else
          run_label="others"
        fi
        echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/${run_label}-${ts}-${commit_id}" >> "$GITHUB_ENV"
    - name: Run Shell Tests
      run: |
        echo "Running Primus CLI shell tests..."
        bash ./tests/runner/run_all_tests.sh
    - name: Run Unit Tests
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        # UT_LOG_PATH was written to GITHUB_ENV by the "Set UT_LOG_PATH" step,
        # so it is available here as an ordinary shell variable.
        echo "Set UT_LOG_PATH: ${UT_LOG_PATH}"
        rm -rf "${UT_LOG_PATH}"
        mkdir -p "${UT_LOG_PATH}"
        MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \
          JAX_SKIP_UT=1 python ./tests/run_unit_tests.py --jax
    - name: Clean
      # Run the cleanup even when an earlier step failed.
      if: always()
      run: |
        rm -rf "${PRIMUS_WORKDIR}/Primus-Turbo"
        rm -rf "${PRIMUS_WORKDIR}/Primus"
# run-unittest-jax:
# env:
# PRIMUS_WORKDIR: /wekafs/primus-data/primus_safe_ci/jax
# needs: [code-lint]
# runs-on: [primus-lm-cicd-jax-m42vb]
Comment on lines +307 to +311
Copy link

Copilot AI Mar 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The run-unittest-jax job is also removed/commented out, which eliminates JAX test coverage from CI entirely. If this is intentional, add a clear comment explaining why and a plan/issue to re-enable; otherwise restore the job (or at least keep a minimal smoke/unit-test subset) to prevent regressions landing unnoticed.

Copilot uses AI. Check for mistakes.
# steps:
# - run: echo "🎉 Begin Primus-Turbo Checkout."
# - name: Set commit hash to env
# run: echo "PRIMUS_TURBO_COMMIT=${PRIMUS_TURBO_COMMIT}" >> $GITHUB_ENV
# - name: Checkout Repo Primus-Turbo
# uses: actions/checkout@v4
# with:
# repository: AMD-AIG-AIMA/Primus-Turbo
# submodules: "recursive"
# path: Primus-Turbo
# ref: ${{ env.PRIMUS_TURBO_COMMIT }}
# - run: echo "Begin Primus-Turbo Install."
# - name: Install Primus-Turbo
# run: |
# mv Primus-Turbo /tmp/
# echo "Primus-Turbo dir: /tmp/Primus-Turbo"
# git config --global --add safe.directory /tmp/Primus-Turbo
# cd /tmp/Primus-Turbo
# start_time=$(date +%s)
# echo "✅ [Pip install requirements] started at: $(date)"
# mkdir -p ${PRIMUS_WORKDIR}/primus-cache
# python3 -m pip install --upgrade pip setuptools
# end_time=$(date +%s)
# elapsed=$((end_time - start_time))
# echo "✅ [Pip install requirements] ended at: $(date)"
# echo "⏱️ [Pip install requirements] Total elapsed time: ${elapsed} seconds"
# start_time=$(date +%s)
# echo "✅ [build primus-turbo] started at: $(date)"
# end_time=$(date +%s)
# elapsed=$((end_time - start_time))
# echo "✅ [build primus-turbo] ended at: $(date)"
# echo "⏱️ [build primus-turbo] Torch installation causes segfault, so we skip it and actually not install turbo. Total elapsed time: ${elapsed} seconds"
# - run: echo "🎉 Begin Primus Unit Test."
# - uses: actions/checkout@v4
# with:
# submodules: recursive
# - name: Show Environment Info
# run: |
# echo "Hostname: $(hostname)"
# echo "PWD: $(pwd)"
# echo "HOME: $HOME"
# echo "GITHUB_WORKSPACE: $GITHUB_WORKSPACE"
# echo "Runner Temp Dir: $RUNNER_TEMP"
# echo "Runner Tool Cache: $RUNNER_TOOL_CACHE"
# - name: Install Primus
# run: |
# pip install -r requirements-jax.txt
# - name: Set UT_LOG_PATH
# run: |
# ts="$(date +%Y%m%d-%H%M%S)"
# commit_id="${GITHUB_SHA::7}"
# if [[ "${{ github.event_name }}" == "pull_request" ]]; then
# echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/pr-${{ github.event.pull_request.number }}-${ts}-${commit_id}" >> $GITHUB_ENV
# elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]]; then
# echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/main-${ts}-${commit_id}" >> $GITHUB_ENV
# elif [[ "${{ github.event_name }}" == "release" ]]; then
# TAG_NAME="${{ github.ref }}"
# TAG="${TAG_NAME#refs/tags/}"
# echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/${TAG}-${ts}-${commit_id}" >> $GITHUB_ENV
# else
# echo "UT_LOG_PATH=${PRIMUS_WORKDIR}/ut_out/others-${ts}-${commit_id}" >> $GITHUB_ENV
# fi
# - name: Run Shell Tests
# run: |
# echo "Running Primus CLI shell tests..."
# bash ./tests/runner/run_all_tests.sh
# - name: Run Unit Tests
# env:
# HF_TOKEN: ${{secrets.HF_TOKEN}}
# run: |
# echo "Set UT_LOG_PATH: ${{ env.UT_LOG_PATH }}"
# rm -rf "${{ env.UT_LOG_PATH }}"
# mkdir -p "${{ env.UT_LOG_PATH }}"
# MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \
# JAX_SKIP_UT=1 python ./tests/run_unit_tests.py --jax
# - name: Clean
# run: |
# rm -rf ${PRIMUS_WORKDIR}/Primus-Turbo
# rm -rf ${PRIMUS_WORKDIR}/Primus
Loading