diff --git a/examples/megatron/configs/MI355X/glm5-BF16-pretrain.yaml b/examples/megatron/configs/MI355X/glm5-BF16-pretrain.yaml
index b0a6781c1..af54d557b 100644
--- a/examples/megatron/configs/MI355X/glm5-BF16-pretrain.yaml
+++ b/examples/megatron/configs/MI355X/glm5-BF16-pretrain.yaml
@@ -1,8 +1,3 @@
-# Generated: training args from start_training_glm5.sh merged into overrides (default env).
-# Defaults used: NNODES=8, TRAIN_ITERS=10, MBS=1, GBS=32*NNODES=256, PRIMUS_PP=8, PRIMUS_VPP=1,
-# PRIMUS_EP=8, PRIMUS_RECOMPUTE_LAYERS=8, TURBO_* / LEGACY_GG as in script.
-# Original preset kept at: glm5-BF16-pretrain.yaml (unchanged).
-
 work_group: ${PRIMUS_TEAM:amd}
 user_name: ${PRIMUS_USER:root}
 exp_name: ${PRIMUS_EXP_NAME:glm5-pretrain}
@@ -13,6 +8,7 @@ modules:
     framework: megatron
     config: pre_trainer.yaml
 
+    # model to run
     model: ${PRIMUS_MODEL:glm5}.yaml
     overrides:
       # log
@@ -24,13 +20,13 @@ modules:
       log_avg_skip_iterations: 2
       log_avg_reset_interval: 50
 
-      # --- start_training_glm5.sh → train pretrain CLI (resolved defaults) ---
-      num_layers: 78
+      # hyper parameters (optimized for 8-node 64-GPU MI355X)
       train_iters: 10
       micro_batch_size: 1
       global_batch_size: 256
       seq_length: ${PRIMUS_SEQ_LENGTH:4096}
       max_position_embeddings: ${PRIMUS_MAX_POSITION_EMBEDDINGS:4096}
+      num_layers: 78
       lr: 2.2e-4
       min_lr: 2.2e-5
       lr_warmup_iters: 200
@@ -43,23 +39,33 @@ modules:
       init_method_std: 0.008
       norm_epsilon: 1.0e-5
 
-      # parallel (tensor_model_parallel_size / PP / EP / last-PP layers)
+      # parallel (optimized)
       tensor_model_parallel_size: ${PRIMUS_TP:1}
       pipeline_model_parallel_size: 8
       expert_model_parallel_size: 8
+      virtual_pipeline_model_parallel_size: 1
       decoder_last_pipeline_num_layers: 8
       overlap_grad_reduce: false
       overlap_param_gather: false
       gradient_accumulation_fusion: true
 
+      # recompute (BF16 memory-constrained: 8 out of ~10 layers per stage)
+      recompute_num_layers: 8
+      recompute_granularity: full
+      recompute_method: block
+
       # data
       mock_data: true
-      train_data_path: ${PRIMUS_TOKENIZED_DATA_PATH:/shared_aig/c4/tokenized/c4_en_train_text_document}
+      train_data_path: ${PRIMUS_TOKENIZED_DATA_PATH:null}
       valid_data_path: null
       test_data_path: null
 
       moe_use_legacy_grouped_gemm: true
+      moe_layer_freq: 1
+
+      # MLA
       multi_latent_attention: true
+      apply_rope_fusion: false
 
       # ckpt
       finetune: false
@@ -75,7 +81,7 @@ modules:
       ckpt_format: torch
       eval_iters: 0
 
-      # Turbo / MLA-related (script: TURBO_ATTENTION=True, TURBO_GROUPED_MLP=False, TURBO_RMS_NORM=True)
+      # Turbo
       enable_primus_turbo: true
       use_turbo_attention: true
       use_turbo_grouped_mlp: false
@@ -92,28 +98,20 @@ modules:
       moe_use_fused_router_with_aux_score: true
       moe_permute_fusion: true
 
-      moe_layer_freq: 1
-
-      # recompute (script: --recompute_num_layers + full + block)
-      recompute_num_layers: 8
-      recompute_granularity: full
-      recompute_method: block
-
-      # Cross entropy
+      # Cross entropy flags
      cross_entropy_fusion_impl: "te"
       cross_entropy_loss_fusion: true
 
-      # GC / PP warmup / MTP (script: manual_gc, pp_warmup, mtp_num_layers=0)
+      # precision-aware optimizer (bf16 states to save memory)
+      use_precision_aware_optimizer: true
+      main_grads_dtype: bf16
+      exp_avg_dtype: bf16
+      exp_avg_sq_dtype: bf16
+
+      # training control
       manual_gc: true
       manual_gc_interval: 1
       pp_warmup: true
       mtp_num_layers: 0
-
-      # profile (script: PROFILE=False, step range 6–7)
-      profile: false
-      use_pytorch_profiler: false
-      profile_step_end: 7
-      profile_step_start: 6
-
       disable_wandb: true
       disable_tensorboard: true
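Note on the glm5 presets: with num_layers 78, pipeline_model_parallel_size 8, and decoder_last_pipeline_num_layers 8, the last pipeline stage holds 8 layers and the remaining 70 split evenly into 10 per stage, which is where the "8 out of ~10 layers per stage" recompute comment comes from. A quick shell sanity check (a sketch, assuming the 8-node x 8-GPU layout named in the removed header comment):

    GPUS=64; TP=1; PP=8; LAYERS=78; LAST=8; MBS=1; GBS=256
    PER_STAGE=$(( (LAYERS - LAST) / (PP - 1) ))  # 10 layers on each of stages 0..6
    DP=$(( GPUS / (TP * PP) ))                   # 8-way data parallelism
    ACC=$(( GBS / (MBS * DP) ))                  # 32 micro-batches accumulated per step
    echo "per_stage=$PER_STAGE dp=$DP grad_accum=$ACC"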
diff --git a/examples/megatron/configs/MI355X/glm5-FP8-pretrain.yaml b/examples/megatron/configs/MI355X/glm5-FP8-pretrain.yaml
index deda46ff1..4e2fd1db2 100644
--- a/examples/megatron/configs/MI355X/glm5-FP8-pretrain.yaml
+++ b/examples/megatron/configs/MI355X/glm5-FP8-pretrain.yaml
@@ -1,7 +1,3 @@
-# Generated: same overrides as glm5-BF16-pretrain-new.yaml + FP8 preset from glm5-FP8-pretrain.yaml.
-# Training args mirror start_training_glm5.sh (PRETRAIN_TYPE=FP8 → point EXP to this file).
-# Original presets: glm5-FP8-pretrain.yaml (unchanged).
-
 work_group: ${PRIMUS_TEAM:amd}
 user_name: ${PRIMUS_USER:root}
 exp_name: ${PRIMUS_EXP_NAME:glm5-pretrain}
@@ -12,6 +8,7 @@ modules:
     framework: megatron
     config: pre_trainer.yaml
 
+    # model to run
     model: ${PRIMUS_MODEL:glm5}.yaml
     overrides:
       # log
@@ -23,13 +20,13 @@ modules:
       log_avg_skip_iterations: 2
       log_avg_reset_interval: 50
 
-      # --- start_training_glm5.sh → train pretrain CLI (resolved defaults) ---
-      num_layers: 78
+      # hyper parameters (optimized for 8-node 64-GPU MI355X)
       train_iters: 10
       micro_batch_size: 1
       global_batch_size: 256
       seq_length: ${PRIMUS_SEQ_LENGTH:4096}
       max_position_embeddings: ${PRIMUS_MAX_POSITION_EMBEDDINGS:4096}
+      num_layers: 78
       lr: 2.2e-4
       min_lr: 2.2e-5
       lr_warmup_iters: 200
@@ -42,23 +39,32 @@ modules:
       init_method_std: 0.008
       norm_epsilon: 1.0e-5
 
-      # parallel (tensor_model_parallel_size / PP / EP / last-PP layers)
+      # parallel (optimized)
       tensor_model_parallel_size: ${PRIMUS_TP:1}
       pipeline_model_parallel_size: 8
       expert_model_parallel_size: 8
+      virtual_pipeline_model_parallel_size: 1
       decoder_last_pipeline_num_layers: 8
       overlap_grad_reduce: false
       overlap_param_gather: false
       gradient_accumulation_fusion: true
 
+      # recompute (optimized: 7 out of ~10 layers per stage)
+      recompute_num_layers: 7
+      recompute_granularity: full
+      recompute_method: block
+
       # data
       mock_data: true
-      train_data_path: ${PRIMUS_TOKENIZED_DATA_PATH:/shared_aig/c4/tokenized/c4_en_train_text_document}
+      train_data_path: ${PRIMUS_TOKENIZED_DATA_PATH:null}
       valid_data_path: null
       test_data_path: null
 
-      moe_use_legacy_grouped_gemm: true
+      moe_layer_freq: 1
+
+      # MLA
       multi_latent_attention: true
+      apply_rope_fusion: false
 
       # ckpt
       finetune: false
@@ -74,7 +80,7 @@ modules:
       ckpt_format: torch
       eval_iters: 0
 
-      # Turbo / MLA-related (script: TURBO_ATTENTION=True, TURBO_GROUPED_MLP=False, TURBO_RMS_NORM=True)
+      # Turbo
       enable_primus_turbo: true
       use_turbo_attention: true
       use_turbo_grouped_mlp: false
@@ -91,31 +97,24 @@ modules:
       moe_use_fused_router_with_aux_score: true
       moe_permute_fusion: true
 
-      moe_layer_freq: 1
-
-      # recompute (script: --recompute_num_layers + full + block)
-      recompute_num_layers: 8
-      recompute_granularity: full
-      recompute_method: block
-
-      # Cross entropy
+      # Cross entropy flags
       cross_entropy_fusion_impl: "te"
       cross_entropy_loss_fusion: true
 
-      # GC / PP warmup / MTP (script: manual_gc, pp_warmup, mtp_num_layers=0)
+      # enable fp8 training
+      fp8: hybrid
+      moe_use_legacy_grouped_gemm: false
+
+      # precision-aware optimizer (bf16 states to save memory)
+      use_precision_aware_optimizer: true
+      main_grads_dtype: bf16
+      exp_avg_dtype: bf16
+      exp_avg_sq_dtype: bf16
+
+      # training control
       manual_gc: true
       manual_gc_interval: 1
       pp_warmup: true
       mtp_num_layers: 0
-
-      # profile (script: PROFILE=False, step range 6–7)
-      profile: false
-      use_pytorch_profiler: false
-      profile_step_end: 7
-      profile_step_start: 6
-
       disable_wandb: true
       disable_tensorboard: true
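The FP8 preset tracks the BF16 one except for the fp8 block: fp8: hybrid selects Transformer Engine's hybrid recipe (E4M3 for forward tensors, E5M2 for gradients), legacy grouped GEMM is switched off (presumably because that path is BF16-only), and one fewer layer per stage is recomputed, presumably because FP8 activations leave more memory headroom. The preset is picked up via PRETRAIN_TYPE in the proxy script at the end of this diff:

    # from run_glm5_4layers_proxy.sh: the config path is derived from PRETRAIN_TYPE
    PRETRAIN_TYPE=${PRETRAIN_TYPE:-BF16} # BF16 or FP8
    EXP=examples/megatron/configs/MI355X/glm5-${PRETRAIN_TYPE}-pretrain.yaml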
diff --git a/examples/megatron/configs/MI355X/minimax_m2.5-BF16-pretrain.yaml b/examples/megatron/configs/MI355X/minimax_m2.5-BF16-pretrain.yaml
index 3a278cd83..2ee37fed8 100644
--- a/examples/megatron/configs/MI355X/minimax_m2.5-BF16-pretrain.yaml
+++ b/examples/megatron/configs/MI355X/minimax_m2.5-BF16-pretrain.yaml
@@ -1,9 +1,3 @@
-# Generated: training args from start_training_minimax_m2.5.sh merged into overrides (default env).
-# Defaults used: NNODES=16, TRAIN_ITERS=10, MBS=2, GBS=64*NNODES=1024, PRIMUS_PP=4, PRIMUS_VPP=2,
-# PRIMUS_EP=8, PRIMUS_RECOMPUTE_LAYERS=2, APPLY_ROPE_FUSION=True, TURBO_* as in script.
-# Pipeline layout for STAGE=PP*VPP=8 matches script case 8).
-# Original preset kept at: minimax_m2.5-BF16-pretrain.yaml (unchanged).
-
 work_group: ${PRIMUS_TEAM:amd}
 user_name: ${PRIMUS_USER:root}
 exp_name: ${PRIMUS_EXP_NAME:minimax_m2.5-pretrain}
@@ -14,6 +8,7 @@ modules:
     framework: megatron
     config: pre_trainer.yaml
 
+    # model to run
     model: ${PRIMUS_MODEL:minimax_m2.5}.yaml
     overrides:
       # log
@@ -25,13 +20,13 @@ modules:
       log_avg_skip_iterations: 2
       log_avg_reset_interval: 50
 
-      # --- start_training_minimax_m2.5.sh → train pretrain CLI (resolved defaults) ---
-      num_layers: 62
+      # hyper parameters (optimized for 8-node 64-GPU MI355X, BF16 best: ~493 TFLOP/s)
       train_iters: 10
-      micro_batch_size: 2
-      global_batch_size: 1024
+      micro_batch_size: 3
+      global_batch_size: 768
       seq_length: ${PRIMUS_SEQ_LENGTH:4096}
       max_position_embeddings: ${PRIMUS_MAX_POSITION_EMBEDDINGS:4096}
+      num_layers: 62
       lr: 2.2e-4
       min_lr: 2.2e-5
       lr_warmup_iters: 200
@@ -44,27 +39,31 @@ modules:
       init_method_std: 0.02
       norm_epsilon: 1.0e-6
 
-      # parallel (PP=4, VPP=2, EP=8; layout for 62 layers / 8 stages)
+      # parallel (optimized: PP=4 with VPP=2 for reduced pipeline bubble)
       tensor_model_parallel_size: ${PRIMUS_TP:1}
       pipeline_model_parallel_size: 4
-      virtual_pipeline_model_parallel_size: 2
       expert_model_parallel_size: 8
+      virtual_pipeline_model_parallel_size: 2
       pipeline_model_parallel_layout: "Et*7|t*8|t*8|t*8|t*8|t*8|t*8|t*7,L"
       overlap_grad_reduce: true
       overlap_param_gather: true
       gradient_accumulation_fusion: true
 
+      # recompute (optimized: disabled for max throughput, mem ~93%)
+      recompute_num_layers: 0
+      recompute_granularity: full
+      recompute_method: block
+
       # data
       mock_data: true
-      train_data_path: ${PRIMUS_TOKENIZED_DATA_PATH:/shared_aig/c4/tokenized/c4_en_train_text_document}
+      train_data_path: ${PRIMUS_TOKENIZED_DATA_PATH:null}
       valid_data_path: null
       test_data_path: null
 
       moe_use_legacy_grouped_gemm: true
-
-      # rope fusion (script: --enable_experimental --apply_rope_fusion from APPLY_ROPE_FUSION=True)
-      enable_experimental: true
+      moe_layer_freq: 1
       apply_rope_fusion: true
+      enable_experimental: true
 
       # ckpt
       finetune: false
@@ -97,34 +96,20 @@ modules:
       moe_use_fused_router_with_aux_score: true
       moe_permute_fusion: true
 
-      moe_layer_freq: 1
-
-      # recompute
-      recompute_num_layers: 2
-      recompute_granularity: full
-      recompute_method: block
-
-      # Cross entropy
+      # Cross entropy flags
       cross_entropy_fusion_impl: "te"
       cross_entropy_loss_fusion: true
 
-      # precision-aware optimizer (script)
+      # precision-aware optimizer (bf16 states to save memory)
       use_precision_aware_optimizer: true
       main_grads_dtype: bf16
       exp_avg_dtype: bf16
       exp_avg_sq_dtype: bf16
 
-      # GC / PP warmup / MTP
+      # training control
       manual_gc: true
       manual_gc_interval: 1
       pp_warmup: true
       mtp_num_layers: 0
-
-      # profile
-      profile: false
-      use_pytorch_profiler: false
-      profile_step_end: 7
-      profile_step_start: 6
-
       disable_wandb: true
       disable_tensorboard: true
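The layout string can be checked against num_layers: "Et*7|t*8|t*8|t*8|t*8|t*8|t*8|t*7,L" puts the embedding (E) plus 7 transformer layers on the first virtual stage, 8 on each of the six middle stages, and 7 plus the loss head (L) on the last, i.e. 7 + 6*8 + 7 = 62 layers over PP*VPP = 4*2 = 8 virtual stages. The batch settings work out as follows (a sketch; the 64-GPU count comes from the "8-node 64-GPU" comment above):

    GPUS=64; TP=1; PP=4; MBS=3; GBS=768
    DP=$(( GPUS / (TP * PP) ))   # 16-way data parallelism
    ACC=$(( GBS / (MBS * DP) ))  # 16 micro-batches accumulated per optimizer step
    echo "dp=$DP grad_accum=$ACC"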
diff --git a/examples/megatron/configs/MI355X/minimax_m2.5-FP8-pretrain.yaml b/examples/megatron/configs/MI355X/minimax_m2.5-FP8-pretrain.yaml
index c63bdd736..f2a888e36 100644
--- a/examples/megatron/configs/MI355X/minimax_m2.5-FP8-pretrain.yaml
+++ b/examples/megatron/configs/MI355X/minimax_m2.5-FP8-pretrain.yaml
@@ -1,7 +1,3 @@
-# Generated: same overrides as minimax_m2.5-BF16-pretrain-new.yaml + FP8 preset from minimax_m2.5-FP8-pretrain.yaml.
-# Training args mirror start_training_minimax_m2.5.sh (PRETRAIN_TYPE=FP8 → point EXP to this file).
-# Original presets: minimax_m2.5-FP8-pretrain.yaml (unchanged).
-
 work_group: ${PRIMUS_TEAM:amd}
 user_name: ${PRIMUS_USER:root}
 exp_name: ${PRIMUS_EXP_NAME:minimax_m2.5-pretrain}
@@ -12,6 +8,7 @@ modules:
     framework: megatron
     config: pre_trainer.yaml
 
+    # model to run
     model: ${PRIMUS_MODEL:minimax_m2.5}.yaml
     overrides:
       # log
@@ -23,13 +20,13 @@ modules:
       log_avg_skip_iterations: 2
       log_avg_reset_interval: 50
 
-      # --- start_training_minimax_m2.5.sh → train pretrain CLI (resolved defaults) ---
-      num_layers: 62
+      # hyper parameters (optimized for 8-node 64-GPU MI355X)
       train_iters: 10
-      micro_batch_size: 2
-      global_batch_size: 1024
+      micro_batch_size: 3
+      global_batch_size: 768
       seq_length: ${PRIMUS_SEQ_LENGTH:4096}
       max_position_embeddings: ${PRIMUS_MAX_POSITION_EMBEDDINGS:4096}
+      num_layers: 62
       lr: 2.2e-4
       min_lr: 2.2e-5
       lr_warmup_iters: 200
@@ -42,27 +39,30 @@ modules:
       init_method_std: 0.02
       norm_epsilon: 1.0e-6
 
-      # parallel (PP=4, VPP=2, EP=8; layout for 62 layers / 8 stages)
+      # parallel (optimized: PP=4 with VPP=2 for reduced pipeline bubble)
       tensor_model_parallel_size: ${PRIMUS_TP:1}
       pipeline_model_parallel_size: 4
-      virtual_pipeline_model_parallel_size: 2
       expert_model_parallel_size: 8
+      virtual_pipeline_model_parallel_size: 2
       pipeline_model_parallel_layout: "Et*7|t*8|t*8|t*8|t*8|t*8|t*8|t*7,L"
       overlap_grad_reduce: true
       overlap_param_gather: true
       gradient_accumulation_fusion: true
 
+      # recompute (optimized: disabled for max throughput)
+      recompute_num_layers: 0
+      recompute_granularity: full
+      recompute_method: block
+
       # data
       mock_data: true
-      train_data_path: ${PRIMUS_TOKENIZED_DATA_PATH:/shared_aig/c4/tokenized/c4_en_train_text_document}
+      train_data_path: ${PRIMUS_TOKENIZED_DATA_PATH:null}
       valid_data_path: null
       test_data_path: null
 
-      moe_use_legacy_grouped_gemm: true
-
-      # rope fusion (script: --enable_experimental --apply_rope_fusion from APPLY_ROPE_FUSION=True)
-      enable_experimental: true
+      moe_layer_freq: 1
       apply_rope_fusion: true
+      enable_experimental: true
 
       # ckpt
       finetune: false
@@ -95,37 +95,24 @@ modules:
       moe_use_fused_router_with_aux_score: true
       moe_permute_fusion: true
 
-      moe_layer_freq: 1
-
-      # recompute
-      recompute_num_layers: 2
-      recompute_granularity: full
-      recompute_method: block
-
-      # Cross entropy
+      # Cross entropy flags
       cross_entropy_fusion_impl: "te"
       cross_entropy_loss_fusion: true
 
-      # precision-aware optimizer (script)
+      # enable fp8 training
+      fp8: hybrid
+      moe_use_legacy_grouped_gemm: false
+
+      # precision-aware optimizer (bf16 states to save memory)
       use_precision_aware_optimizer: true
       main_grads_dtype: bf16
       exp_avg_dtype: bf16
       exp_avg_sq_dtype: bf16
 
-      # GC / PP warmup / MTP
+      # training control
       manual_gc: true
       manual_gc_interval: 1
       pp_warmup: true
       mtp_num_layers: 0
-
-      # profile
-      profile: false
-      use_pytorch_profiler: false
-      profile_step_end: 7
-      profile_step_start: 6
-
       disable_wandb: true
       disable_tensorboard: true
-
-      # FP8 (from minimax_m2.5-FP8-pretrain.yaml)
-      fp8: hybrid
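The new script below runs a 4-layer GLM5 proxy for quick smoke tests: it loads the full preset selected by PRETRAIN_TYPE and then shrinks it from the command line (num_layers 4, PP=1, EP=8, mock data, recompute off). The Turbo and grouped-GEMM toggles all default to True and can be flipped per run without editing the file, e.g. to compare the legacy grouped GEMM against the Turbo grouped MLP path (flag names as defined in the script):

    LEGACY_GG=True  TURBO_GROUPED_MLP=False bash examples/moe_package/run_glm5_4layers_proxy.sh
    LEGACY_GG=False TURBO_GROUPED_MLP=True  bash examples/moe_package/run_glm5_4layers_proxy.sh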
diff --git a/examples/moe_package/run_glm5_4layers_proxy.sh b/examples/moe_package/run_glm5_4layers_proxy.sh
new file mode 100644
index 000000000..08304f660
--- /dev/null
+++ b/examples/moe_package/run_glm5_4layers_proxy.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+
+export HF_TOKEN="${HF_TOKEN:-your_hf_token}"                # replace with your own HF token
+export WANDB_API_KEY="${WANDB_API_KEY:-your_wandb_api_key}" # replace with your own wandb API key
+
+export PLATFORM="MI355X" # one of: MI355X, B200, GB200
+EXTRA_ARGS="" # platform-specific extra CLI args; expanded unquoted below so it may hold several words
+if [ "$PLATFORM" = "MI355X" ]; then
+    export DOCKER_IMAGE="docker.io/tasimage/primus:pr-563-ainic"
+elif [ "$PLATFORM" = "B200" ] || [ "$PLATFORM" = "GB200" ]; then
+    export DOCKER_IMAGE="nvcr.io/nvidia/nemo:25.09"
+    EXTRA_ARGS="--use_rocm_mem_info_iters None"
+else
+    echo "Error: unsupported PLATFORM '$PLATFORM'. Must be MI355X, B200, or GB200." >&2
+    exit 1
+fi
+
+export NNODES=${NNODES:-1}
+export TRAIN_ITERS=10
+export SLURM_TIME=01:00:00
+export SLURM_PARTITION=amd-aig
+# export SLURM_NODELIST="uswslocpm2m-106-079"
+
+# export NCCL_DEBUG=INFO
+export USING_AINIC=1
+export NCCL_IB_HCA="ionic_0:1,ionic_2:1,ionic_3:1,ionic_4:1,ionic_5:1,ionic_7:1,ionic_8:1,ionic_9:1"
+export GLOO_SOCKET_IFNAME=ens9np0
+export NCCL_SOCKET_IFNAME=ens9np0
+export HSA_NO_SCRATCH_RECLAIM=1
+export NVTE_CK_USES_BWD_V3=1
+export GPU_MAX_HW_QUEUES=4
+export CLEAN_DOCKER_CONTAINER=1
+
+export MBS=2
+export GBS=$((64 * NNODES))
+export PRIMUS_TOTAL_LAYERS=4
+export PRIMUS_RECOMPUTE_LAYERS=0
+export PRIMUS_MOE_LAYER_FREQ=1
+export PRIMUS_PP=1
+export PRIMUS_EP=8
+export PRIMUS_VPP=1
+export PROFILE=False
+export TURBO_ATTENTION=${TURBO_ATTENTION:-True}
+export TURBO_DEEPEP=${TURBO_DEEPEP:-True}
+export LEGACY_GG=${LEGACY_GG:-True}
+export TURBO_GROUPED_MLP=${TURBO_GROUPED_MLP:-True}
+export PRIMUS_DETERMINISTIC=0
+export PRIMUS_TURBO_DEEPEP_TIMEOUT=600
+
+# Enable NUMA binding for better memory locality (increases stability for large models)
+# export ENABLE_NUMA_BINDING=1
+# export HSA_KERNARG_POOL_SIZE=12582912
+
+
+export PRETRAIN_TYPE=${PRETRAIN_TYPE:-BF16} # BF16 or FP8
+
+export EXP=examples/megatron/configs/MI355X/glm5-${PRETRAIN_TYPE}-pretrain.yaml
+export PRIMUS_TEAM=amd
+PRIMUS_USER="tas-$(date +%Y%m%d)"
+export PRIMUS_USER
+# export PRIMUS_EXP_NAME=glm5-pretrain-platform_$PLATFORM-layers_$PRIMUS_TOTAL_LAYERS-type_$PRETRAIN_TYPE-mbs_$MBS-gbs_$GBS-legacygg_$LEGACY_GG # verbose name, superseded below
+export PRIMUS_EXP_NAME=debug_glm5_4layers-type_$PRETRAIN_TYPE-legacygg_$LEGACY_GG-turbogg_$TURBO_GROUPED_MLP
+
+
+mkdir -p "output/$PRIMUS_TEAM/$PRIMUS_USER/$PRIMUS_EXP_NAME"
+./primus-cli direct \
+    -- train pretrain --config "$EXP" \
+    --num_layers $PRIMUS_TOTAL_LAYERS \
+    --train_iters $TRAIN_ITERS \
+    --micro_batch_size $MBS \
+    --global_batch_size $GBS \
+    --use_turbo_attention "$TURBO_ATTENTION" \
+    --use_turbo_deepep "$TURBO_DEEPEP" \
+    --use_turbo_grouped_mlp "$TURBO_GROUPED_MLP" \
+    --moe_use_legacy_grouped_gemm "$LEGACY_GG" \
+    --pipeline_model_parallel_size $PRIMUS_PP \
+    --expert_model_parallel_size $PRIMUS_EP \
+    --cross_entropy_fusion_impl "te" \
+    --cross_entropy_loss_fusion True \
+    --recompute_num_layers $PRIMUS_RECOMPUTE_LAYERS \
+    --recompute_granularity full \
+    --recompute_method block \
+    --disable_last_saving True \
+    --moe_layer_freq $PRIMUS_MOE_LAYER_FREQ \
+    --mock_data True \
+    --pp_warmup True \
+    --mtp_num_layers 0 \
+    --profile $PROFILE \
+    --use_pytorch_profiler $PROFILE \
+    --profile_step_end 7 \
+    --profile_step_start 6 \
+    --disable_wandb True \
+    --disable_tensorboard True \
+    $EXTRA_ARGS \
+    2>&1 | tee "output/$PRIMUS_TEAM/$PRIMUS_USER/$PRIMUS_EXP_NAME/log_node_${NODE_RANK:-0}.txt"
+
+    # --manual_gc True \
+    # --manual_gc_interval 1 \
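Typical launches (a sketch, assuming the repository root as working directory; NODE_RANK is normally injected by the launcher, so set it explicitly for a bare single-node run):

    # BF16 (default)
    NODE_RANK=0 NNODES=1 bash examples/moe_package/run_glm5_4layers_proxy.sh

    # FP8 variant of the same proxy
    NODE_RANK=0 NNODES=1 PRETRAIN_TYPE=FP8 bash examples/moe_package/run_glm5_4layers_proxy.sh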