Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -95,5 +95,5 @@ modules:
turbo_sync_free_moe_stage: 2

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,8 @@ modules:
turbo_sync_free_moe_stage: 2

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true

# enable fp8 training
fp8: hybrid
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,5 +90,5 @@ modules:
turbo_sync_free_moe_stage: 2

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,8 @@ modules:
turbo_sync_free_moe_stage: 2

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true

# enable fp8 training
fp8: hybrid
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,5 +80,5 @@ modules:
use_turbo_grouped_mlp: true

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ modules:
use_turbo_grouped_mlp: true

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true

# enable fp8 training
fp8: hybrid
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,5 +77,5 @@ modules:
use_turbo_grouped_mlp: true

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ modules:
use_turbo_grouped_mlp: true

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true

# enable fp8 training
fp8: hybrid
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,5 +80,5 @@ modules:
# sequence_parallel: 1

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,8 @@ modules:
# sequence_parallel: 1

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true

# enable fp8 training
fp8: hybrid
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,5 +76,5 @@ modules:
use_turbo_attention: true

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ modules:
use_turbo_attention: true

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true

# enable fp8 training
fp8: hybrid
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,5 +72,5 @@ modules:
use_turbo_grouped_mlp: false

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@ modules:
use_turbo_grouped_mlp: false

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true

# enable fp8 training
fp8: hybrid
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,5 +77,5 @@ modules:
use_turbo_grouped_mlp: true

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ modules:
use_turbo_grouped_mlp: true

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true

# enable fp8 training
fp8: hybrid
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,5 +73,5 @@ modules:
use_turbo_grouped_mlp: true

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ modules:
use_turbo_grouped_mlp: true

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true

# enable fp8 training
fp8: hybrid
Expand Down
85 changes: 85 additions & 0 deletions examples/megatron/configs/MI300X/mamba_370M-pretrain.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Experiment config for the mamba_370M pretrain example
# (examples/megatron/configs/MI300X/mamba_370M-pretrain.yaml).
# NOTE(review): the `${NAME:default}` values below look like environment
# placeholders with a fallback after the colon - confirm against the config loader.
# NOTE(review): in this view every key sits at column 0; the nesting under
# `modules` / `pre_trainer` / `overrides` is presumably expressed by indentation
# in the real file - verify before copying from here.
work_group: ${PRIMUS_TEAM:amd}
user_name: ${PRIMUS_USER:root}
exp_name: ${PRIMUS_EXP_NAME:mamba_370M-pretrain}
workspace: ${PRIMUS_WORKSPACE:./output}

modules:
pre_trainer:
framework: megatron
config: pre_trainer.yaml

# model to run
model: mamba_370M.yaml
overrides:
# log
wandb_project: "Primus_Mamba_Pretrain"
# disable_wandb: false
# disable_tensorboard: false
stderr_sink_level: DEBUG

# Evaluation iterations are set to zero for this run.
eval_iters: 0

log_avg_skip_iterations: 2
log_avg_reset_interval: 50

# Short run: 50 iterations at global batch 256 (micro batch 4).
train_iters: 50
micro_batch_size: 4
global_batch_size: 256

# Sequence length and max position embeddings are kept equal (2048).
seq_length: 2048
max_position_embeddings: 2048

# Optimizer / LR schedule.
# NOTE(review): lr_warmup_iters (50000) and lr_decay_iters (73192188) are far
# larger than train_iters (50); presumably the run ends while still in warmup
# and never reaches `lr` - confirm this is intended for a short benchmark run.
lr: 3.0e-4
min_lr: 0.0
lr_warmup_iters: 50000
lr_decay_iters: 73192188
lr_decay_style: cosine
weight_decay: 0.1
adam_beta1: 0.9
adam_beta2: 0.95
eod_mask_loss: true
init_method_std: 0.02
norm_epsilon: 1.0e-5

# Mamba-specific: must provide spec
# Two-element list; presumably [module path, attribute name] - confirm with the loader.
spec: ['megatron.core.models.mamba.mamba_layer_specs', 'mamba_stack_spec']

# Tokenizer
# NOTE(review): tokenizer_model looks like a Hugging Face hub id - confirm it is
# resolvable in the target environment.
tokenizer_type: HuggingFaceTokenizer
tokenizer_model: EleutherAI/gpt-neox-20b

# parallel
# All model-parallel sizes are 1 (tensor, pipeline, expert).
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
expert_model_parallel_size: 1
overlap_grad_reduce: true
overlap_param_gather: true
gradient_accumulation_fusion: false

# data
# mock_data is enabled and all dataset paths are left null.
mock_data: true
train_data_path: null
valid_data_path: null
test_data_path: null

# ckpt
# No load/save paths are set; save_interval (20000) exceeds train_iters (50) and
# disable_last_saving is true.
# NOTE(review): presumably no checkpoint is written by this run - confirm.
finetune: false
auto_continue_train: false
load: null
no_load_optim: null
no_load_rng: null
save: null
save_interval: 20000
no_save_optim: null
no_save_rng: null
disable_last_saving: true
ckpt_format: torch

# Turbo - may need to disable for Mamba if not supported
# All three Turbo switches are off in this config.
enable_primus_turbo: false
use_turbo_attention: false
use_turbo_grouped_mlp: false

# Cross entropy flags
# Both flags are left commented out here, so this file does not set them.
# cross_entropy_fusion_impl: "native"
# cross_entropy_loss_fusion: false
Original file line number Diff line number Diff line change
Expand Up @@ -73,5 +73,5 @@ modules:
ckpt_format: torch

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@ modules:
ckpt_format: torch

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true

# enable fp8 training
fp8: hybrid
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,5 +68,5 @@ modules:
ckpt_format: torch

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ modules:
ckpt_format: torch

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true

# enable fp8 training
fp8: hybrid
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,5 +81,5 @@ modules:
use_turbo_grouped_mlp: true

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ modules:
use_turbo_grouped_mlp: true

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true

# enable fp8 training
fp8: hybrid
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,5 +74,5 @@ modules:
use_turbo_grouped_mlp: true

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ modules:
use_turbo_grouped_mlp: true

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
cross_entropy_fusion_impl: "te"
cross_entropy_loss_fusion: true

# enable fp8 training
fp8: hybrid
Expand Down
Loading
Loading