
Commit faa2122

Update tokenizer-model alignment validation logic
1 parent 59dc807 commit faa2122

File tree

1 file changed (+9, -20 lines)


torchtitan/models/utils.py

Lines changed: 9 additions & 20 deletions
@@ -475,38 +475,27 @@ def validate_tokenizer_model_alignment(
     model_args: "BaseModelArgs",
 ) -> None:
     """
-    Validate that tokenizer configuration matches model configuration.
+    Validate that tokenizer configuration is compatible with model configuration.

     Args:
         tokenizer: Tokenizer instance to validate. Can be None.
         model_args: Model arguments object containing configuration to validate against.

     Raises:
-        ValueError: If tokenizer and model configurations don't match.
+        ValueError: If tokenizer vocab_size exceeds model vocab_size, which would
+            cause index out of bounds errors during training.
     """
     if tokenizer is None:
         return

-    # Validate vocab_size
     if hasattr(model_args, "vocab_size"):
         tokenizer_vocab_size = tokenizer.get_vocab_size()
         model_vocab_size = model_args.vocab_size
-        if tokenizer_vocab_size != model_vocab_size:
+        if model_vocab_size < tokenizer_vocab_size:
             raise ValueError(
-                f"Tokenizer vocab_size ({tokenizer_vocab_size}) does not match "
-                f"model vocab_size ({model_vocab_size}). "
-                f"This mismatch will cause training errors. "
-                f"Please ensure the tokenizer and model configuration are aligned."
-            )
-
-    # Validate eos_id
-    if hasattr(model_args, "eos_id"):
-        tokenizer_eos_id = getattr(tokenizer, "eos_id", None)
-        model_eos_id = model_args.eos_id
-        if tokenizer_eos_id is not None and tokenizer_eos_id != model_eos_id:
-            raise ValueError(
-                f"Tokenizer eos_id ({tokenizer_eos_id}) does not match "
-                f"model eos_id ({model_eos_id}). "
-                f"This mismatch may cause training errors. "
-                f"Please ensure the tokenizer and model configuration are aligned."
+                f"Model vocab_size ({model_vocab_size}) is smaller than "
+                f"tokenizer vocab_size ({tokenizer_vocab_size}). "
+                f"This will cause index out of bounds errors during training. "
+                f"The model's embedding layer must be at least as large as the "
+                f"tokenizer's vocabulary size."
             )
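
The relaxed, one-directional check matches how the embedding lookup actually fails: a model vocabulary larger than the tokenizer's (for example, padded for hardware efficiency) is harmless, while a tokenizer that can emit ids beyond the embedding table triggers an index error. A minimal sketch of that behavior, not part of the commit and with made-up sizes, is:

# Illustrative sketch (not from the commit): why tokenizer vocab > model vocab
# breaks training, while the reverse does not. Sizes are invented for the example.
import torch
import torch.nn as nn

model_vocab_size = 8        # rows in the model's embedding table
tokenizer_vocab_size = 10   # ids the tokenizer can emit: 0..9

embedding = nn.Embedding(model_vocab_size, 4)

# Token id 9 is valid for the tokenizer but lies outside the embedding table.
bad_ids = torch.tensor([tokenizer_vocab_size - 1])
try:
    embedding(bad_ids)
except IndexError as err:
    print(f"lookup failed, as the new error message warns: {err}")

# A model vocab padded beyond the tokenizer's is safe: every id 0..9 resolves.
padded = nn.Embedding(16, 4)
padded(torch.arange(tokenizer_vocab_size))  # no error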
