@@ -475,38 +475,27 @@ def validate_tokenizer_model_alignment(
     model_args: "BaseModelArgs",
 ) -> None:
     """
-    Validate that tokenizer configuration matches model configuration.
+    Validate that tokenizer configuration is compatible with model configuration.
 
     Args:
         tokenizer: Tokenizer instance to validate. Can be None.
         model_args: Model arguments object containing configuration to validate against.
 
     Raises:
-        ValueError: If tokenizer and model configurations don't match.
+        ValueError: If tokenizer vocab_size exceeds model vocab_size, which would
+            cause index out of bounds errors during training.
     """
     if tokenizer is None:
         return
 
-    # Validate vocab_size
     if hasattr(model_args, "vocab_size"):
         tokenizer_vocab_size = tokenizer.get_vocab_size()
         model_vocab_size = model_args.vocab_size
-        if tokenizer_vocab_size != model_vocab_size:
+        if model_vocab_size < tokenizer_vocab_size:
             raise ValueError(
-                f"Tokenizer vocab_size ({tokenizer_vocab_size}) does not match "
-                f"model vocab_size ({model_vocab_size}). "
-                f"This mismatch will cause training errors. "
-                f"Please ensure the tokenizer and model configuration are aligned."
-            )
-
-    # Validate eos_id
-    if hasattr(model_args, "eos_id"):
-        tokenizer_eos_id = getattr(tokenizer, "eos_id", None)
-        model_eos_id = model_args.eos_id
-        if tokenizer_eos_id is not None and tokenizer_eos_id != model_eos_id:
-            raise ValueError(
-                f"Tokenizer eos_id ({tokenizer_eos_id}) does not match "
-                f"model eos_id ({model_eos_id}). "
-                f"This mismatch may cause training errors. "
-                f"Please ensure the tokenizer and model configuration are aligned."
+                f"Model vocab_size ({model_vocab_size}) is smaller than "
+                f"tokenizer vocab_size ({tokenizer_vocab_size}). "
+                f"This will cause index out of bounds errors during training. "
+                f"The model's embedding layer must be at least as large as the "
+                f"tokenizer's vocabulary size."
             )
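The relaxed check allows a model embedding table that is padded beyond the tokenizer vocabulary (which is safe), while still rejecting the case where the tokenizer can emit ids outside the embedding range. A minimal sketch of the new behavior is shown below; `_StubTokenizer` and `_StubModelArgs` are hypothetical stand-ins for the real tokenizer and `BaseModelArgs` types, exercising only the `get_vocab_size()` / `vocab_size` interface the validator relies on, and it assumes `validate_tokenizer_model_alignment` is importable in scope.

```python
from dataclasses import dataclass


class _StubTokenizer:
    """Hypothetical stand-in exposing the get_vocab_size() method the validator calls."""

    def __init__(self, vocab_size: int) -> None:
        self._vocab_size = vocab_size

    def get_vocab_size(self) -> int:
        return self._vocab_size


@dataclass
class _StubModelArgs:
    """Hypothetical stand-in for BaseModelArgs with just a vocab_size attribute."""

    vocab_size: int


# Model vocab larger than tokenizer vocab (e.g. padded for alignment): passes.
validate_tokenizer_model_alignment(_StubTokenizer(32000), _StubModelArgs(vocab_size=32064))

# Model vocab smaller than tokenizer vocab: raises ValueError (ids would index out of bounds).
try:
    validate_tokenizer_model_alignment(_StubTokenizer(32064), _StubModelArgs(vocab_size=32000))
except ValueError as err:
    print(err)
```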