@@ -468,3 +468,45 @@ def get_moe_model_nparams_and_flops(
         nparams = nparams - nparams_embedding
 
     return nparams, num_flops_per_token
+
+
+def validate_tokenizer_model_alignment(
+    tokenizer: "BaseTokenizer | None",
+    model_args: "BaseModelArgs",
+) -> None:
+    """
+    Validate that tokenizer configuration matches model configuration.
+
+    Args:
+        tokenizer: Tokenizer instance to validate. Can be None.
+        model_args: Model arguments object containing the configuration to validate against.
+
+    Raises:
+        ValueError: If tokenizer and model configurations don't match.
+    """
+    if tokenizer is None:
+        return
+
+    # Validate vocab_size
+    if hasattr(model_args, "vocab_size"):
+        tokenizer_vocab_size = tokenizer.get_vocab_size()
+        model_vocab_size = model_args.vocab_size
+        if tokenizer_vocab_size != model_vocab_size:
+            raise ValueError(
+                f"Tokenizer vocab_size ({tokenizer_vocab_size}) does not match "
+                f"model vocab_size ({model_vocab_size}). "
+                "This mismatch will cause training errors. "
+                "Please ensure the tokenizer and model configuration are aligned."
+            )
+
+    # Validate eos_id
+    if hasattr(model_args, "eos_id"):
+        tokenizer_eos_id = getattr(tokenizer, "eos_id", None)
+        model_eos_id = model_args.eos_id
+        if tokenizer_eos_id is not None and tokenizer_eos_id != model_eos_id:
+            raise ValueError(
+                f"Tokenizer eos_id ({tokenizer_eos_id}) does not match "
+                f"model eos_id ({model_eos_id}). "
+                "This mismatch may cause training errors. "
+                "Please ensure the tokenizer and model configuration are aligned."
+            )