Validate tokenizer and model alignment before training

CryptoSalamander · CryptoSalamander · commit 59dc807bd670 · 2025-11-21T08:25:53.000Z
diff --git a/torchtitan/models/utils.py b/torchtitan/models/utils.py
@@ -468,3 +468,45 @@ def get_moe_model_nparams_and_flops(
         nparams = nparams - nparams_embedding
 
     return nparams, num_flops_per_token
+
+
+def validate_tokenizer_model_alignment(
+    tokenizer: "BaseTokenizer | None",
+    model_args: "BaseModelArgs",
+) -> None:
+    """
+    Check that the tokenizer's vocab_size and eos_id agree with ``model_args``.
+
+    Args:
+        tokenizer: Tokenizer to validate; ``None`` skips every check.
+        model_args: Model arguments; only attributes it defines are compared.
+
+    Raises:
+        ValueError: If vocab_size differs, or a non-None tokenizer eos_id differs.
+    """
+    if tokenizer is None:
+        return
+
+    # Exact match required. NOTE(review): also rejects padded model vocabs; confirm.
+    if hasattr(model_args, "vocab_size"):
+        tokenizer_vocab_size = tokenizer.get_vocab_size()
+        model_vocab_size = model_args.vocab_size
+        if tokenizer_vocab_size != model_vocab_size:
+            raise ValueError(
+                f"Tokenizer vocab_size ({tokenizer_vocab_size}) does not match "
+                f"model vocab_size ({model_vocab_size}). "
+                f"This mismatch will cause training errors. "
+                f"Please ensure the tokenizer and model configuration are aligned."
+            )
+
+    # eos_id: skipped if model_args lacks it or the tokenizer's eos_id is None.
+    if hasattr(model_args, "eos_id"):
+        tokenizer_eos_id = getattr(tokenizer, "eos_id", None)
+        model_eos_id = model_args.eos_id
+        if tokenizer_eos_id is not None and tokenizer_eos_id != model_eos_id:
+            raise ValueError(
+                f"Tokenizer eos_id ({tokenizer_eos_id}) does not match "
+                f"model eos_id ({model_eos_id}). "
+                f"This mismatch may cause training errors. "
+                f"Please ensure the tokenizer and model configuration are aligned."
+            )
diff --git a/torchtitan/train.py b/torchtitan/train.py
@@ -25,6 +25,7 @@
 )
 from torchtitan.config import ConfigManager, JobConfig, TORCH_DTYPE_MAP
 from torchtitan.distributed import ParallelDims, utils as dist_utils
+from torchtitan.models.utils import validate_tokenizer_model_alignment
 from torchtitan.protocols.model_converter import build_model_converters
 from torchtitan.tools import utils
 from torchtitan.tools.logging import init_logger, logger
@@ -134,6 +135,8 @@ def __init__(self, job_config: JobConfig):
         model_args.update_from_config(job_config)
         self.model_args = model_args
 
+        validate_tokenizer_model_alignment(self.tokenizer, model_args)
+
         logger.info(
             f"Building {job_config.model.name} {job_config.model.flavor} with {model_args}"
         )