diff --git a/experimental/feature_selection_benchmark/extra_benchmark/feature_selection_benchmark_runner.py b/experimental/feature_selection_benchmark/extra_benchmark/feature_selection_benchmark_runner.py index 2be7adc78..362e11e25 100644 --- a/experimental/feature_selection_benchmark/extra_benchmark/feature_selection_benchmark_runner.py +++ b/experimental/feature_selection_benchmark/extra_benchmark/feature_selection_benchmark_runner.py @@ -1,8 +1,21 @@ """Shared infrastructure and entry point for feature selection benchmark evaluation. Usage: - python fs_benchmark_runner.py --mode validity --seed 42 - python fs_benchmark_runner.py --mode stability --seed 42 + python feature_selection_benchmark_runner.py \ + --mode validity \ + --method_name FSBench__RandomFeatureSelector__5__0__lgbm__3600 \ + --data_foundry_task_id "UserTask|1386903908|anneal/019d3f7b-494a-71fa-8eb2-25d01dfb7792|/work/dlclarge1/purucker-fs_benchmark/.openml/tabarena_tasks" \ + --repeat 0 \ + --noise 1.0 \ + --noise_type gaussian + + python feature_selection_benchmark_runner.py \ + --mode stability \ + --method_name FSBench__RandomFeatureSelector__5__0__lgbm__3600 \ + --data_foundry_task_id "UserTask|1386903908|anneal/019d3f7b-494a-71fa-8eb2-25d01dfb7792|/work/dlclarge1/purucker-fs_benchmark/.openml/tabarena_tasks" \ + --repeat 0 \ + --noise 1.0 \ + --noise_type gaussian """ from __future__ import annotations @@ -10,18 +23,17 @@ import argparse import time from dataclasses import dataclass -from typing import TYPE_CHECKING, Any +from pathlib import Path +from typing import Any import numpy as np +import pandas as pd from tabarena.benchmark.feature_selection_methods.feature_selection_benchmark_utils import ( selector_and_config_from_string, ) from tabarena.benchmark.task.openml import OpenMLTaskWrapper from tabflow_slurm.run_tabarena_experiment import _parse_task_id -if TYPE_CHECKING: - import pandas as pd - @dataclass class FeatureSelectionResult: @@ -36,6 +48,9 @@ class FeatureSelectionResult: 
repeat: Repeat number for the FS metric. selected_features: Names of selected features from the original_features. + num_classes: Number of classes in the target variable (for classification tasks). + num_samples: Number of samples in the dataset. + elapsed_time_fs: Runtime measurement (seconds). mode: Evaluation mode ("validity" or "stability"). @@ -50,6 +65,9 @@ class FeatureSelectionResult: repeat: int selected_features: list[int] + num_classes: int + num_samples: int + elapsed_time_fs: float mode: str @@ -62,11 +80,11 @@ def _augment_dataset( original_features = list(X.columns) if mode == "validity": - from validity_fs_metric import get_dataset_for_validity + from validity_fs_metric import get_dataset_for_validity # noqa: PLC0415 X = get_dataset_for_validity(X=X, rng=rng, **kwargs) elif mode == "stability": - from stability_fs_metric import get_dataset_for_stability + from stability_fs_metric import get_dataset_for_stability # noqa: PLC0415 X, y = get_dataset_for_stability(X=X, y=y, rng=rng, **kwargs) else: @@ -79,7 +97,7 @@ def _augment_dataset( return X, y, original_features -def run_benchmark( +def run_benchmark( # noqa: D417 *, data_foundry_task_id: str, mode: str, @@ -109,7 +127,7 @@ def run_benchmark( feature_selector, config = selector_and_config_from_string(preprocessing_name=method_name) # Augment dataset with new feature based on mode. 
- X, y, original_features = _augment_dataset(mode=mode, X=X, rng=rng, **kwargs) + X, y, original_features = _augment_dataset(mode=mode, X=X, y=y, rng=rng, **kwargs) # Run Feature Selection start_time = time.monotonic() @@ -135,6 +153,13 @@ def run_benchmark( mode_kwargs=kwargs, ) +def get_cache_path(args) -> Path: + """Generate the cache path based on arguments.""" + cache_dir = Path(__file__).parent / "results" + cache_dir.mkdir(parents=True, exist_ok=True) # Ensure the directory exists + cache_path = cache_dir / f"{args.mode}_{args.method_name}_{args.data_foundry_task_id.split('|')[3].split('/')[0]}_{args.repeat}.csv" + return cache_path + def parse_args() -> argparse.Namespace: """Parse CLI arguments for the FS benchmark runner.""" @@ -149,14 +174,15 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--method_name", type=str, - default="FSBench__AccuracyFeatureSelector__5__0__lgbm__3600", + default="FSBench__RandomFeatureSelector__5__0__lgbm__3600", - help="Feature Selection Method name [default: FSBench__AccuracyFeatureSelector__5__0__lgbm__3600]", + help="Feature Selection Method name [default: FSBench__RandomFeatureSelector__5__0__lgbm__3600]", ) parser.add_argument( "--data_foundry_task_id", type=str, - default="anneal/019d3f7b-494a-71fa-8eb2-25d01dfb7792", - help="TabArena/OpenML task identifier [default: anneal/019d3f7b-494a-71fa-8eb2-25d01dfb7792]", + default="UserTask|1386903908|anneal/019d3f7b-494a-71fa-8eb2-25d01dfb7792|/Users/schaefer.bastian/.openml/tabarena_tasks", + help="TabArena/OpenML task metadata identifier [default: UserTask|1386903908|anneal/019d3f7b-494a-71fa-8eb2-25d" + "01dfb7792|/Users/schaefer.bastian/.openml/tabarena_tasks]", ) parser.add_argument("--repeat", type=int, default=0, help="Repeat [default: 0]") @@ -165,27 +191,41 @@ def parse_args() -> argparse.Namespace: "--noise", type=float, default=1.0, - nargs="+", help="Noise features relative to original count (validity mode only) [default: 1.0]", ) parser.add_argument( - "--nose_type", + "--noise_type", type=str, choices=["gaussian", "uniform"], default="gaussian", 
- help="Type of noise features to add (validity mode only) [default: random]", + help="Type of noise features to add (validity mode only) [default: gaussian]", ) + parser.add_argument( + "--ignore_cache", + action="store_true", + default=False, + help="Whether to ignore existing cache and rerun the benchmark (useful for debugging) [default: False]", + ) return parser.parse_args() if __name__ == "__main__": args = parse_args() - run_benchmark( - data_foundry_task_id=args.data_foundry_task_id, - mode=args.mode, - method_name=args.method_name, - repeat=args.repeat, - noise=args.noise, - nose_type=args.nose_type, - ) \ No newline at end of file + cache_path = get_cache_path(args) + + if cache_path.exists() and not args.ignore_cache: + print(f"Cache exists at {cache_path}. Skipping operation.") + else: + result = run_benchmark( + data_foundry_task_id=args.data_foundry_task_id, + mode=args.mode, + method_name=args.method_name, + repeat=args.repeat, + noise=args.noise, + noise_type=args.noise_type, + ) + + print(result) + result = pd.DataFrame([result.__dict__]) + result.to_csv(cache_path, index=False) diff --git a/experimental/feature_selection_benchmark/extra_benchmark/run_setup_slurm_jobs_extra_benchmark.py b/experimental/feature_selection_benchmark/extra_benchmark/run_setup_slurm_jobs_extra_benchmark.py new file mode 100644 index 000000000..1e690aeea --- /dev/null +++ b/experimental/feature_selection_benchmark/extra_benchmark/run_setup_slurm_jobs_extra_benchmark.py @@ -0,0 +1,81 @@ +from pathlib import Path + +import pandas as pd +import submitit +from experimental.feature_selection_benchmark.tabarena_setup.fr_cluster_setup import ( + ALL_TASK_METADATA, + FS_TIME_LIMIT, + FSBenchmarkConfig, +) + + +def run_extra_pipeline(mode, method_name, task_id, noise, noise_type): + """Function to run the extra pipeline.""" + print( + f"Running extra pipeline with mode={mode}, method_name={method_name}, task_id={task_id}, noise={noise}, noise_type={noise_type}") + # Add your pipeline logic here + + +if __name__ == "__main__": + method_names = 
FSBenchmarkConfig().get_default_preprocessing_configs( + fs_methods=[ + "AccuracyFeatureSelector", + "RandomFeatureSelector", + "ANOVAFeatureSelector", + "CFSFeatureSelector", + "Chi2FeatureSelector", + "DISRFeatureSelector", + "GainRatioFeatureSelector", + "GiniFeatureSelector", + "ImpurityFeatureSelector", + "InformationGainFeatureSelector", + "INTERACTFeatureSelector", + "MarkovBlanketFeatureSelector", + "MIFeatureSelector", + "mRMRFeatureSelector", + "PearsonCorrelationFeatureSelector", + "ReliefFFeatureSelector", + "RFImportanceFeatureSelector", + "SequentialBackwardEliminationFeatureSelector", + "SequentialForwardSelectionFeatureSelector", + "SymmetricalUncertaintyFeatureSelector", + "LassoFeatureSelector", # just for regression but with label encoder for classification? + "LaplacianScoreFeatureSelector", # OOM, Segmentation fault issues + "ConsistencyFeatureSelector", + # selected_indices = np.where(S)[0].tolist(), UnboundLocalError: cannot access local variable 'S' where it is not associated with a value + "JMIFeatureSelector", + # time limit computed incorrectly, and error at remaining.remove(best_idx), ValueError: list.remove(x): x not in list + "OneRFeatureSelector", + # major OOM errors (tries to allocate one major array), wrong time limit computation, max(accuracies, key=accuracies.get) -> max() iterable argument is empty + "ElasticNetFeatureSelector", # Only for classification + "CMIMFeatureSelector", # problems with time limit and fallback of features + # "tTestFeatureSelector", # Does not work for regression + "CARTFeatureSelector", # Only implemented for classification, OOM problems as well + ] + ) + task_ids = pd.read_csv(ALL_TASK_METADATA)["task_id_str"].tolist() + # Define the parameter grid + modes = ["validity", "stability"] + noises = [0.5, 0.75, 1.0] + noise_types = ["gaussian"] + + # Create a SLURM executor + executor = submitit.AutoExecutor(folder=Path("slurm_logs")) + executor.update_parameters( + timeout_min=FS_TIME_LIMIT, # Job timeout 
in minutes + slurm_partition="default", # SLURM partition + cpus_per_task=8, # Number of CPUs per task + mem_gb=32, # Memory in GB + ) + + # Submit jobs + with executor.batch(): + for mode in modes: + for method_name in method_names: + for task_id in task_ids: + if mode == "validity": + for noise in noises: + for noise_type in noise_types: + executor.submit(run_extra_pipeline, mode, method_name, task_id, noise, noise_type) + else: + executor.submit(run_extra_pipeline, mode, method_name, task_id, None, None) diff --git a/tabarena/tabarena/benchmark/experiment/experiment_runner.py b/tabarena/tabarena/benchmark/experiment/experiment_runner.py index be473f908..75efe3430 100644 --- a/tabarena/tabarena/benchmark/experiment/experiment_runner.py +++ b/tabarena/tabarena/benchmark/experiment/experiment_runner.py @@ -82,11 +82,19 @@ def __init__( self.eval_metric: Scorer = get_metric(metric=self.eval_metric_name, problem_type=self.task.problem_type) self.model: AbstractExecModel | None = None self.task_split_idx = self.task.get_split_idx(fold=self.fold, repeat=self.repeat, sample=self.sample) - self.X, self.y, self.X_test, self.y_test = self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample) + + if self.task.lazy_load_data: + assert input_format == "openml", "Lazy load data only works with input_format='openml'" + self.X, self.y, self.X_test, self.y_test = None, None, None, None + _, y, _, _ = self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample) + else: + self.X, self.y, self.X_test, self.y_test = self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample) + y = self.y + if input_format == "csv": self.X = self.task.to_csv_format(X=self.X) self.X_test = self.task.to_csv_format(X=self.X_test) - self.label_cleaner = LabelCleaner.construct(problem_type=self.task.problem_type, y=self.y) + self.label_cleaner = LabelCleaner.construct(problem_type=self.task.problem_type, y=y) if cacher 
is None: cacher = CacheFunctionDummy() self.cacher = cacher @@ -105,8 +113,19 @@ def split_seed(self): """We use the split index as a source for a seed that creates different randomness per split.""" return self.task_split_idx + def _lazy_load_for_run_model_fit(self): + X, y, X_test, _ = self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample) + return X, y, X_test + + def run_model_fit(self) -> dict: - return self.model.fit_custom(X=self.X, y=self.y, X_test=self.X_test, split_seed=self.split_seed) + if self.task.lazy_load_data: + lazy_load_function = self._lazy_load_for_run_model_fit + X, y, X_test = None, None, None + else: + lazy_load_function = None + X, y, X_test = self.X, self.y, self.X_test + return self.model.fit_custom(X=X, y=y, X_test=X_test, split_seed=self.split_seed, lazy_load_function=lazy_load_function) def run(self) -> dict: out = self._run() @@ -157,11 +176,19 @@ def _run(self) -> dict: self.handle_failure(exc=exc) raise out = self.post_fit(out=out) + + if self.task.lazy_load_data: + _, _, _, y_test = self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample) + else: + y_test = self.y_test out["metric_error"] = self.evaluate( - y_true=self.y_test, + y_true=y_test, y_pred=out["predictions"], y_pred_proba=out["probabilities"], ) + if self.task.lazy_load_data: + del y_test + out = self.post_evaluate(out=out) out["experiment_metadata"] = self._experiment_metadata(time_start=time_start, time_start_str=time_start_str) out = self.convert_to_output(out=out) @@ -269,7 +296,12 @@ def post_evaluate(self, out: dict) -> dict: simulation_artifact["pred_proba_dict_test"] = self.label_cleaner.transform_proba(out["probabilities"], as_pandas=True) if self.task.problem_type == "binary": simulation_artifact["pred_proba_dict_test"] = simulation_artifact["pred_proba_dict_test"].iloc[:, 1] - simulation_artifact["y_test"] = self.label_cleaner.transform(self.y_test) + + if self.task.lazy_load_data: + _, _, _, y_test 
= self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample) + else: + y_test = self.y_test + simulation_artifact["y_test"] = self.label_cleaner.transform(y_test) if self.optimize_simulation_artifacts_memory: # optimize memory @@ -296,7 +328,12 @@ def post_evaluate(self, out: dict) -> dict: simulation_artifact["metric"] = self.eval_metric_name if self.compute_bag_info and (self.model.can_get_per_child_oof and self.model.can_get_per_child_val_idx): - simulation_artifact["bag_info"] = self.model.bag_artifact(X_test=self.X_test) + if self.task.lazy_load_data: + _, _, X_test, _ = self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample) + else: + X_test = self.X_test + + simulation_artifact["bag_info"] = self.model.bag_artifact(X_test=X_test) simulation_artifact["pred_proba_dict_val"] = {self.method: simulation_artifact["pred_proba_dict_val"]} diff --git a/tabarena/tabarena/benchmark/experiment/experiment_runner_api.py b/tabarena/tabarena/benchmark/experiment/experiment_runner_api.py index 216f9e3a6..2b7055182 100644 --- a/tabarena/tabarena/benchmark/experiment/experiment_runner_api.py +++ b/tabarena/tabarena/benchmark/experiment/experiment_runner_api.py @@ -85,16 +85,12 @@ def _parse_repetitions_mode_and_args( metadata_task_ids = tasks_metadata["task_id"].astype(str).tolist() for task in tasks: t_id = task.task_id_str if isinstance(task, UserTask) else str(task) - assert t_id in metadata_task_ids, ( - f"Task ID '{t_id}' from `tasks` not found in `tasks_metadata`" - ) + assert t_id in metadata_task_ids, f"Task ID '{t_id}' from `tasks` not found in `tasks_metadata`" task_meta = tasks_metadata[metadata_task_ids == t_id].iloc[0] n_folds = int(task_meta["num_folds"]) n_repeats = int(task_meta["tabarena_num_repeats"]) - fold_repeat_pairs = [ - (f, r) for r in range(n_repeats) for f in range(n_folds) - ] + fold_repeat_pairs = [(f, r) for r in range(n_repeats) for f in range(n_folds)] 
fold_repeat_pairs_per_task.append(fold_repeat_pairs) return fold_repeat_pairs_per_task @@ -113,17 +109,12 @@ def _parse_repetitions_mode_and_args( assert all(isinstance(rep, tuple) for rep in repetitions_mode_args), ( "If `repetitions_mode_args` for 'matrix' is a list, all elements must be tuples" ) - repetitions_mode_args = [ - _clean_repetitions_mode_args_for_matrix(rep) - for rep in repetitions_mode_args - ] + repetitions_mode_args = [_clean_repetitions_mode_args_for_matrix(rep) for rep in repetitions_mode_args] else: assert isinstance(repetitions_mode_args, tuple), ( "If `repetitions_mode_args` for 'matrix' is not a list, it must be a tuple" ) - repetitions_mode_args = [ - _clean_repetitions_mode_args_for_matrix(repetitions_mode_args) - ] * len(tasks) + repetitions_mode_args = [_clean_repetitions_mode_args_for_matrix(repetitions_mode_args)] * len(tasks) return [[(f, r) for f in e[0] for r in e[1]] for e in repetitions_mode_args] if repetitions_mode == "individual": @@ -133,15 +124,11 @@ def _parse_repetitions_mode_and_args( assert isinstance(repetitions_mode_args, list), ( "If `repetitions_mode` is 'individual', `repetitions_mode_args` must be a list" ) - assert len(repetitions_mode_args) > 0, ( - "`repetitions_mode_args` for 'individual' must not be empty" - ) + assert len(repetitions_mode_args) > 0, "`repetitions_mode_args` for 'individual' must not be empty" if isinstance(repetitions_mode_args[0], tuple): assert all( - isinstance(rep, tuple) - and (len(rep) == 2) - and all(isinstance(i, int) for i in rep) + isinstance(rep, tuple) and (len(rep) == 2) and all(isinstance(i, int) for i in rep) for rep in repetitions_mode_args ), ( "If `repetitions_mode_args` for 'individual' is a list of tuples, all elements must be tuples of integers of (fold_index, repeat_index) pairs" @@ -323,9 +310,7 @@ def run_experiments_new( """ if run_mode == "aws": if s3_kwargs is None: - raise ValueError( - f"s3_kwargs parameter is required when mode is 'aws', got {s3_kwargs}" - ) + 
raise ValueError(f"s3_kwargs parameter is required when mode is 'aws', got {s3_kwargs}") if s3_kwargs.get("bucket") is None or s3_kwargs.get("bucket") == "": raise ValueError( f"bucket parameter in s3_kwargs is required when mode is 'aws', got {s3_kwargs.get('bucket')}" @@ -334,9 +319,7 @@ def run_experiments_new( elif run_mode == "local": base_cache_path = output_dir else: - raise ValueError( - f"Invalid mode: {run_mode}. Supported modes are 'local' and 'aws'." - ) + raise ValueError(f"Invalid mode: {run_mode}. Supported modes are 'local' and 'aws'.") assert all(isinstance(exp, Experiment) for exp in model_experiments), ( "All `model_experiments` elements must be instances of Experiment class" @@ -373,23 +356,15 @@ def run_experiments_new( task, tabarena_task_name, eval_metric_name = None, None, None print(f"Starting Dataset {dataset_index + 1}/{len(tasks)}...") - for split_index, (fold, repeat) in enumerate( - fold_repeat_pairs_per_task[dataset_index], start=1 - ): - subtask_cache_name = ExperimentBatchRunner._subtask_name( - fold=fold, repeat=repeat - ) + for split_index, (fold, repeat) in enumerate(fold_repeat_pairs_per_task[dataset_index], start=1): + subtask_cache_name = ExperimentBatchRunner._subtask_name(fold=fold, repeat=repeat) print( f"Starting Split {split_index}/{len(fold_repeat_pairs_per_task[dataset_index])} (Fold {fold}, Repeat {repeat})..." ) for me_index, model_experiment in enumerate(model_experiments, start=1): cur_experiment_idx += 1 - cache_task_key = ( - task_id_or_object - if isinstance(task_id_or_object, int) - else task_id_or_object.task_id - ) + cache_task_key = task_id_or_object if isinstance(task_id_or_object, int) else task_id_or_object.task_id print( f"Starting Model {me_index}/{len(model_experiments)}..." 
f"\n\t" @@ -405,9 +380,7 @@ def run_experiments_new( cache_name = "results" cache_prefix = f"data/{model_experiment.name}/{cache_task_key}/{subtask_cache_name}" cache_path = f"{base_cache_path}/{cache_prefix}" - cacher = CacheFunctionPickle( - cache_name=cache_name, cache_path=cache_path - ) + cacher = CacheFunctionPickle(cache_name=cache_name, cache_path=cache_path) cache_exists = cacher.exists # Check cache state @@ -420,13 +393,9 @@ def run_experiments_new( if cache_mode == "only": out = cacher.load_cache() else: - if (task is None) and ( - (cache_mode == "ignore") or (not cache_exists) - ): + if (task is None) and ((cache_mode == "ignore") or (not cache_exists)): if isinstance(task_id_or_object, int): - if (s3_kwargs is not None) and ( - "dataset_cache" in s3_kwargs - ): + if (s3_kwargs is not None) and ("dataset_cache" in s3_kwargs): assert isinstance(s3_kwargs["dataset_cache"], str), ( "'s3_kwargs `dataset_cache` must be a str!" ) @@ -435,9 +404,7 @@ def run_experiments_new( s3_dataset_cache=s3_kwargs["dataset_cache"], ) else: - task = OpenMLTaskWrapper.from_task_id( - task_id=task_id_or_object - ) + task = OpenMLTaskWrapper.from_task_id(task_id=task_id_or_object) # TODO: maybe add a prefix to this. tabarena_task_name = task.task.get_dataset().name else: @@ -445,6 +412,7 @@ def run_experiments_new( task = OpenMLTaskWrapper( task=task_id_or_object.load_local_openml_task(), use_task_eval_metric=True, + lazy_load_data=True, ) eval_metric_name = task.eval_metric @@ -455,6 +423,13 @@ def run_experiments_new( from tabarena.benchmark.experiment.experiment_constructor import ( AGModelBagExperiment, ) + from tabarena.benchmark.task.user_task import TabArenaOpenMLSupervisedTask + + if not isinstance(task.task, TabArenaOpenMLSupervisedTask): + raise ValueError( + "`dynamic_tabarena_validation_protocol` is only " + "implemented for `TabArenaOpenMLSupervisedTask`!" 
+ ) if not isinstance(model_experiment, AGModelBagExperiment): # TODO: add support @@ -469,6 +444,20 @@ def run_experiments_new( **task.get_validation_split_kwargs(), ) + # FIXME: move this somewhere else and allow to enable/disable this. + # Load text cache into memory for the current task + from tabarena.benchmark.preprocessing.text_feature_generators import ( + SemanticTextFeatureGenerator, + ) + + cache_path = SemanticTextFeatureGenerator.get_text_cache_dir(task_id_str=str(task.task_id)) + if cache_path.exists(): + print("[LOADING TEXT CACHE] Loading text embedding cache into memory...") + SemanticTextFeatureGenerator._embedding_look_up = ( + SemanticTextFeatureGenerator.load_embedding_cache(path=cache_path) + ) + SemanticTextFeatureGenerator.only_load_from_cache = True + try: out = model_experiment.run( task=task, @@ -491,10 +480,7 @@ def run_experiments_new( # Safety check for results with non-finite metric errors if (out is not None) and ( - not ( - np.isfinite(out["metric_error"]) - and np.isfinite(out["metric_error_val"]) - ) + not (np.isfinite(out["metric_error"]) and np.isfinite(out["metric_error_val"])) ): print( "Non-finite final metric error detected: " diff --git a/tabarena/tabarena/benchmark/models/ag/ebm/ebm_model.py b/tabarena/tabarena/benchmark/models/ag/ebm/ebm_model.py index 4928868a6..6f91604a3 100644 --- a/tabarena/tabarena/benchmark/models/ag/ebm/ebm_model.py +++ b/tabarena/tabarena/benchmark/models/ag/ebm/ebm_model.py @@ -113,7 +113,6 @@ def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int: problem_type=self.problem_type, num_classes=self.num_classes, hyperparameters=self._get_model_params(), - features=self._features, **kwargs, ) diff --git a/tabarena/tabarena/benchmark/models/ag/knn_new/knn_model.py b/tabarena/tabarena/benchmark/models/ag/knn_new/knn_model.py index 290d805fd..4449a6393 100644 --- a/tabarena/tabarena/benchmark/models/ag/knn_new/knn_model.py +++ b/tabarena/tabarena/benchmark/models/ag/knn_new/knn_model.py @@ 
-86,3 +86,18 @@ def _fit(self, X, y, num_cpus=-1, time_limit=None, sample_weight=None, **kwargs) self.model = self._get_model_type()(**params).fit(X, y) else: self.model = self._fit_with_samples(X=X, y=y, model_params=params, time_limit=time_limit - (time.time() - time_start)) + + # Higher mem_error_threshold of 0.4 for TabArena. + def _validate_fit_memory_usage( + self, + mem_error_threshold: float = 0.4, + mem_warning_threshold: float = 0.35, + mem_size_threshold: int = 1e7, + **kwargs, + ): + return super()._validate_fit_memory_usage( + mem_error_threshold=mem_error_threshold, + mem_warning_threshold=mem_warning_threshold, + mem_size_threshold=mem_size_threshold, + **kwargs, + ) \ No newline at end of file diff --git a/tabarena/tabarena/benchmark/models/ag/realmlp/realmlp_model.py b/tabarena/tabarena/benchmark/models/ag/realmlp/realmlp_model.py index b5c720bdf..65d33aaf9 100644 --- a/tabarena/tabarena/benchmark/models/ag/realmlp/realmlp_model.py +++ b/tabarena/tabarena/benchmark/models/ag/realmlp/realmlp_model.py @@ -4,17 +4,16 @@ import logging import time from contextlib import contextmanager -from typing import TYPE_CHECKING, Literal +from typing import Literal import pandas as pd +import numpy as np from autogluon.common.utils.resource_utils import ResourceManager -from autogluon.core.models import AbstractModel +from autogluon.tabular.models.abstract.abstract_torch_model import AbstractTorchModel from sklearn.impute import SimpleImputer from autogluon.tabular import __version__ -if TYPE_CHECKING: - import numpy as np logger = logging.getLogger(__name__) @@ -31,7 +30,7 @@ def set_logger_level(logger_name: str, level: int): # pip install pytabkit -class RealMLPModel(AbstractModel): +class RealMLPModel(AbstractTorchModel): """RealMLP is an improved multilayer perception (MLP) model through a bag of tricks and better default hyperparameters. 
@@ -56,6 +55,8 @@ def __init__(self, **kwargs): self._indicator_columns = None self._features_bool = None self._bool_to_cat = None + self._cat_col_names = None + self._category_mapping = None def get_model_cls(self, default_hyperparameters: Literal["td", "td_s"] = "td"): from pytabkit import ( @@ -77,6 +78,12 @@ def get_model_cls(self, default_hyperparameters: Literal["td", "td_s"] = "td"): model_cls = RealMLP_TD_S_Regressor return model_cls + def get_device(self) -> str: + return self.model.device + + def _set_device(self, device: str): + self.model.to(device) + def _fit( self, X: pd.DataFrame, @@ -190,8 +197,7 @@ def _fit( # FIXME: In rare cases can cause exceptions if name_categories=False, unknown why extra_fit_kwargs = {} if name_categories: - cat_col_names = X.select_dtypes(include="category").columns.tolist() - extra_fit_kwargs["cat_col_names"] = cat_col_names + extra_fit_kwargs["cat_col_names"] = self._cat_col_names if X_val is not None: X_val = self.preprocess(X_val) @@ -274,6 +280,28 @@ def _preprocess( if self._bool_to_cat and self._features_bool: # FIXME: Use CategoryFeatureGenerator? Or tell the model which is category X[self._features_bool] = X[self._features_bool].astype("category") + + if is_train: + self._cat_col_names = X.select_dtypes(include="category").columns.tolist() + + # Avoid bad dtype for cat categories in later ordinal encoding. + # Maps unseen categories to a new high integer. 
+ if self._cat_col_names is not None: + if self._category_mapping is None: + self._category_mapping = {} + for col in self._cat_col_names: + cats = X[col].cat.categories + self._category_mapping[col] = {cat: code for code, cat in enumerate(cats)} + + if self._category_mapping is not None: + for col in self._cat_col_names: + mapping = self._category_mapping[col] + unseen_code = len(mapping) + nan_mask = X[col].isna() + X[col] = X[col].astype(object) + X[col] = X[col].map(mapping).fillna(unseen_code).astype(int).astype("category") + X.loc[nan_mask, col] = np.nan + return X def _set_default_params(self): @@ -330,6 +358,7 @@ def _estimate_memory_usage_static( X: pd.DataFrame, hyperparameters: dict | None = None, num_classes: int = 1, + overhead_for_large_data: float = 1.5, **kwargs, ) -> int: """RealMLP memory estimation logic.""" @@ -373,7 +402,13 @@ def _estimate_memory_usage_static( res = alg_interface.get_required_resources( ds, n_cv=1, n_refit=0, n_splits=1, split_seeds=[0], n_train=n_samples ) - return int(res.gpu_ram_gb * 1e9) + + est = int(res.gpu_ram_gb * 1e9) + + if n_samples > 250_000: + est = int(est * overhead_for_large_data) + + return est def _validate_fit_memory_usage(self, mem_error_threshold: float = 1, **kwargs): return super()._validate_fit_memory_usage( diff --git a/tabarena/tabarena/benchmark/models/ag/sap_rpt_oss/sap_rpt_oss_model.py b/tabarena/tabarena/benchmark/models/ag/sap_rpt_oss/sap_rpt_oss_model.py index 9e94b89cd..074b6845e 100644 --- a/tabarena/tabarena/benchmark/models/ag/sap_rpt_oss/sap_rpt_oss_model.py +++ b/tabarena/tabarena/benchmark/models/ag/sap_rpt_oss/sap_rpt_oss_model.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING from autogluon.common.utils.resource_utils import ResourceManager -from autogluon.core.models import AbstractModel +from autogluon.tabular.models.abstract.abstract_torch_model import AbstractTorchModel if TYPE_CHECKING: import pandas as pd @@ -14,7 +14,7 @@ # FIXME: model is for some reason super slow for 200 
features and 50k samples (363616) -class SAPRPTOSSModel(AbstractModel): +class SAPRPTOSSModel(AbstractTorchModel): """ConTextTab Model: https://github.com/SAP-samples/sap-rpt-1-oss.""" ag_key = "SAP-RPT-OSS" @@ -69,11 +69,17 @@ def _set_default_params(self): "checkpoint": "2025-11-04_sap-rpt-one-oss.pt", "max_context_size": 8192, "bagging": 8, - "test_chunk_size": 1000, # TODO, optimize based on dataset/VRAM? + "test_chunk_size": 4000, # TODO, optimize based on dataset/VRAM? } for param, val in default_params.items(): self._set_default_param_value(param, val) + def get_device(self) -> str: + return self.model.model.device + + def _set_device(self, device: str): + self.model.model.to(device) + @classmethod def supported_problem_types(cls) -> list[str] | None: return ["binary", "multiclass", "regression"] @@ -90,7 +96,7 @@ def get_minimum_resources( ) -> dict[str, int | float]: return { "num_cpus": 1, - "num_gpus": 1 if is_gpu_available else 0, + "num_gpus": 0.5 if is_gpu_available else 0, } @classmethod @@ -100,7 +106,6 @@ def _get_default_ag_args_ensemble(cls, **kwargs) -> dict: """ default_ag_args_ensemble = super()._get_default_ag_args_ensemble(**kwargs) extra_ag_args_ensemble = { - "fold_fitting_strategy": "sequential_local", "refit_folds": True, } default_ag_args_ensemble.update(extra_ag_args_ensemble) diff --git a/tabarena/tabarena/benchmark/models/ag/tabdpt/tabdpt_model.py b/tabarena/tabarena/benchmark/models/ag/tabdpt/tabdpt_model.py index dea8d2e23..c2c26cafe 100644 --- a/tabarena/tabarena/benchmark/models/ag/tabdpt/tabdpt_model.py +++ b/tabarena/tabarena/benchmark/models/ag/tabdpt/tabdpt_model.py @@ -4,7 +4,7 @@ from autogluon.common.utils.resource_utils import ResourceManager from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION -from autogluon.core.models import AbstractModel +from autogluon.tabular.models.abstract.abstract_torch_model import AbstractTorchModel from autogluon.features.generators import LabelEncoderFeatureGenerator if 
TYPE_CHECKING: @@ -12,8 +12,7 @@ import pandas as pd -# FIXME: Add CPU loading support (.to(device)) -class TabDPTModel(AbstractModel): +class TabDPTModel(AbstractTorchModel): ag_key = "TA-TABDPT" ag_name = "TA-TabDPT" seed_name = "seed" @@ -23,6 +22,8 @@ def __init__(self, **kwargs): super().__init__(**kwargs) self._feature_generator = None self._predict_hps = None + self._use_flash_og = None + def _fit( self, @@ -84,6 +85,23 @@ def _use_flash() -> bool: return capability != (7, 5) + def _post_fit(self, **kwargs): + super()._post_fit(**kwargs) + self._use_flash_og = self.model.use_flash + return self + + def get_device(self) -> str: + return self.model.device + + def _set_device(self, device: str): + self.model.to(device) + if device == "cpu": + self.model.use_flash = False + self.model.model.use_flash = False + else: + self.model.use_flash = self._use_flash_og + self.model.model.use_flash = self._use_flash_og + def _get_default_resources(self) -> tuple[int, int]: # Use only physical cores for better performance based on benchmarks num_cpus = ResourceManager.get_cpu_count(only_physical_cores=True) @@ -97,7 +115,7 @@ def get_minimum_resources( ) -> dict[str, int | float]: return { "num_cpus": 1, - "num_gpus": 1 if is_gpu_available else 0, + "num_gpus": 0.5 if is_gpu_available else 0, } def _predict_proba(self, X, **kwargs) -> np.ndarray: @@ -133,8 +151,46 @@ def _more_tags(self) -> dict: def _get_default_ag_args_ensemble(cls, **kwargs) -> dict: default_ag_args_ensemble = super()._get_default_ag_args_ensemble(**kwargs) extra_ag_args_ensemble = { - "fold_fitting_strategy": "sequential_local", "refit_folds": True, } default_ag_args_ensemble.update(extra_ag_args_ensemble) return default_ag_args_ensemble + + # FIXME: This is copied from TabPFN, but TabDPT is not the same + @classmethod + def _estimate_memory_usage_static( + cls, + *, + X: pd.DataFrame, + hyperparameters: dict | None = None, + **kwargs, + ) -> int: + """Heuristic memory estimate based on TabPFN's memory 
estimate logic in: + https://github.com/PriorLabs/TabPFN/blob/57a2efd3ebdb3886245e4d097cefa73a5261a969/src/tabpfn/model/memory.py#L147. + + This is based on GPU memory usage, but hopefully with overheads it also approximates CPU memory usage. + """ + # TODO: update, this is not correct anymore, consider using internal TabPFN functions directly. + features_per_group = 3 # Based on TabPFNv2 default (unused) + n_layers = 12 # Based on TabPFNv2 default + embedding_size = 192 # Based on TabPFNv2 default + dtype_byte_size = 2 # Based on TabPFNv2 default + + model_mem = 14489108 # Based on TabPFNv2 default + + n_samples, n_features = X.shape[0], min(X.shape[1], 500) + n_feature_groups = (n_features) / features_per_group + 1 # TODO: Unsure how to calculate this + + X_mem = n_samples * n_feature_groups * dtype_byte_size + activation_mem = n_samples * n_feature_groups * embedding_size * n_layers * dtype_byte_size + + baseline_overhead_mem_est = 1e9 # 1 GB generic overhead + + # Add some buffer to each term + 1 GB overhead to be safe + memory_estimate = model_mem + 4 * X_mem + 2 * activation_mem + baseline_overhead_mem_est + + # TabDPT memory estimation is very inaccurate because it is using TabPFN memory estimate. Double it to be safe. 
+ memory_estimate = memory_estimate * 2 + + # Note: This memory estimate is way off if `context_size` is not None + return int(memory_estimate) diff --git a/tabarena/tabarena/benchmark/models/ag/tabicl/tabicl_model.py b/tabarena/tabarena/benchmark/models/ag/tabicl/tabicl_model.py index d6e9dea21..8c87c6770 100644 --- a/tabarena/tabarena/benchmark/models/ag/tabicl/tabicl_model.py +++ b/tabarena/tabarena/benchmark/models/ag/tabicl/tabicl_model.py @@ -5,7 +5,7 @@ from autogluon.common.utils.pandas_utils import get_approximate_df_mem_usage from autogluon.common.utils.resource_utils import ResourceManager -from autogluon.core.models import AbstractModel +from autogluon.tabular.models.abstract.abstract_torch_model import AbstractTorchModel from autogluon.tabular import __version__ if TYPE_CHECKING: @@ -14,7 +14,7 @@ logger = logging.getLogger(__name__) -class TabICLModelBase(AbstractModel): +class TabICLModelBase(AbstractTorchModel): """TabICL is a foundation model for tabular data using in-context learning that is scalable to larger datasets than TabPFNv2. It is pretrained purely on synthetic data. TabICL currently only supports classification tasks. 
@@ -238,6 +238,19 @@ def _set_default_params(self): for param, val in default_params.items(): self._set_default_param_value(param, val) + def get_device(self) -> str: + return self.model.device_.type + + # TODO: Better to have an official TabICL method for this + def _set_device(self, device: str): + device = self.to_torch_device(device) + self.model.device_ = device + self.model.device = self.model.device_.type + self.model.model_ = self.model.model_.to(self.model.device_) + self.model.inference_config_.COL_CONFIG.device = self.model.device_ + self.model.inference_config_.ROW_CONFIG.device = self.model.device_ + self.model.inference_config_.ICL_CONFIG.device = self.model.device_ + class TabICLv2Model(TabICLModelBase): """TabICLv2 model as used on TabArena.""" diff --git a/tabarena/tabarena/benchmark/models/ag/tabm/_tabm_internal.py b/tabarena/tabarena/benchmark/models/ag/tabm/_tabm_internal.py index 58e164165..609aa7258 100644 --- a/tabarena/tabarena/benchmark/models/ag/tabm/_tabm_internal.py +++ b/tabarena/tabarena/benchmark/models/ag/tabm/_tabm_internal.py @@ -43,7 +43,7 @@ def get_tabm_auto_batch_size(n_train: int) -> int: return 256 if n_train < 108_000: return 512 - return 1024 + return 768 # Adjust to be lower to fit on 80 GB for very large datasets. 
class RTDLQuantileTransformer(BaseEstimator, TransformerMixin): diff --git a/tabarena/tabarena/benchmark/models/ag/tabm/tabm_model.py b/tabarena/tabarena/benchmark/models/ag/tabm/tabm_model.py index 3f974d5d3..2895abff9 100644 --- a/tabarena/tabarena/benchmark/models/ag/tabm/tabm_model.py +++ b/tabarena/tabarena/benchmark/models/ag/tabm/tabm_model.py @@ -10,14 +10,14 @@ import pandas as pd from autogluon.common.utils.resource_utils import ResourceManager -from autogluon.core.models import AbstractModel +from autogluon.tabular.models.abstract.abstract_torch_model import AbstractTorchModel from autogluon.tabular import __version__ logger = logging.getLogger(__name__) -class TabMModel(AbstractModel): +class TabMModel(AbstractTorchModel): """TabM is an efficient ensemble of MLPs that is trained simultaneously with mostly shared parameters. TabM is one of the top performing methods overall on TabArena-v0.1: https://tabarena.ai @@ -143,6 +143,14 @@ def _preprocess( return X + def get_device(self) -> str: + return self.model.device_.type + + def _set_device(self, device: str): + device = self.to_torch_device(device) + self.model.device_ = device + self.model.model_ = self.model.model_.to(device) + def _set_default_params(self): default_params = dict( random_state=0, @@ -260,9 +268,16 @@ def _estimate_tabm_ram( mem_ds = n_samples * (4 * n_numerical + 8 * len(cat_sizes)) # some safety constants and offsets (the 5 is probably excessive) - return ( + res = ( 5 * mem_ds + 1.2 * mem_forward_backward + 1.2 * mem_params + 0.3 * (1024**3) ) + # Safety overhead + res = res * 1.5 + logger.log( + 40, + f"\tEstimated memory usage {res/1e9:4}.", + ) + return res @classmethod def get_tabm_auto_batch_size(cls, n_samples: int) -> int: @@ -277,7 +292,7 @@ def get_tabm_auto_batch_size(cls, n_samples: int) -> int: return 256 if n_samples < 108_000: return 512 - return 1024 + return 768 # Adjust to be lower to fit on 80 GB for very large datasets. 
@classmethod def _class_tags(cls): diff --git a/tabarena/tabarena/benchmark/models/ag/tabpfnv2_5/tabpfnv2_5_model.py b/tabarena/tabarena/benchmark/models/ag/tabpfnv2_5/tabpfnv2_5_model.py index 688e8c2d9..7404f023a 100644 --- a/tabarena/tabarena/benchmark/models/ag/tabpfnv2_5/tabpfnv2_5_model.py +++ b/tabarena/tabarena/benchmark/models/ag/tabpfnv2_5/tabpfnv2_5_model.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING from autogluon.common.utils.resource_utils import ResourceManager -from autogluon.core.models import AbstractModel +from autogluon.tabular.models.abstract.abstract_torch_model import AbstractTorchModel from autogluon.features.generators import LabelEncoderFeatureGenerator if TYPE_CHECKING: @@ -17,7 +17,7 @@ _HAS_LOGGED_TABPFN_LICENSE: bool = False -class TabPFNModel(AbstractModel): +class TabPFNModel(AbstractTorchModel): """TabPFN-2.5 is a tabular foundation model that is developed and maintained by PriorLabs: https://priorlabs.ai/. This class is an abstract template for various TabPFN versions as subclasses. 
@@ -267,6 +267,12 @@ def _set_default_params(self): for param, val in default_params.items(): self._set_default_param_value(param, val) + def get_device(self) -> str: + return self.model.devices_[0].type + + def _set_device(self, device: str): + self.model.to(device) + @classmethod def supported_problem_types(cls) -> list[str] | None: return ["binary", "multiclass", "regression"] diff --git a/tabarena/tabarena/benchmark/models/wrapper/AutoGluon_class.py b/tabarena/tabarena/benchmark/models/wrapper/AutoGluon_class.py index 9efc1c992..928b2c641 100644 --- a/tabarena/tabarena/benchmark/models/wrapper/AutoGluon_class.py +++ b/tabarena/tabarena/benchmark/models/wrapper/AutoGluon_class.py @@ -50,14 +50,11 @@ def _resolve_validation_protocol( init_kwargs = copy.deepcopy(self.init_kwargs) fit_kwargs = copy.deepcopy(self.fit_kwargs) - # TODO: think about if we can reset the index here without breaking simulation artifacts - train_data = X.copy() - num_folds = fit_kwargs.pop("num_bag_folds", None) num_repeats = fit_kwargs.pop("num_bag_folds", None) custom_splits, num_folds, num_repeats = self.resolve_validation_splits( - X=train_data.reset_index(drop=True), + X=X.reset_index(drop=True), y=y.reset_index(drop=True), num_folds=num_folds, num_repeats=num_repeats @@ -89,9 +86,18 @@ def _resolve_validation_protocol( feature_generator_kwargs[param] = value fit_kwargs["feature_generator"] = feature_generator_cls(**feature_generator_kwargs) + # TODO: think about if we can reset the index here without breaking simulation artifacts + if self._can_use_data_in_place: + train_data = X + if X_val is not None: + tuning_data = X_val + else: + train_data = X.copy() + if X_val is not None: + tuning_data = X_val.copy() + train_data[self.label] = y if X_val is not None: - tuning_data = X_val.copy() tuning_data[self.label] = y_val fit_kwargs["tuning_data"] = tuning_data @@ -115,7 +121,6 @@ def _fit( X_val=X_val, y_val=y_val, ) - del X, y, X_val, y_val # encourage memory release self.predictor = 
TabularPredictor( label=self.label, diff --git a/tabarena/tabarena/benchmark/models/wrapper/abstract_class.py b/tabarena/tabarena/benchmark/models/wrapper/abstract_class.py index a86eba329..900e37af3 100644 --- a/tabarena/tabarena/benchmark/models/wrapper/abstract_class.py +++ b/tabarena/tabarena/benchmark/models/wrapper/abstract_class.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import Callable + import numpy as np import pandas as pd from autogluon.core.data.label_cleaner import LabelCleaner, LabelCleanerDummy @@ -9,7 +11,6 @@ from tabarena.utils.time_utils import Timer from tabarena.benchmark.models.wrapper.validation_utils import TabArenaValidationProtocolExecMixin - class AbstractExecModel(TabArenaValidationProtocolExecMixin): can_get_error_val = False can_get_oof = False @@ -44,6 +45,7 @@ def __init__( self._feature_generator = None self.failure_artifact = None self.shuffle_features = shuffle_features + self._can_use_data_in_place = False def transform_y(self, y: pd.Series) -> pd.Series: return self.label_cleaner.transform(y) @@ -76,42 +78,28 @@ def _preprocess_fit_transform(self, X: pd.DataFrame, y: pd.Series): def post_fit(self, X: pd.DataFrame, y: pd.Series, X_test: pd.DataFrame): pass - def fit_custom(self, X: pd.DataFrame, y: pd.Series, X_test: pd.DataFrame, *, split_seed: int | None = None) -> dict: - og_index = X_test.index - inv_perm = None - - if self.shuffle_test: - perm, inv_perm = _make_perm(len(X_test), seed=self.shuffle_seed) - X_test = X_test.iloc[perm] - if self.reset_index_test: - X_test = X_test.reset_index(drop=True) - if self.shuffle_features: - assert split_seed is not None, "If shuffle_features is True, split_seed must not be None!" 
- features = list(X.columns) - rng = np.random.default_rng(seed=split_seed) - rng.shuffle(features) - X, X_test = X[features], X_test[features] - - out = self._fit_custom(X=X, y=y, X_test=X_test) - - if self.shuffle_test: - # Inverse-permute outputs back to original X_test order - out["predictions"] = _apply_inv_perm(out["predictions"], inv_perm, index=og_index) - if out["probabilities"] is not None: - out["probabilities"] = _apply_inv_perm(out["probabilities"], inv_perm, index=og_index) - elif self.reset_index_test: - out["predictions"].index = og_index - if out["probabilities"] is not None: - out["probabilities"].index = og_index - - return out - # TODO: Prateek, Add a toggle here to see if user wants to fit or fit and predict, also add model saving functionality # TODO: Nick: Temporary name - def _fit_custom(self, X: pd.DataFrame, y: pd.Series, X_test: pd.DataFrame) -> dict: + def fit_custom( + self, + X: pd.DataFrame | None, + y: pd.Series | None, + X_test: pd.DataFrame | None, + *, + split_seed: int | None = None, + lazy_load_function: Callable | None = None + ) -> dict: """ Calls the fit function of the inheriting class and proceeds to perform predictions based on the problem type + Arguments + --------- + split_seed: + If not None, the seed that is different per split to use for shuffling features. + lazy_load_function: + If not None, a function that one can call to load X, y, X_test lazily (e.g. to save memory by not + loading them until needed). If provided, X, y, and X_test arguments must be None. 
+ Returns ------- dict @@ -119,9 +107,38 @@ def _fit_custom(self, X: pd.DataFrame, y: pd.Series, X_test: pd.DataFrame) -> di """ from tabarena.utils.memory_utils import CpuMemoryTracker, GpuMemoryTracker + if lazy_load_function is not None: + assert X is None and y is None and X_test is None, "If lazy_load_function is provided, X and y must be None" + X, y, _ = lazy_load_function() + self._can_use_data_in_place = True + + shuffled_features = None + if self.shuffle_features: + assert split_seed is not None, "If shuffle_features is True, split_seed must not be None!" + shuffled_features = list(X.columns) + rng = np.random.default_rng(seed=split_seed) + rng.shuffle(shuffled_features) + X = X[shuffled_features] + with CpuMemoryTracker() as cpu_tracker, GpuMemoryTracker(device=0) as gpu_tracker, Timer() as timer_fit: self.fit(X, y) + # Reload all, allows X,y to be used in-place + if lazy_load_function is not None: + del X, y, X_test # Free memory from previous load + X, y, X_test = lazy_load_function() + + og_index = X_test.index + inv_perm = None + if self.shuffle_test: + perm, inv_perm = _make_perm(len(X_test), seed=self.shuffle_seed) + X_test = X_test.iloc[perm] + if self.reset_index_test: + X_test = X_test.reset_index(drop=True) + if shuffled_features is not None: + X_test = X_test[shuffled_features] + X = X[shuffled_features] + self.post_fit(X=X, y=y, X_test=X_test) if self.problem_type in ["binary", "multiclass"]: @@ -152,6 +169,16 @@ def _fit_custom(self, X: pd.DataFrame, y: pd.Series, X_test: pd.DataFrame) -> di gpu_tracking_enabled=gpu_tracker.enabled, ) + if self.shuffle_test: + # Inverse-permute outputs back to original X_test order + out["predictions"] = _apply_inv_perm(out["predictions"], inv_perm, index=og_index) + if out["probabilities"] is not None: + out["probabilities"] = _apply_inv_perm(out["probabilities"], inv_perm, index=og_index) + elif self.reset_index_test: + out["predictions"].index = og_index + if out["probabilities"] is not None: + 
out["probabilities"].index = og_index + return out def fit(self, X: pd.DataFrame, y: pd.Series, X_val=None, y_val=None): diff --git a/tabarena/tabarena/benchmark/models/wrapper/validation_utils.py b/tabarena/tabarena/benchmark/models/wrapper/validation_utils.py index e6b24d841..54b752781 100644 --- a/tabarena/tabarena/benchmark/models/wrapper/validation_utils.py +++ b/tabarena/tabarena/benchmark/models/wrapper/validation_utils.py @@ -139,9 +139,9 @@ def resolve_validation_splits( stratify_on_data = None if self.stratify_on is not None: - stratify_on_data = ( - X[self.stratify_on] if self.stratify_on in X.columns else y - ) + stratify_on_data = X[self.stratify_on] if self.stratify_on in X.columns else y + # Enforce categorical dtype for stratification column, as some splitting logic relies on it. + stratify_on_data = stratify_on_data.astype("category") groups_data = None group_labels = None @@ -149,14 +149,10 @@ def resolve_validation_splits( raise NotImplementedError if self.time_on is not None: - groups_data, num_folds_new = self.time_on_to_groups_data( - X=X, time_on=self.time_on, num_folds=num_folds - ) + groups_data, num_folds_new = self.time_on_to_groups_data(X=X, time_on=self.time_on, num_folds=num_folds) num_repeats = 1 logger.info( - f"\n\tFolds time-based grouping: before={num_folds}; " - f"after={num_folds_new}" - f"\n\tnum_repeats set to 1!" + f"\n\tFolds time-based grouping: before={num_folds}; after={num_folds_new}\n\tnum_repeats set to 1!" ) num_folds = num_folds_new # Set group labels as needed for time split @@ -167,6 +163,19 @@ def resolve_validation_splits( group_labels = self.group_labels if groups_data is not None: + if num_repeats is None: + num_repeats = 1 + + n_groups = groups_data.nunique() + if n_groups < num_folds: + logger.info( + f"Number of unique groups in the data ({n_groups}) is less than the " + f"number of folds ({num_folds})! 
Adjusting the number of folds to be equal to the number of " + f"unique groups, and setting num_repeats to 1." + ) + num_folds = n_groups + num_repeats = 1 + custom_splits = self._resolve_group_splits( X=X, num_folds=num_folds, @@ -179,28 +188,54 @@ def resolve_validation_splits( # Sanity checks for custom splits if custom_splits is not None: for train_idx, test_idx in custom_splits: + + assert len(train_idx) > 0, "Train split is empty!" + assert len(test_idx) > 0, "Test split is empty!" + if stratify_on_data is not None: - stratify_values = stratify_on_data.unique() - train_stratify_values = set( - stratify_on_data.iloc[train_idx].unique() - ) + stratify_values = set(stratify_on_data.unique()) + train_stratify_values = set(stratify_on_data.iloc[train_idx].unique()) test_stratify_values = set(stratify_on_data.iloc[test_idx].unique()) - assert ( - train_stratify_values - == test_stratify_values - == set(stratify_values) - ), ( - f"Stratification values in train and test splits do not match!" + + assert train_stratify_values == stratify_values, ( + "[Missing Train Stratification Values] " + "Stratification values in train split do not match overall stratification values!" f"\n\tOverall stratification values: {stratify_values}" f"\n\tTrain stratification values: {train_stratify_values}" + ) + assert test_stratify_values.issubset(train_stratify_values), ( + "[Unseen Test Stratification Values] " + "Stratification values in test split are not a subset of train stratification values!" + f"\n\tTrain stratification values: {train_stratify_values}" f"\n\tTest stratification values: {test_stratify_values}" ) + if train_stratify_values != stratify_values: + # Check if test has all labels for binary, as metrics require it. + if len(stratify_values) == 2: + raise ValueError( + "[Binary Metric Missing Stratification Values in Test] " + "Stratification values in train and test splits do not match!" 
+                        f"\n\tOverall stratification values: {stratify_values}"
+                        f"\n\tTrain stratification values: {train_stratify_values}"
+                        f"\n\tTest stratification values: {test_stratify_values}"
+                    )
+
+                    # For multi-stratify values, a stratify value missing from the test split is
+                    # tolerated, but we warn about it.
+                    logger.warning(
+                        "[Stratification Value Missing in Test Data] "
+                        "Stratification values in train and test splits are not identical. "
+                        "This means the validation data is likely missing some classes."
+                        f"\n\tOverall stratification values: {stratify_values}"
+                        f"\n\tTrain stratification values: {train_stratify_values}"
+                        f"\n\tTest stratification values: {test_stratify_values}"
+                    )
+
         return custom_splits, num_folds, num_repeats

     def _resolve_number_of_splits(
-        self, *, num_folds: int, num_repeats: int, num_group_instances: int
-    ) -> tuple[int, int]:
+        self, *, num_folds: int, num_repeats: int | None, num_group_instances: int
+    ) -> tuple[int, int | None]:
         """Determine the number of splits we want to use.

         Parameters
@@ -213,25 +248,30 @@ def _resolve_number_of_splits(
             The number of group instances in the data.
         """
         new_num_folds, new_num_repeats = None, None
+        new_num_folds_reason, new_num_repeats_reason = "", ""

         if num_group_instances <= self.max_samples_for_tiny_data:
             new_num_folds = self.tiny_data_num_folds
             new_num_repeats = self.tiny_data_num_repeats
+            new_num_folds_reason += "Tiny data"
+            new_num_repeats_reason += "Tiny data"
         else:
             # We want these by default for all other data in our benchmark.
             assert num_folds == 8
             assert (num_repeats == 1) or (num_repeats is None)
+
+
         if new_num_folds is not None:
             logger.info(
-                f"\nUpdating num_bag_folds from {new_num_folds} to {new_num_folds}"
-                f" since number of group instances is less than num_bag_folds."
+                f"\nUpdating num_bag_folds from {num_folds} to {new_num_folds} "
+                f"because: {new_num_folds_reason}"
             )
             num_folds = new_num_folds

         if new_num_repeats is not None:
             logger.info(
-                f"\nUpdating num_bag_sets from {num_repeats} to {new_num_repeats}"
-                f" since number of group instances is less than num_bag_folds."
+                f"\nUpdating num_repeats from {num_repeats} to {new_num_repeats} "
+                f"because: {new_num_repeats_reason}"
             )
             num_repeats = new_num_repeats

@@ -314,17 +354,13 @@ def group_on_to_groups_data(*, X: pd.DataFrame, group_on: str | list[str]):
         return groups_data.copy()

     @staticmethod
-    def time_on_to_groups_data(
-        *, X: pd.DataFrame, time_on: str, num_folds: int
-    ) -> tuple[pd.Series, int]:
+    def time_on_to_groups_data(*, X: pd.DataFrame, time_on: str, num_folds: int) -> tuple[pd.Series, int]:
         """Go from time column to a group column for splits."""
         time_data = X[time_on]

         if pd.api.types.is_datetime64_any_dtype(time_data):
-            time_data = time_data.view("int64")
-        assert pd.api.types.is_numeric_dtype(time_data), (
-            "Time_on column is not datetime or numeric!"
-        )
+            time_data = time_data.astype("int64")
+        assert pd.api.types.is_numeric_dtype(time_data), "Time_on column is not datetime or numeric!"

         return split_time_index_into_intervals(
             time_data=time_data,
@@ -345,7 +381,7 @@ def _get_group_columns_to_drop(self) -> list[str]:
             cols += self.group_on if isinstance(self.group_on, list) else [self.group_on]
         return cols

-    def get_num_group_instances(self, X: pd.DataFrame):
+    def get_num_group_instances(self, X: pd.DataFrame, *, group_labels: None = None) -> int:
         """Compute the number of rows that represent how much (multi-instance) samples the data has.

         This is used to determine which splits to use.
         """
@@ -405,9 +441,7 @@ def split_time_index_into_intervals(

     n_unique = len(counts)
     if n_unique < 2:
-        raise ValueError(
-            "Need at least 2 unique time values to create at least 2 intervals."
- ) + raise ValueError("Need at least 2 unique time values to create at least 2 intervals.") actual_n_intervals = min(goal_n_intervals, n_unique) if actual_n_intervals < 2: raise ValueError("Could not create at least 2 intervals.") diff --git a/tabarena/tabarena/benchmark/preprocessing/model_agnostic_default_preprocessing.py b/tabarena/tabarena/benchmark/preprocessing/model_agnostic_default_preprocessing.py index 724c7cd59..660bbd5c2 100644 --- a/tabarena/tabarena/benchmark/preprocessing/model_agnostic_default_preprocessing.py +++ b/tabarena/tabarena/benchmark/preprocessing/model_agnostic_default_preprocessing.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import pandas as pd from autogluon.common.features.types import ( R_BOOL, @@ -31,6 +33,9 @@ ) from tabarena.benchmark.task.user_task import GroupLabelTypes +if TYPE_CHECKING: + from autogluon.common.features.feature_metadata import FeatureMetadata + # TODO: we likely need some kind of off-loading logic for text features class TabArenaModelAgnosticPreprocessing(AutoMLPipelineFeatureGenerator): @@ -39,12 +44,12 @@ class TabArenaModelAgnosticPreprocessing(AutoMLPipelineFeatureGenerator): def __init__( self, *, - enable_datetime_features: bool = False, - enable_text_ngram_features: bool = False, - enable_text_special_features: bool = True, enable_sematic_text_features: bool = True, - enable_statistical_text_features: bool = True, enable_new_datetime_features: bool = True, + enable_text_special_features: bool = False, + enable_statistical_text_features: bool = False, + enable_text_ngram_features: bool = False, + enable_datetime_features: bool = False, group_cols: str | list[str] | None = None, group_labels: GroupLabelTypes | None = None, group_time_on: str | None = None, @@ -107,6 +112,35 @@ def __init__( **kwargs, ) + def fit_transform( + self, X: pd.DataFrame, y: pd.Series | None = None, feature_metadata_in: FeatureMetadata = None, **kwargs + ) -> pd.DataFrame: + """Rename 
columns with '.' before AutoGluon stores feature metadata. + + AutoGluon's ``AbstractFeatureGenerator.fit_transform`` records ``features_in`` + from the *original* X before calling ``_fit_transform``. We must therefore + rename at the public API level so that the stored metadata matches what the + downstream generators will see. + + The ``"."`` character is reserved as the source-column separator in text + feature names produced downstream (e.g. ``TextSpecialFeatureGenerator`` + produces ``{col}.char_count``). Sanitizing raw column names here prevents + parsing ambiguity in + ``TextEmbeddingDimensionalityReductionFeatureGenerator._parse_source_column``. + """ + self._dot_rename_map_: dict[str, str] = {c: str(c).replace(".", "_") for c in X.columns if "." in str(c)} + if self._dot_rename_map_: + X = X.rename(columns=self._dot_rename_map_) + if feature_metadata_in is not None: + feature_metadata_in = feature_metadata_in.rename_features(rename_map=self._dot_rename_map_) + return super().fit_transform(X, y=y, feature_metadata_in=feature_metadata_in, **kwargs) + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """Apply the same dot-renaming as fit before passing to parent transform.""" + if self._dot_rename_map_: + X = X.rename(columns=self._dot_rename_map_) + return super().transform(X) + def _get_category_feature_generator(self): # Pass categorical columns through *without* encoding. # Cat handling is deferred to TabArenaModelSpecificPreprocessing. @@ -129,43 +163,25 @@ def _get_category_feature_generator(self): # TODO: maybe better cardinality threshold but we assume we only # run on well-curated data for now class StringFixAsTypeFeatureGenerator(AsTypeFeatureGenerator): - """Custom AsTypeFeatureGenerator to fix string dtype handling and column name sanitization. + """Custom AsTypeFeatureGenerator to fix string dtype handling. The default string detection from AutoGluon is hardcoded in a weird way. 
Thus, we overwrite it here before passing feature metadata to the rest of the pipeline. We overwrite it such that we believe the dtype of the input dataframe. - Additionally, any input column whose name contains ``"."`` is renamed so that - ``"."`` is replaced by ``"_"``. The ``"."`` character is reserved as the - source-column separator in text feature names produced downstream (e.g. - ``TextSpecialFeatureGenerator`` produces ``{col}.char_count``). Sanitizing - raw column names here prevents parsing ambiguity in - ``TextEmbeddingDimensionalityReductionFeatureGenerator._parse_source_column``. - We further adjust the original logic to better handle unseen categories or suddenly appearing - nan values at test time. - + nan values at test time: + + * **Categorical columns** — unknown category values at test time are preserved + (not silently mapped to NaN) by converting through ``object`` dtype. + * **Bool columns** — columns with exactly 2 unique values at fit time are + bool-encoded to int8 (``true_val`` → 1, else → 0). If a bool column gains + additional unique values at test time, the normal bool encoding still applies + and the unseen values are mapped to 0 (False). A warning is logged. + * **Int columns** — NaN values that appear at test time but were absent during + fit are imputed to 0. """ - def fit_transform(self, X: pd.DataFrame, y: pd.Series | None = None, **kwargs) -> pd.DataFrame: - """Rename columns with '.' before AutoGluon stores feature metadata. - - AutoGluon's ``AbstractFeatureGenerator.fit_transform`` records ``features_in`` - from the *original* X before calling ``_fit_transform``. We must therefore - rename at the public API level so that the stored metadata matches what the - parent's ``_fit_transform`` will see. - """ - self._dot_rename_map_: dict[str, str] = {c: str(c).replace(".", "_") for c in X.columns if "." 
in str(c)} - if self._dot_rename_map_: - X = X.rename(columns=self._dot_rename_map_) - return super().fit_transform(X, y=y, **kwargs) - - def transform(self, X: pd.DataFrame) -> pd.DataFrame: - """Apply the same dot-renaming as fit before passing to parent transform.""" - if self._dot_rename_map_: - X = X.rename(columns=self._dot_rename_map_) - return super().transform(X) - def _handle_nan_in_int_only_at_test_time(self, X: pd.DataFrame) -> pd.DataFrame: """Handle int features that contain null values at inference time but not at fit time. This logic is copied from the original AsTypeFeatureGenerator._transform. @@ -188,16 +204,13 @@ def _handle_nan_in_int_only_at_test_time(self, X: pd.DataFrame) -> pd.DataFrame: return X - def _handle_dtype_mismatch_at_test_time(self, X: pd.DataFrame, bool_cols_with_extra_cats: set) -> pd.DataFrame: + def _handle_dtype_mismatch_at_test_time(self, X: pd.DataFrame) -> pd.DataFrame: """Handle situation where dtypes of test data do not match those of training data. The logic is split between cat and non-cat features to avoid the issue where astype(CategoricalDtype(categories=[...])) silently maps unknown categories to NaN. By converting through object dtype first, we ensure that all values are preserved as valid categories, even if they were not seen during training. - bool_cols_with_extra_cats are excluded from non_cat_type_map because they - are still typed as int8 in _type_map_real_opt but have not been bool-encoded; - trying to astype them to int8 would silently discard the extra category values. """ # TODO: Confirm this works with sparse and other feature types! 
# FIXME: Address situation where test-time invalid type values cause crash: @@ -210,9 +223,7 @@ def _handle_dtype_mismatch_at_test_time(self, X: pd.DataFrame, bool_cols_with_ex col: dtype for col, dtype in self._type_map_real_opt.items() if isinstance(dtype, pd.CategoricalDtype) } non_cat_type_map = { - col: dtype - for col, dtype in self._type_map_real_opt.items() - if not isinstance(dtype, pd.CategoricalDtype) and col not in bool_cols_with_extra_cats + col: dtype for col, dtype in self._type_map_real_opt.items() if not isinstance(dtype, pd.CategoricalDtype) } if non_cat_type_map: try: @@ -225,38 +236,35 @@ def _handle_dtype_mismatch_at_test_time(self, X: pd.DataFrame, bool_cols_with_ex X[col] = X[col].astype(object).astype(pd.CategoricalDtype(ordered=dtype.ordered)) return X - def _handle_bool_cols_with_extra_cats_at_test_time(self, X: pd.DataFrame) -> tuple[pd.DataFrame, set]: - """Handle situation where bool columns gain extra categories at test time. + def _handle_bool_cols_with_unseen_values_at_test_time(self, X: pd.DataFrame) -> pd.DataFrame: + """Handle bool columns that gain unseen values at test time. - If a bool column gains more than the expected 2 unique non-null values at test time, - we skip bool-encoding for that column and convert it to categorical at the end of the transform method. - This is because encoding a 3rd value through the bool path (== true_val → 1, else → 0) silently maps unknown categories to 0 (false). - By skipping bool-encoding and converting to categorical, we ensure that all values are preserved as valid categories, - even if they were not seen during training. + Bool columns are always bool-encoded (``true_val`` → 1, else → 0) + regardless of whether unseen values appear. This means unseen values + are silently mapped to 0 (False), which keeps the output dtype + identical to training (int8) and avoids downstream dtype mismatches. + A warning is logged for each affected column. 
         """
-        bool_cols_with_extra_cats = {
+        bool_cols_with_unseen = {
             col for col in self._bool_features if col in X.columns and X[col].dropna().nunique() > 2
         }
-        if bool_cols_with_extra_cats:
-            saved_extra = {col: self._bool_features.pop(col) for col in bool_cols_with_extra_cats}
-            self._set_bool_features_val()
+        for col in bool_cols_with_unseen:
+            self._log(
+                level=20,
+                msg=f"WARNING: Bool column '{col}' has more than 2 unique non-null values at test time. "
+                "Unseen values will be mapped to 0 (False). "
+                "Consider passing this column with >2 values at train time to avoid bool encoding, or "
+                "force to treat this as a numerical column!",
+            )

         if self._bool_features:
             X = self._convert_to_bool(X)

-        if bool_cols_with_extra_cats:
-            self._bool_features.update(saved_extra)
-            self._set_bool_features_val()
-        return X, bool_cols_with_extra_cats
+        return X

     def _transform(self, X: pd.DataFrame) -> pd.DataFrame:
         """Override the default handling for unseen values!"""
-        # Identify bool columns that gained more than the expected 2 unique non-null values
-        # at test time. Encoding a 3rd value through the bool path (== true_val → 1, else → 0)
-        # silently maps unknown categories to 0 (false). We instead skip bool-encoding for
-        # those columns and convert them to categorical at the end of this method.
- bool_cols_with_extra_cats: set[str] = set() if self._bool_features: - X, bool_cols_with_extra_cats = self._handle_bool_cols_with_extra_cats_at_test_time(X) + X = self._handle_bool_cols_with_unseen_values_at_test_time(X) # This means we have unobserved nans/categories if self._type_map_real_opt != X.dtypes.to_dict(): @@ -264,18 +272,12 @@ def _transform(self, X: pd.DataFrame) -> pd.DataFrame: X = self._handle_nan_in_int_only_at_test_time(X) if self._type_map_real_opt: - X = self._handle_dtype_mismatch_at_test_time(X, bool_cols_with_extra_cats=bool_cols_with_extra_cats) - - # Convert bool columns that gained extra categories to categorical so that - # all values (including novel ones) are preserved rather than silently mapped to 0. - for col in bool_cols_with_extra_cats: - if col in X.columns: - X[col] = X[col].astype(object).astype(pd.CategoricalDtype(ordered=False)) + X = self._handle_dtype_mismatch_at_test_time(X) return X def _fit_transform(self, X: pd.DataFrame, **kwargs) -> tuple[pd.DataFrame, dict]: - # X arrives here with '.' already replaced by '_' (done in fit_transform above). + # X arrives here with '.' already replaced by '_' (done in TabArenaModelAgnosticPreprocessing.fit_transform). 
X, type_group_map_special = super()._fit_transform(X=X, **kwargs) found_text_cols = type_group_map_special.get("text", []) diff --git a/tabarena/tabarena/benchmark/preprocessing/model_specific_default_preprocessing.py b/tabarena/tabarena/benchmark/preprocessing/model_specific_default_preprocessing.py index 0e391753d..1c4281834 100644 --- a/tabarena/tabarena/benchmark/preprocessing/model_specific_default_preprocessing.py +++ b/tabarena/tabarena/benchmark/preprocessing/model_specific_default_preprocessing.py @@ -40,6 +40,7 @@ class TabArenaModelSpecificPreprocessing: """ hp_key_kwargs: str = "ag.model_specific_feature_generator_kwargs" + use_pca: bool = False @staticmethod def add_to_hyperparameters(hyperparameters: dict) -> dict: @@ -85,24 +86,28 @@ def get_model_specific_generator() -> list: ``S_TEXT_EMBEDDING`` / ``S_TEXT_SPECIAL`` features, grouped by source column. """ # TODO: figure out how to more easily pass IdentityFeatureGenerator / dont drop other columns. - bulk_kwargs = dict( - generators=[ - # Cat/Ordinal Encoding - [ - # The other features are consumed, and thus can be dropped. - IdentityFeatureGenerator( - infer_features_in_args=NoCatAsStringCategoryFeatureGenerator.get_infer_features_in_args_to_drop() - ), - NoCatAsStringCategoryFeatureGenerator(), - ], - # PCA + + generators = [ + [ + # The other features are consumed, and thus can be dropped. 
+ IdentityFeatureGenerator( + infer_features_in_args=NoCatAsStringCategoryFeatureGenerator.get_infer_features_in_args_to_drop() + ), + NoCatAsStringCategoryFeatureGenerator(), + ], + ] + if TabArenaModelSpecificPreprocessing.use_pca: + generators.append( [ IdentityFeatureGenerator( infer_features_in_args=TextEmbeddingDimensionalityReductionFeatureGenerator.get_infer_features_in_args_to_drop() ), TextEmbeddingDimensionalityReductionFeatureGenerator(), - ], - ], + ] + ) + + bulk_kwargs = dict( + generators=generators, verbosity=2, ) @@ -113,19 +118,8 @@ class NoCatAsStringCategoryFeatureGenerator(CategoryFeatureGenerator): """CategoryFeatureGenerator that does not treat each string column as a category. - CategoryFeatureGenerator that preserves unseen categories instead of NaN. - - At transform time, category values not seen during fit are mapped to the integer code - n (max_seen_code + 1) rather than being silently converted to NaN. This is achieved - by two cooperating changes: - - 1. ``_generate_features_category`` replaces each unseen non-NaN value with the - sentinel ``_UNSEEN_CAT`` and adds that sentinel to the category list, so the - Categorical dtype keeps the value rather than encoding it as NaN. - - 2. ``TabArenaCategoryMemoryMinimizeFeatureGenerator`` (used as the post-generator - instead of the plain ``CategoryMemoryMinimizeFeatureGenerator``) detects codes - >= n at transform time and maps them to the integer n. + CategoryFeatureGenerator that preserves unseen categories to be handled by + the downstream model instead of setting them to NaN. 
""" def __init__(self, **kwargs) -> None: diff --git a/tabarena/tabarena/benchmark/preprocessing/text_feature_generators.py b/tabarena/tabarena/benchmark/preprocessing/text_feature_generators.py index d9bb73a85..49e24b9c8 100644 --- a/tabarena/tabarena/benchmark/preprocessing/text_feature_generators.py +++ b/tabarena/tabarena/benchmark/preprocessing/text_feature_generators.py @@ -4,6 +4,8 @@ import unicodedata import warnings from collections import defaultdict +from pathlib import Path +from typing import TYPE_CHECKING, ClassVar import numpy as np import pandas as pd @@ -19,6 +21,10 @@ from sklearn.decomposition import PCA from tqdm import tqdm +if TYPE_CHECKING: + from sentence_transformers import SentenceTransformer + + # Non-printable ASCII control characters excluding whitespace (tab \x09, LF \x0a, CR \x0d). _CONTROL_CHAR_RE = re.compile(r"[\x00-\x08\x0e-\x1f\x7f]") @@ -39,26 +45,146 @@ def sanitize_text(text_data: pd.Series, fillna_str: str = "Missing Data") -> pd. ) +class TabArenaDefaultTextEncoder: + @staticmethod + def get_default_encoder(): + """Get the default sentence transformer model for encoding text features.""" + import torch + from sentence_transformers import SentenceTransformer + + return SentenceTransformer( + "Qwen/Qwen3-Embedding-8B", + truncate_dim=32, # minimal MRL dimension for Qwen3-Embedding + model_kwargs={"dtype": torch.float16, "attn_implementation": "sdpa"}, + processor_kwargs={"padding_side": "left"}, + ) + + # TODO: could optimize this much more + ideally compute on-the-fly + # Length-bucket thresholds (chars) and batch sizes, ordered from longest to shortest. + # Texts longer than the threshold get the corresponding batch size. + _LENGTH_BUCKETS: ClassVar[list[tuple[int, int]]] = [ + (20_000, 8), + (15_000, 16), + (5_000, 32), + (500, 64), + (0, 128), + ] + + @staticmethod + def encode_texts(*, texts: list[str], encoder_model: SentenceTransformer) -> np.ndarray: + """Encode texts with adaptive batch sizes based on text length. 
+ + Texts are sorted by character length and split into buckets. + Longer texts use smaller batch sizes to limit peak memory, while + shorter texts use larger batch sizes for throughput. + """ + # guess-timate + overhead for characters per token + max_chars = int(encoder_model.max_seq_length * 3) + long_texts = [t for t in texts if len(t) > max_chars] + if long_texts: + warnings.warn( + f"{len(long_texts)} of {len(texts)} text value(s) exceed " + f"~{max_chars:,} characters and may be truncated by the model's " + f"{encoder_model.max_seq_length:,}-token context window. " + f"Longest text: {max(len(t) for t in long_texts):,} characters.", + stacklevel=2, + ) + + # Sort by character length (good proxy for token length, avoids tokenization overhead). + sorted_indices = sorted(range(len(texts)), key=lambda k: len(texts[k])) + sorted_texts = [texts[k] for k in sorted_indices] + + print(f"Encoding {len(texts)} unique text values...") + print(f"\tShortest text: {len(sorted_texts[0])} chars, longest text: {len(sorted_texts[-1])} chars.") + print(f"\tAverage text length: {sum(len(t) for t in sorted_texts) / len(sorted_texts):.1f} chars.") + + # Split sorted texts into length buckets and encode each with its own batch size. + buckets = TabArenaDefaultTextEncoder._LENGTH_BUCKETS + all_embs_parts: list[np.ndarray] = [] + start = len(sorted_texts) # walk backwards (longest first) + for char_threshold, batch_size in buckets: + # Find the first text that is shorter than the threshold. 
+            end = start
+            start = end
+            while start > 0 and len(sorted_texts[start - 1]) >= char_threshold:
+                start -= 1
+            bucket_texts = sorted_texts[start:end]
+            if not bucket_texts:
+                continue
+            print(f"\tBucket >={char_threshold} chars: {len(bucket_texts)} texts, batch_size={batch_size}")
+            embs = encoder_model.encode(
+                bucket_texts,
+                prompt_name="query",
+                convert_to_numpy=True,
+                normalize_embeddings=True,
+                batch_size=batch_size,
+                show_progress_bar=True,
+            )
+            all_embs_parts.append(embs)
+
+        # Reverse parts so they follow the original sorted order (shortest first).
+        all_embs_parts.reverse()
+        all_embs_sorted = np.concatenate(all_embs_parts, axis=0)
+
+        # Unsort back to original ordering.
+        return all_embs_sorted[np.argsort(sorted_indices)]
+
+    @staticmethod
+    def get_text_to_encode(*, X: pd.DataFrame, seen_texts: set[str]) -> list[str]:
+        """Collect unique sanitized text values from *X* that are not present in *seen_texts*."""
+        unseen_keys = []
+
+        # Pass 1: discover unseen text values
+        for col in tqdm(X.columns, desc="Collecting text to encode..."):
+            s = sanitize_text(X[col])
+
+            for val in s.unique():
+                if val not in seen_texts:
+                    unseen_keys.append(val)
+
+        return unseen_keys
+
+    @staticmethod
+    def get_cache_data_for_dataset(*, X: pd.DataFrame, seen_texts: set[str]) -> dict:
+        """Get the cache data for the given dataset, which is a dict mapping unseen text values to their embeddings."""
+        text_to_encode = TabArenaDefaultTextEncoder.get_text_to_encode(
+            X=X,
+            seen_texts=seen_texts,
+        )
+        if not text_to_encode:
+            return {}
+
+        new_embeddings = TabArenaDefaultTextEncoder.encode_texts(
+            texts=text_to_encode,
+            encoder_model=TabArenaDefaultTextEncoder.get_default_encoder(),
+        )
+        return dict(zip(text_to_encode, new_embeddings))
+
+
 class SemanticTextFeatureGenerator(AbstractFeatureGenerator):
-    """Create semantic text embeddings using a pre-trained sentencetransformer model."""
+    """Create semantic text embeddings using a pre-trained 
sentencetransformer model.
 
-    _embedding_look_up: dict[str, np.ndarray]
-    """Cache for the embeddings of unique text values."""
+    Uses ``Qwen/Qwen3-Embedding-8B`` with Matryoshka Representation Learning
+    (MRL) to produce compact 32-dimensional embeddings per text value.
+    """
+
+    _embedding_look_up: dict[str, np.ndarray] = {}
+    """Class-level cache for the embeddings of unique text values, shared across all instances within a process."""
 
     _expected_columns: list[str]
     """Expected columns during transform, set during fit."""
 
     _feature_names: list[str]
     """Stable feature names for the generated embedding features."""
+    only_load_from_cache: bool = False
+    """Whether to only load embeddings from cache and crash for on-the-fly encoding of unseen text values."""
 
     def _fit_transform(self, X: pd.DataFrame, **kwargs) -> tuple[pd.DataFrame, dict]:
         """See parameters of the parent class
         AbstractFeatureGenerator for more details on the parameters.
         """
-        from sentence_transformers import SentenceTransformer
-        import torch
-
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        self._encoder_model = SentenceTransformer("intfloat/e5-small-v2", device=device)
-
+        if self.only_load_from_cache:
+            self._encoder_model = None
+        else:
+            self._encoder_model = TabArenaDefaultTextEncoder.get_default_encoder()
         X_out = self._transform(X, is_train=True)
         return X_out, {S_TEXT_EMBEDDING: list(X_out.columns)}
 
@@ -66,9 +192,6 @@ def _transform(self, X: pd.DataFrame, *, is_train: bool = False) -> pd.DataFrame
         """See parameters of the parent class
         AbstractFeatureGenerator for more details on the parameters.
""" - if is_train and not hasattr(self, "_embedding_look_up"): - self._embedding_look_up = {} - n_rows = len(X) n_cols = len(X.columns) @@ -76,44 +199,27 @@ def _transform(self, X: pd.DataFrame, *, is_train: bool = False) -> pd.DataFrame if n_rows == 0 or n_cols == 0: raise ValueError("Input DataFrame is empty!") - unseen_keys: list[tuple[str, str]] = [] - seen_unseen: set[tuple[str, str]] = set() - - # Pass 1: discover unseen (col, value) - for col in tqdm(X.columns, desc="Collecting text to encode..."): - s = sanitize_text(X[col]) - - for val in s.unique(): - key = (col, val) - if key not in self._embedding_look_up and key not in seen_unseen: - seen_unseen.add(key) - unseen_keys.append(key) + # Encode text + unseen_text = TabArenaDefaultTextEncoder.get_text_to_encode(X=X, seen_texts=set(self._embedding_look_up.keys())) + if unseen_text: - # Encode unseen - if unseen_keys: - texts_to_encode = [f"query: {col} = {val}" for col, val in unseen_keys] + if self.only_load_from_cache: + raise ValueError( + "Cache miss for text values during transform with only_load_from_cache=True. " + f"Unseen text values: {unseen_text[:10]} (showing up to 10)." 
+ ) - embeddings = self._encoder_model.encode( - texts_to_encode, - convert_to_numpy=True, - normalize_embeddings=False, - batch_size=128, - show_progress_bar=True, - precision="float32", + embeddings = TabArenaDefaultTextEncoder.encode_texts( + texts=list(unseen_text), + encoder_model=self._encoder_model, ) - - self._embedding_look_up.update(zip(unseen_keys, embeddings)) + self._embedding_look_up.update(zip(unseen_text, embeddings)) # Infer embedding dimension emb_dim = len(next(iter(self._embedding_look_up.values()))) - # --- Stable feature names: source column is the prefix --- if is_train: - self._feature_names = [ - f"{col}.semantic_embedding_{i}" - for col in X.columns - for i in range(emb_dim) - ] + self._feature_names = [f"{col}.semantic_embedding_{i}" for col in X.columns for i in range(emb_dim)] self._expected_columns = list(X.columns) elif list(X.columns) != self._expected_columns: raise ValueError( @@ -122,10 +228,9 @@ def _transform(self, X: pd.DataFrame, *, is_train: bool = False) -> pd.DataFrame f"Got: {list(X.columns)}" ) + # Pass 2: build matrix (optimized for repeated values) # Preallocate semantic_embedding = np.empty((n_rows, n_cols * emb_dim), dtype=np.float32) - - # Pass 2: build matrix (optimized for repeated values) for j, col in tqdm( enumerate(X.columns), desc="Building semantic embedding matrix...", @@ -135,9 +240,7 @@ def _transform(self, X: pd.DataFrame, *, is_train: bool = False) -> pd.DataFrame arr = s.to_numpy() uniques, inverse = np.unique(arr, return_inverse=True) - unique_embs = np.vstack( - [self._embedding_look_up[(col, val)] for val in uniques] - ) + unique_embs = np.vstack([self._embedding_look_up[val] for val in uniques]) col_matrix = unique_embs[inverse] start = j * emb_dim @@ -165,6 +268,26 @@ def get_default_infer_features_in_args() -> dict: def _more_tags(self): return {"feature_interactions": True} + @staticmethod + def save_embedding_cache(cache: dict[str, np.ndarray], path: str | Path) -> None: + keys = 
list(cache.keys()) + embs = np.vstack(list(cache.values())) + df = pd.DataFrame(embs, index=pd.Index(keys, name="text")) + df.to_parquet(path) + + @staticmethod + def load_embedding_cache(path: str | Path) -> dict[str, np.ndarray]: + df = pd.read_parquet(path) + return dict(zip(df.index, df.to_numpy())) + + @staticmethod + def get_text_cache_dir(task_id_str: str) -> Path: + import openml + + base_path = (openml.config._root_cache_directory / "tabarena_text_cache").expanduser().resolve() / "text_cache" + Path(base_path).mkdir(parents=True, exist_ok=True) + + return base_path / f"{task_id_str}_cache.parquet" class StatisticalTextFeatureGenerator(AbstractFeatureGenerator): """Generate a statistical embedding of text features using skrub. @@ -177,7 +300,7 @@ class StatisticalTextFeatureGenerator(AbstractFeatureGenerator): ``SemanticTextFeatureGenerator``. """ - MAX_N_OUTPUT_FEATURES = 384 # Same as intfloat/e5-small-v2 + MAX_N_OUTPUT_FEATURES = 32 def _fit_transform(self, X: pd.DataFrame, **kwargs) -> tuple[pd.DataFrame, dict]: from skrub import StringEncoder, TableVectorizer @@ -211,9 +334,7 @@ def _transform(self, X: pd.DataFrame, *, is_train: bool = False) -> pd.DataFrame X = self._vectorizer.fit_transform(X) # TableVectorizer produces "{col}_{i}"; remap to "{col}.{i}" so that # the source column prefix is separated by "." (the project convention). - self._col_rename_map_: dict[str, str] = { - c: re.sub(r"_(\d+)$", r".\1", c) for c in X.columns - } + self._col_rename_map_: dict[str, str] = {c: re.sub(r"_(\d+)$", r".\1", c) for c in X.columns} else: X = self._vectorizer.transform(X) @@ -288,9 +409,7 @@ def _parse_source_column(feature_name: str) -> str: """ return feature_name.split(".", 1)[0] - def _make_batch_plan( - self, feature_names: list[str] - ) -> list[tuple[str, int, list[str]]]: + def _make_batch_plan(self, feature_names: list[str]) -> list[tuple[str, int, list[str]]]: """Build a PCA batch plan grouped by source column. 
Parameters @@ -317,9 +436,7 @@ def _make_batch_plan( plan.append((src_col, 0, feats)) else: max_n_int = int(max_n) - sub_batches = [ - feats[i : i + max_n_int] for i in range(0, len(feats), max_n_int) - ] + sub_batches = [feats[i : i + max_n_int] for i in range(0, len(feats), max_n_int)] for sub_idx, sub_feats in enumerate(sub_batches): plan.append((src_col, sub_idx, sub_feats)) @@ -352,14 +469,10 @@ def _transform(self, X: pd.DataFrame) -> pd.DataFrame: X_out = self._transform_inference(X) missing_output = [c for c in self.feature_names_out_ if c not in X_out.columns] if missing_output: - raise ValueError( - f"Transformed output is missing expected columns: {missing_output[:10]}" - ) + raise ValueError(f"Transformed output is missing expected columns: {missing_output[:10]}") return X_out[self.feature_names_out_] - def _fit_preprocess_and_transform( - self, X: pd.DataFrame, y: pd.Series - ) -> pd.DataFrame: + def _fit_preprocess_and_transform(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame: X = X.copy() self.pre_pca_feature_names_ = list(X) @@ -379,9 +492,7 @@ def _fit_preprocess_and_transform( transformed_batches: list[pd.DataFrame] = [] - for src_col, sub_batch_idx, batch_cols in tqdm( - batch_plan, desc="Fitting PCA batches..." 
- ): + for src_col, sub_batch_idx, batch_cols in tqdm(batch_plan, desc="Fitting PCA batches..."): X_batch = X[batch_cols] n_samples, n_features = X_batch.shape @@ -405,9 +516,7 @@ def _fit_preprocess_and_transform( ) X_pca = X_pca[:, :keep_count] - output_cols = [ - f"{src_col}.dr{sub_batch_idx}_{i}" for i in range(keep_count) - ] + output_cols = [f"{src_col}.dr{sub_batch_idx}_{i}" for i in range(keep_count)] X_pca_df = pd.DataFrame(X_pca, index=X.index, columns=output_cols) @@ -422,8 +531,7 @@ def _fit_preprocess_and_transform( X = pd.concat(transformed_batches, axis=1) self._log( 20, - f"Total PCA features generated: {X.shape[1]}" - f" from {len(self.pre_pca_feature_names_)} original features.", + f"Total PCA features generated: {X.shape[1]} from {len(self.pre_pca_feature_names_)} original features.", ) return X @@ -482,14 +590,10 @@ def _standard_scale_transform( @staticmethod def _encode_target_for_correlation(y: pd.Series) -> np.ndarray: if pd.api.types.is_numeric_dtype(y): - y_num = pd.to_numeric(y, errors="coerce").to_numpy( - dtype=np.float64, copy=False - ) + y_num = pd.to_numeric(y, errors="coerce").to_numpy(dtype=np.float64, copy=False) else: # Deterministic encoding for non-numeric labels. - y_num = pd.Series(pd.factorize(y)[0], index=y.index).to_numpy( - dtype=np.float64, copy=False - ) + y_num = pd.Series(pd.factorize(y)[0], index=y.index).to_numpy(dtype=np.float64, copy=False) if np.isnan(y_num).any(): # Fill NaNs with mean to keep correlation computation vectorized/stable. 
diff --git a/tabarena/tabarena/benchmark/task/openml/task_wrapper.py b/tabarena/tabarena/benchmark/task/openml/task_wrapper.py index e79fc2003..5aa30e953 100644 --- a/tabarena/tabarena/benchmark/task/openml/task_wrapper.py +++ b/tabarena/tabarena/benchmark/task/openml/task_wrapper.py @@ -19,10 +19,15 @@ class OpenMLTaskWrapper: - def __init__(self, task: OpenMLSupervisedTask, *, use_task_eval_metric: bool = False): + def __init__(self, task: OpenMLSupervisedTask, *, use_task_eval_metric: bool = False, lazy_load_data: bool = False): assert isinstance(task, OpenMLSupervisedTask) self.task: OpenMLSupervisedTask = task + self.lazy_load_data = lazy_load_data self.X, self.y = get_task_data(task=self.task) + self._n_rows, self._n_cols = self.X.shape + if self.lazy_load_data: + del self.X, self.y + self.problem_type = get_ag_problem_type(self.task) self.label = self.task.target_name @@ -70,7 +75,11 @@ def get_split_dimensions(self) -> tuple[int, int, int]: return n_repeats, n_folds, n_samples def combine_X_y(self) -> pd.DataFrame: - return pd.concat([self.X, self.y.to_frame(name=self.label)], axis=1) + if self.lazy_load_data: + X, y = get_task_data(task=self.task) + else: + X, y = self.X, self.y + return pd.concat([X, y.to_frame(name=self.label)], axis=1) def save_data(self, path: str, file_type='.csv', train_indices=None, test_indices=None): data = self.combine_X_y() @@ -86,8 +95,8 @@ def save_metadata(self, path: str): metadata = dict( label=self.label, problem_type=self.problem_type, - num_rows=len(self.X), - num_cols=len(self.X.columns), + num_rows=self._n_rows, + num_cols=self._n_cols, task_id=self.task.task_id, dataset_id=self.task.dataset_id, openml_url=self.task.openml_url, @@ -133,10 +142,20 @@ def get_train_test_split( ) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]: if train_indices is None or test_indices is None: train_indices, test_indices = self.get_split_indices(fold=fold, repeat=repeat, sample=sample) - X_train = self.X.loc[train_indices] - 
y_train = self.y[train_indices] - X_test = self.X.loc[test_indices] - y_test = self.y[test_indices] + + if self.lazy_load_data: + X, y = get_task_data(task=self.task) + X_train = X.loc[train_indices].copy() + y_train = y[train_indices].copy() + X_test = X.loc[test_indices].copy() + y_test = y[test_indices].copy() + del X, y + else: + X, y = self.X, self.y + X_train = X.loc[train_indices] + y_train = y[train_indices] + X_test = X.loc[test_indices] + y_test = y[test_indices] if train_size is not None: X_train, y_train = self.subsample(X=X_train, y=y_train, size=train_size, random_state=random_state) diff --git a/tabarena/tabarena/benchmark/task/user_task.py b/tabarena/tabarena/benchmark/task/user_task.py index d1f5ef47d..d848b3b2a 100644 --- a/tabarena/tabarena/benchmark/task/user_task.py +++ b/tabarena/tabarena/benchmark/task/user_task.py @@ -5,7 +5,8 @@ from collections import OrderedDict from collections.abc import Iterable from copy import deepcopy -from dataclasses import asdict, dataclass, fields, replace +from dataclasses import MISSING, asdict, dataclass, fields, replace +from enum import StrEnum from pathlib import Path from typing import Annotated, Literal @@ -26,17 +27,18 @@ OpenMLSupervisedTask, TaskType, ) -from enum import StrEnum SplitIndex = Annotated[str, "format: r{int}f{int}"] SplitTimeHorizonTypes = str | int | float SplitTimeHorizonUnitTypes = Literal["steps", "days", "weeks", "months", "years"] | str + class GroupLabelTypes(StrEnum): PER_SAMPLE = "per_sample" PER_GROUP = "per_group" + @dataclass class TabArenaTaskMetadata: """Metadata about the task to run. @@ -108,6 +110,16 @@ class TabArenaTaskMetadata: identifier of a local task (see `UserTask.task_id_str`). 
""" + # -- Feature dtype flags (added later; default to None for backward compat) -- + has_datetime: bool | None = None + """Whether the dataset contains datetime feature columns.""" + has_text: bool | None = None + """Whether the dataset contains text (string/object) feature columns.""" + has_categorical: bool | None = None + """Whether the dataset contains categorical feature columns.""" + has_numeric: bool | None = None + """Whether the dataset contains numeric feature columns.""" + @property def n_splits(self): """Get the number of splits in the task.""" @@ -130,6 +142,30 @@ def split_index(self) -> SplitIndex: ) return self.split_indices[0] + def has_supported_dtypes(self, *, required_dtypes: list[str] | None, forbidden_dtypes: list[str] | None) -> bool: + """Check if the dataset contains only allowed dtypes based on the feature dtype flags.""" + if required_dtypes is not None: + if "datetime" in required_dtypes and not self.has_datetime: + return False + if "text" in required_dtypes and not self.has_text: + return False + if "categorical" in required_dtypes and not self.has_categorical: + return False + if "numeric" in required_dtypes and not self.has_numeric: + return False + + if forbidden_dtypes is not None: + if self.has_datetime and "datetime" in forbidden_dtypes: + return False + if self.has_text and "text" in forbidden_dtypes: + return False + if self.has_categorical and "categorical" in forbidden_dtypes: + return False + if self.has_numeric and "numeric" in forbidden_dtypes: + return False + + return True + def to_dict(self, *, exclude_splits_metadata: bool = False) -> dict: """Convert the task metadata to a dictionary for better visualization.""" res = asdict(self) @@ -155,29 +191,29 @@ def from_row(row: pd.Series) -> TabArenaTaskMetadata: """Reconstruct TabArenaTaskMetadata from a single dataframe row.""" row_dict = row.to_dict() - # Identify TabArenaTaskMetadata fields (excluding splits_metadata) - task_field_names = { - f.name for f in 
fields(TabArenaTaskMetadata) if f.name != "splits_metadata" + # Identify TabArenaTaskMetadata fields (excluding splits_metadata). + # Fields with defaults are optional for backward compatibility with + # older serialized metadata that may not contain newer columns. + all_task_fields = {f.name for f in fields(TabArenaTaskMetadata) if f.name != "splits_metadata"} + required_task_fields = { + f.name + for f in fields(TabArenaTaskMetadata) + if f.name != "splits_metadata" and f.default is MISSING and f.default_factory is MISSING } - if not all(name in row_dict for name in task_field_names): + if not all(name in row_dict for name in required_task_fields): raise ValueError( "Metadata row is missing required TabArenaTaskMetadata fields: " - f"{task_field_names - row_dict.keys()}" + f"{required_task_fields - row_dict.keys()}" ) - task_kwargs = { - key: row_dict[key] for key in task_field_names if key in row_dict - } + task_kwargs = {key: row_dict[key] for key in all_task_fields if key in row_dict} # Identify SplitMetadata fields split_field_names = {f.name for f in fields(SplitMetadata)} if not all(name in row_dict for name in split_field_names): raise ValueError( - "Metadata row is missing required SplitMetadata fields: " - f"{split_field_names - row_dict.keys()}" + f"Metadata row is missing required SplitMetadata fields: {split_field_names - row_dict.keys()}" ) - split_kwargs = { - key: row_dict[key] for key in split_field_names if key in row_dict - } + split_kwargs = {key: row_dict[key] for key in split_field_names if key in row_dict} # Reconstruct SplitMetadata split_metadata = SplitMetadata(**split_kwargs) @@ -310,7 +346,9 @@ def _get_dataset_stats( # Resolve instance groups if self.group_on is not None: num_instance_groups = self.get_num_instance_groups( - X=oml_dataset, group_on=self.group_on, group_labels=self.group_labels, + X=oml_dataset, + group_on=self.group_on, + group_labels=self.group_labels, ) return ( @@ -351,9 +389,7 @@ def compute_metadata( 
splits_metadata = {} for repeat_i, splits in self.split.split.items(): for fold_i, samples_for_split in splits.items(): - assert len(samples_for_split) == 1, ( - "Only one sample per split is supported so far!." - ) + assert len(samples_for_split) == 1, "Only one sample per split is supported so far!." train_idx, test_idx = samples_for_split[0] ( @@ -390,12 +426,8 @@ def compute_metadata( if task_problem_type is None: task_problem_type = split_problem_type else: - assert task_problem_type == split_problem_type, ( - "All splits must have the same problem type." - ) - s_index = SplitMetadata.get_split_index( - repeat_i=repeat_i, fold_i=fold_i - ) + assert task_problem_type == split_problem_type, "All splits must have the same problem type." + s_index = SplitMetadata.get_split_index(repeat_i=repeat_i, fold_i=fold_i) splits_metadata[s_index] = SplitMetadata( repeat=repeat_i, fold=fold_i, @@ -418,6 +450,20 @@ def compute_metadata( max_n_classes = max(num_classes_list) class_consistency_over_splits = min_n_classes == max_n_classes + # Detect feature dtype flags (exclude target column) + feature_df = oml_dataset.drop(columns=[target_name]) + + # FIXME: make this less strict? + if len(feature_df.select_dtypes(include=["object"]).columns) > 0: + raise ValueError( + "Object dtype columns are not supported. Please convert them to string dtype or categorical dtype!" 
+ ) + + has_datetime = len(feature_df.select_dtypes(include=["datetime64"]).columns) > 0 + has_text = len(feature_df.select_dtypes(include=["string"]).columns) > 0 + has_categorical = len(feature_df.select_dtypes(include=["category"]).columns) > 0 + has_numeric = len(feature_df.select_dtypes(include=["number"]).columns) > 0 + self._task_metadata = TabArenaTaskMetadata( dataset_name=dataset_name, eval_metric=eval_metric, @@ -441,14 +487,16 @@ def compute_metadata( num_instance_groups=full_num_instance_groups, split_time_horizon=self.split_time_horizon, split_time_horizon_unit=self.split_time_horizon_unit, + has_datetime=has_datetime, + has_text=has_text, + has_categorical=has_categorical, + has_numeric=has_numeric, ) return self._task_metadata -class TabArenaOpenMLClassificationTask( - TabArenaTaskMetadataMixin, OpenMLClassificationTask -): +class TabArenaOpenMLClassificationTask(TabArenaTaskMetadataMixin, OpenMLClassificationTask): """A local OpenMLClassificationTask with additional metadata for TabArena.""" @@ -457,9 +505,7 @@ class TabArenaOpenMLRegressionTask(TabArenaTaskMetadataMixin, OpenMLRegressionTa # For typing -TabArenaOpenMLSupervisedTask = ( - TabArenaOpenMLClassificationTask | TabArenaOpenMLRegressionTask -) +TabArenaOpenMLSupervisedTask = TabArenaOpenMLClassificationTask | TabArenaOpenMLRegressionTask # Patch Functions for OpenML Dataset @@ -487,9 +533,7 @@ def __init__(self, *, task_name: str, task_cache_path: Path | None = None) -> No If None, the default OpenML cache directory is used. 
""" self.task_name = task_name - self._task_name_hash = hashlib.sha256( - self.task_name.encode("utf-8") - ).hexdigest() + self._task_name_hash = hashlib.sha256(self.task_name.encode("utf-8")).hexdigest() self._task_cache_path = task_cache_path @property @@ -497,11 +541,7 @@ def task_cache_path(self) -> Path: """Path to use for caching the local OpenML tasks.""" if self._task_cache_path is not None: return self._task_cache_path - return ( - (openml.config._root_cache_directory / "tabarena_tasks") - .expanduser() - .resolve() - ) + return (openml.config._root_cache_directory / "tabarena_tasks").expanduser().resolve() @staticmethod def from_task_id_str(task_id_str: str) -> UserTask: @@ -536,12 +576,7 @@ def _local_dataset_id(self) -> str: @property def _local_cache_path(self) -> Path: - return ( - Path(openml.config._root_cache_directory) - / "local" - / "datasets" - / self._local_dataset_id - ) + return Path(openml.config._root_cache_directory) / "local" / "datasets" / self._local_dataset_id def get_dataset_name(self, dataset_name: str | None = None) -> str: """Get the dataset name to use for the local OpenML dataset.""" @@ -636,9 +671,7 @@ def create_local_openml_task( self._validate_splits(splits=splits, n_samples=len(dataset)) task_type = ( - TaskType.SUPERVISED_CLASSIFICATION - if problem_type == "classification" - else TaskType.SUPERVISED_REGRESSION + TaskType.SUPERVISED_CLASSIFICATION if problem_type == "classification" else TaskType.SUPERVISED_REGRESSION ) extra_kwargs = {} if task_type == TaskType.SUPERVISED_CLASSIFICATION: @@ -650,23 +683,29 @@ def create_local_openml_task( raise NotImplementedError(f"Task type {task_type:d} not supported.") dataset_name = self.get_dataset_name(dataset_name=dataset_name) - print( - f"Creating local OpenML task {self.task_id} with dataset '{dataset_name}'..." 
- ) + print(f"Creating local OpenML task {self.task_id} with dataset '{dataset_name}'...") local_dataset = openml_create_datasets_without_arff_dump( name=dataset_name, data=dataset, default_target_attribute=target_feature, ) # Cache data to disk - parquet_file = self._local_cache_path / "data.pq" - parquet_file.parent.mkdir(parents=True, exist_ok=True) - dataset.to_parquet(parquet_file) + # This ensures to keep the dtypes of the original dataframe (and not lose it via parquet or similar) + # Moreover, this skips that OpenML itself has do pickle dump the dataset again. + pickle_file = self._local_cache_path / "data.pkl.py3" + pickle_file.parent.mkdir(parents=True, exist_ok=True) + with pickle_file.open("wb") as fh: + pickle.dump( + (dataset, [dataset[c].dtype.name == "category" for c in dataset.columns], list(dataset.columns)), + fh, + pickle.HIGHEST_PROTOCOL, + ) del dataset # Free memory # We only need local_dataset.get_data() from the OpenMLDataset, thus, we make # sure with the code below that get_data() returns the data. 
- local_dataset.parquet_file = parquet_file + local_dataset.data_pickle_file = pickle_file + local_dataset.cache_format = "pickle" local_dataset.data_file = "ignored" # not used for local datasets # Create the task @@ -710,9 +749,7 @@ def create_local_openml_task( return task @staticmethod - def _validate_splits( - *, splits: dict[int, dict[int, tuple[list, list]]], n_samples: int - ) -> None: + def _validate_splits(*, splits: dict[int, dict[int, tuple[list, list]]], n_samples: int) -> None: """Validate the splits passed by the user.""" if not isinstance(splits, dict): raise ValueError("Splits must be a dictionary.") @@ -723,28 +760,16 @@ def _validate_splits( raise ValueError(f"Splits for repeat {repeat_id} must be a dictionary.") test_indices_per_repeat = set() for split_id, (train_indices, test_indices) in split_dict.items(): - if not isinstance(train_indices, list) or not isinstance( - test_indices, list - ): + if not isinstance(train_indices, list) or not isinstance(test_indices, list): raise ValueError(f"Indices for split {split_id} must be lists.") - if not all( - isinstance(idx, int) for idx in train_indices + test_indices - ): - raise ValueError( - f"All indices in split {split_id} must be integers." - ) + if not all(isinstance(idx, int) for idx in train_indices + test_indices): + raise ValueError(f"All indices in split {split_id} must be integers.") if len(train_indices) == 0 or len(test_indices) == 0: - raise ValueError( - f"Train and test indices in split {split_id} must not be empty." - ) + raise ValueError(f"Train and test indices in split {split_id} must not be empty.") if set(train_indices) & set(test_indices): - raise ValueError( - f"Train and test indices in split {split_id} must not overlap." - ) + raise ValueError(f"Train and test indices in split {split_id} must not overlap.") if any(np.array(train_indices + test_indices) < 0): - raise ValueError( - f"Indices in split {split_id} must be non-negative." 
- ) + raise ValueError(f"Indices in split {split_id} must be non-negative.") if any(np.array(train_indices + test_indices) >= n_samples): raise ValueError( f"Indices in split {split_id} must not exceed the dataset size (0 to {n_samples - 1})." @@ -777,9 +802,7 @@ def save_local_openml_task(self, task: OpenMLSupervisedTask) -> None: def load_local_openml_task(self) -> TabArenaOpenMLSupervisedTask: """Load a local OpenML task from disk.""" if not self.openml_task_path.exists(): - raise FileNotFoundError( - f"Cached task file {self.openml_task_path} does not exist!" - ) + raise FileNotFoundError(f"Cached task file {self.openml_task_path} does not exist!") with self.openml_task_path.open("rb") as f: task: OpenMLSupervisedTask = pickle.load(f) @@ -830,14 +853,22 @@ def openml_create_datasets_without_arff_dump( unsupported_cols = data.select_dtypes(include=["datetime64", "timedelta64"]).columns # select_dtypes doesn't support "period" or "interval" as strings, so detect manually unsupported_cols = unsupported_cols.append( - pd.Index( - col for col in data.columns - if isinstance(data[col].dtype, (pd.PeriodDtype, pd.IntervalDtype)) - ) + pd.Index(col for col in data.columns if isinstance(data[col].dtype, (pd.PeriodDtype, pd.IntervalDtype))) ) - if len(unsupported_cols) > 0: + # Cast categories of categorical columns to string so that + # attributes_arff_from_df can handle them (e.g. integer categories). 
+ cat_cols_to_fix = [ + col + for col in data.select_dtypes(include=["category"]).columns + if not pd.api.types.is_string_dtype(data[col].cat.categories) + ] + + if len(unsupported_cols) > 0 or len(cat_cols_to_fix) > 0: data = data.copy() + if len(unsupported_cols) > 0: data[unsupported_cols] = data[unsupported_cols].astype(str) + for col in cat_cols_to_fix: + data[col] = data[col].cat.rename_categories(str) # infer the type of data for each column of the DataFrame attributes_ = attributes_arff_from_df(data) @@ -846,9 +877,7 @@ def openml_create_datasets_without_arff_dump( _validated_data_attributes(ignore_attributes, attributes_, "ignore_attribute") default_target_attributes = _expand_parameter(default_target_attribute) - _validated_data_attributes( - default_target_attributes, attributes_, "default_target_attribute" - ) + _validated_data_attributes(default_target_attributes, attributes_, "default_target_attribute") return OpenMLDataset( name=name, diff --git a/tabarena/tabarena/nips2025_utils/artifacts/_tabarena_method_metadata.py b/tabarena/tabarena/nips2025_utils/artifacts/_tabarena_method_metadata.py index b5feac3c3..851ae205a 100644 --- a/tabarena/tabarena/nips2025_utils/artifacts/_tabarena_method_metadata.py +++ b/tabarena/tabarena/nips2025_utils/artifacts/_tabarena_method_metadata.py @@ -48,6 +48,13 @@ tabstar_metadata, ) +from tabarena.nips2025_utils.artifacts._tabarena_method_metadata_2026_01_23_tabprep import ( + tabprep_gbm_metadata, + tabprep_lr_metadata, + tabprep_realtabpfnv250_metadata, + tabprep_tabm_metadata, +) + methods_2025_09_03: list[MethodMetadata] = [ ag_140_metadata, @@ -90,6 +97,13 @@ # prep_gbm_v6_metadata, ] +methods_tabprep = [ + tabprep_gbm_metadata, + tabprep_lr_metadata, + tabprep_realtabpfnv250_metadata, # only first 3 splits + tabprep_tabm_metadata, # only first 3 splits +] + replaced_methods = [ "ExplainableBM", "RealMLP_GPU", diff --git a/tabarena/tabarena/nips2025_utils/artifacts/_tabarena_method_metadata_2026_01_23_tabprep.py 
b/tabarena/tabarena/nips2025_utils/artifacts/_tabarena_method_metadata_2026_01_23_tabprep.py new file mode 100644 index 000000000..00d192af2 --- /dev/null +++ b/tabarena/tabarena/nips2025_utils/artifacts/_tabarena_method_metadata_2026_01_23_tabprep.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +from tabarena.nips2025_utils.artifacts.method_metadata import MethodMetadata + + +tabprep_gbm_metadata = MethodMetadata( + method="PrepLightGBM", + artifact_name="tabarena-2026-01-23", + display_name="PrepLightGBM", + method_type="config", + compute="cpu", + date="2026-01-23", + ag_key="PREP_GBM", + model_key="PREP_GBM", + # config_default="PrepLightGBM_c1_BAG_L1", # FIXME + config_default="prep_LightGBM_icml_v3_c1_BAG_L1", + name_suffix=None, + has_raw=True, + has_processed=True, + has_results=True, + upload_as_public=True, + can_hpo=True, + is_bag=True, + s3_bucket="tabarena", + s3_prefix="cache", + verified=True, +) + +tabprep_lr_metadata = MethodMetadata( + method="PrepLinearModel", + artifact_name="tabarena-2026-01-23", + display_name="PrepLinear", + method_type="config", + compute="cpu", + date="2026-01-23", + ag_key="PREP_LR", + model_key="PREP_LR", + # config_default="PrepLinearModel_c1_BAG_L1", # FIXME + config_default="prep_LinearModel_icml_v3_c1_BAG_L1", + name_suffix=None, + has_raw=True, + has_processed=True, + has_results=True, + upload_as_public=True, + can_hpo=True, + is_bag=True, + s3_bucket="tabarena", + s3_prefix="cache", + verified=True, +) + + +tabprep_tabm_metadata = MethodMetadata( + method="PrepTabM", + artifact_name="tabarena-2026-01-23", + display_name="PrepTabM", + method_type="config", + compute="gpu", + date="2026-01-23", + ag_key="PREP_TABM", + model_key="PREP_TABM", + config_default="prep_TabM_c1_BAG_L1", # FIXME + name_suffix=None, + has_raw=True, + has_processed=True, + has_results=True, + upload_as_public=True, + can_hpo=True, + is_bag=True, + s3_bucket="tabarena", + s3_prefix="cache", + verified=True, +) + + 
+tabprep_realtabpfnv250_metadata = MethodMetadata( + method="PrepRealTabPFN-v2.5", + artifact_name="tabarena-2026-01-23", + display_name="PrepRealTabPFN-2.5", + method_type="config", + compute="gpu", + date="2026-01-23", + ag_key="PREP_REALTABPFN-V2.5", + model_key="PREP_REALTABPFN-V2.5", + config_default="prep_RealTabPFN-v2.5_c1_BAG_L1", # FIXME + name_suffix=None, + has_raw=True, + has_processed=True, + has_results=True, + upload_as_public=True, + can_hpo=True, + is_bag=False, + s3_bucket="tabarena", + s3_prefix="cache", + verified=True, +) diff --git a/tabarena/tabarena/nips2025_utils/compare.py b/tabarena/tabarena/nips2025_utils/compare.py index f53ab2b30..3576e2d52 100644 --- a/tabarena/tabarena/nips2025_utils/compare.py +++ b/tabarena/tabarena/nips2025_utils/compare.py @@ -16,6 +16,7 @@ def compare_on_tabarena( *, only_valid_tasks: bool | str | list[str] = False, subset: str | list[str] | None = None, + datasets: list[str] | None = None, folds: list[int] | None = None, tabarena_context: TabArenaContext | None = None, tabarena_context_kwargs: dict | None = None, @@ -62,12 +63,12 @@ def compare_on_tabarena( df_filter=new_results, ) - if subset is not None or folds is not None: + if subset is not None or folds is not None or datasets is not None: if subset is None: subset = [] if isinstance(subset, str): subset = [subset] - df_results = subset_tasks(df_results=df_results, subset=subset, folds=folds) + df_results = subset_tasks(df_results=df_results, subset=subset, folds=folds, datasets=datasets) return compare( df_results=df_results, @@ -203,7 +204,12 @@ def prepare_data( return df_results -def subset_tasks(df_results: pd.DataFrame, subset: list[str], folds: list[int] = None) -> pd.DataFrame: +def subset_tasks( + df_results: pd.DataFrame, + subset: list[str], + folds: list[int] = None, + datasets: list[str] = None, +) -> pd.DataFrame: from tabarena.nips2025_utils.fetch_metadata import load_task_metadata df_results = df_results.copy(deep=True) @@ -270,6 +276,8 @@ 
def subset_tasks(df_results: pd.DataFrame, subset: list[str], folds: list[int] = else: raise ValueError(f"Invalid subset {subset} name!") + if datasets is not None: + df_results = df_results[df_results["dataset"].isin(datasets)] if folds is not None: df_results = df_results[df_results["fold"].isin(folds)] df_results = df_results.reset_index(drop=True) diff --git a/tabarena/tabarena/nips2025_utils/end_to_end.py b/tabarena/tabarena/nips2025_utils/end_to_end.py index 49efaa9ce..d4c2f56d3 100644 --- a/tabarena/tabarena/nips2025_utils/end_to_end.py +++ b/tabarena/tabarena/nips2025_utils/end_to_end.py @@ -14,7 +14,6 @@ EndToEndResultsSingle, EndToEndSingle, ) -from tabarena.nips2025_utils.fetch_metadata import load_task_metadata from tabarena.nips2025_utils.method_processor import ( generate_task_metadata, load_all_artifacts, @@ -136,7 +135,7 @@ def from_path_raw( unique_types = list(unique_types_dict.keys()) if task_metadata is None: - task_metadata = generate_task_metadata(tids=list(unique_tids)) + task_metadata = EndToEndSingle.fetch_task_metadata(tids=list(unique_tids), verbose=verbose) log( f"Constructing EndToEnd from raw results... Found {len(unique_types)} unique methods: {unique_types}" @@ -185,6 +184,7 @@ def from_path_raw_to_results( model_key: str | None = None, artifact_name: str | None = None, num_cpus: int | None = None, + verbose: bool = True, ) -> EndToEndResults: """ Create and cache end-to-end results for all methods in the given directory. 
@@ -245,10 +245,8 @@ def from_path_raw_to_results( all_file_paths_method[did_sid].append(file_path) if task_metadata is None: - print("Get task metadata...") - task_metadata = load_task_metadata() - # Below is too slow to use by default, TODO: get logic for any task that is fast - # task_metadata = generate_task_metadata(tids=list({r.split("/")[0] for r in all_file_paths_method})) + tids = list({r.split("/")[0] for r in all_file_paths_method}) + task_metadata = EndToEndSingle.fetch_task_metadata(tids=tids, verbose=verbose) results: list[EndToEndResults] = ray_map_list( list_to_map=list(all_file_paths_method.values()), diff --git a/tabarena/tabarena/nips2025_utils/end_to_end_single.py b/tabarena/tabarena/nips2025_utils/end_to_end_single.py index e6f11b5af..114b30c3a 100644 --- a/tabarena/tabarena/nips2025_utils/end_to_end_single.py +++ b/tabarena/tabarena/nips2025_utils/end_to_end_single.py @@ -251,9 +251,8 @@ def from_raw( method_metadata.cache_raw(results_lst=results_lst) if task_metadata is None: - log(f"\tFetching task_metadata from OpenML...") tids = list({r.task_metadata["tid"] for r in results_lst}) - task_metadata = generate_task_metadata(tids=tids) + task_metadata = cls.fetch_task_metadata(tids=tids, verbose=verbose) log(f"\tConverting raw results into an EvaluationRepository...") # processed @@ -315,6 +314,7 @@ def from_path_raw( model_key: str | None = None, method: str | None = None, artifact_name: str | None = None, + name_prefix_raw: str | None = None, backend: Literal["ray", "native"] = "ray", verbose: bool = True, ) -> Self: @@ -343,7 +343,7 @@ def from_path_raw( """ engine = "ray" if backend == "ray" else "sequential" - results_lst: list[BaselineResult] = load_raw(path_raw=path_raw, engine=engine) + results_lst: list[BaselineResult] = load_raw(path_raw=path_raw, engine=engine, name_pattern=name_prefix_raw) return cls.from_raw( results_lst=results_lst, method_metadata=method_metadata, @@ -409,6 +409,19 @@ def to_results(self) -> 
EndToEndResultsSingle: hpo_results=self.hpo_results, ) + @staticmethod + def fetch_task_metadata(tids: list[int], verbose: bool = True): + log = print if verbose else (lambda *a, **k: None) + task_metadata = load_task_metadata() + tids_cached = set(task_metadata["tid"].unique()) + + tids_missing = [tid for tid in tids if tid not in tids_cached] + if tids_missing: + log(f"Note: Missing {len(tids_missing)} tasks in the cached task_metadata...") + log(f"\tFetching task_metadata from OpenML... (this may take ~1 minute)") + task_metadata = generate_task_metadata(tids=tids) + return task_metadata + @staticmethod def from_path_raw_to_results( path_raw: str | Path | list[str | Path], @@ -423,6 +436,7 @@ def from_path_raw_to_results( artifact_name: str | None = None, num_cpus: int | None = None, name_prefix_raw: str | None = None, + verbose: bool = True, ) -> EndToEndResultsSingle: """ Create and cache end-to-end results for the method in the given directory. @@ -486,10 +500,8 @@ def from_path_raw_to_results( all_file_paths_method[did_sid].append(file_path) if task_metadata is None: - print("Get task metadata...") - task_metadata = load_task_metadata() - # Below is too slow to use by default, TODO: get logic for any task that is fast - # task_metadata = generate_task_metadata(tids=list({r.split("/")[0] for r in all_file_paths_method})) + tids = list({int(r.split("/")[0]) for r in all_file_paths_method}) + task_metadata = EndToEndSingle.fetch_task_metadata(tids=tids, verbose=verbose) import ray if not ray.is_initialized(): diff --git a/tabarena/tabarena/nips2025_utils/method_processor.py b/tabarena/tabarena/nips2025_utils/method_processor.py index b60d05ec5..477a68e69 100644 --- a/tabarena/tabarena/nips2025_utils/method_processor.py +++ b/tabarena/tabarena/nips2025_utils/method_processor.py @@ -102,6 +102,7 @@ def get_info_from_result(result: BaselineResult) -> dict: def load_raw( path_raw: str | Path | list[str | Path] = None, + name_pattern: str | None = None, engine: str 
= "ray", as_holdout: bool = False, ) -> list[BaselineResult]: @@ -120,7 +121,7 @@ def load_raw( """ suffix = "results.pkl" - file_paths_method = fetch_all_pickles(dir_path=path_raw, suffix=suffix) + file_paths_method = fetch_all_pickles(dir_path=path_raw, suffix=suffix, name_pattern=name_pattern) if len(file_paths_method) == 0: # Look at every file to provide debugging info all_files = [p for p in Path(path_raw).rglob("*") if p.is_file()] diff --git a/tabflow_slurm/benchmarking_setup/data_foundry_integration/data_foundry_registry.py b/tabflow_slurm/benchmarking_setup/data_foundry_integration/data_foundry_registry.py deleted file mode 100644 index 59afa735d..000000000 --- a/tabflow_slurm/benchmarking_setup/data_foundry_integration/data_foundry_registry.py +++ /dev/null @@ -1,6 +0,0 @@ -# TODO -"""Categorizes data foundry artifacts that we aim to use.""" -from __future__ import annotations - - - diff --git a/tabflow_slurm/benchmarking_setup/data_foundry_integration/data_foundry_task_creator.py b/tabflow_slurm/benchmarking_setup/data_foundry_integration/data_foundry_task_creator.py index 7eb681c8e..2fc6bb4ae 100644 --- a/tabflow_slurm/benchmarking_setup/data_foundry_integration/data_foundry_task_creator.py +++ b/tabflow_slurm/benchmarking_setup/data_foundry_integration/data_foundry_task_creator.py @@ -5,7 +5,6 @@ import pandas as pd from data_foundry.curation_container import CuratedContainer -from data_foundry.schema import ProblemTypeClassification from loguru import logger from tabarena.benchmark.task import UserTask from tqdm import tqdm @@ -127,10 +126,41 @@ def convert_data_foundry_task_to_user_task( task_container = CuratedContainer.load(path_to_local_task) # Resolve task type + y: pd.Series = task_container.dataset[task_container.task_metadata.target_column_name] if task_container.task_metadata.problem_type == "regression": problem_type = "regression" - elif task_container.task_metadata.problem_type in ProblemTypeClassification: + # Assert y is pd.numeric + if 
not pd.api.types.is_numeric_dtype(y): + raise ValueError( + f"Target column {task_container.task_metadata.target_column_name} is not numeric for " + f"regression problem. ({task_container.dataset_metadata.unique_name})" + ) + elif task_container.task_metadata.problem_type == "binary_classification": + problem_type = "classification" + # Assert y is pd.categorical with 2 classes + if not isinstance(y.dtype, pd.CategoricalDtype): + raise ValueError( + f"Target column {task_container.task_metadata.target_column_name} is not categorical " + f"for classification problem. ({task_container.dataset_metadata.unique_name})" + ) + if y.nunique() != 2: + raise ValueError( + f"Target column {task_container.task_metadata.target_column_name} has {y.nunique()} classes, " + f"but expected 2 for binary classification problem. ({task_container.dataset_metadata.unique_name})" + ) + elif task_container.task_metadata.problem_type == "multiclass_classification": problem_type = "classification" + if not isinstance(y.dtype, pd.CategoricalDtype): + raise ValueError( + f"Target column {task_container.task_metadata.target_column_name} is not categorical for " + f"classification problem. ({task_container.dataset_metadata.unique_name})" + ) + if y.nunique() < 3: + raise ValueError( + f"Target column {task_container.task_metadata.target_column_name} has {y.nunique()} classes, " + f"but expected at least 3 for multiclass classification " + f"problem. ({task_container.dataset_metadata.unique_name})" + ) else: raise ValueError(f"Unknown problem type {task_container.task_metadata.problem_type}") @@ -141,7 +171,7 @@ def convert_data_foundry_task_to_user_task( fallback_metric = allowed_eval_metrics[0] if eval_metric not in allowed_eval_metrics: logger.info( - f"Objective metric {eval_metric} not in allowed for problem type {problem_type}. " + f"\nObjective metric {eval_metric} not allowed for problem type {problem_type}. " f"Falling back to {fallback_metric}." 
) eval_metric = fallback_metric @@ -237,7 +267,7 @@ def get_metadata_for_benchmark_suite(benchmark_suite_name: str, data_foundry_cac path_to_metadata = data_foundry_cache / f"{benchmark_suite_name}_tasks_metadata.csv" if not path_to_metadata.exists(): raise FileNotFoundError( - f"Metadata file {path_to_metadata} does not exist. " "Please run download_data_foundry_datasets first." + f"Metadata file {path_to_metadata} does not exist. Please run download_data_foundry_datasets first." ) return path_to_metadata diff --git a/tabflow_slurm/benchmarking_setup/text_caching/__init__.py b/tabflow_slurm/benchmarking_setup/text_caching/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tabflow_slurm/benchmarking_setup/text_caching/run_text_cache.py b/tabflow_slurm/benchmarking_setup/text_caching/run_text_cache.py new file mode 100644 index 000000000..d2088c84f --- /dev/null +++ b/tabflow_slurm/benchmarking_setup/text_caching/run_text_cache.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from pathlib import Path + + +def pre_generate_text_cache(task_id_str: str, *, ignore_cache: bool = False) -> Path: + """Generate the cache as it would be generated on-the-fly during preprocessing, + and save it to a parquet file for later loading. 
+ """ + from tabarena.benchmark.preprocessing.model_agnostic_default_preprocessing import TabArenaModelAgnosticPreprocessing + from tabarena.benchmark.preprocessing.text_feature_generators import SemanticTextFeatureGenerator + from tabarena.benchmark.task.openml import OpenMLTaskWrapper + from tabarena.benchmark.task.user_task import UserTask + + task_id_or_object = UserTask.from_task_id_str(task_id_str) + cache_path = SemanticTextFeatureGenerator.get_text_cache_dir(task_id_str=str(task_id_or_object.task_id)) + if (not ignore_cache) and cache_path.exists(): + print(f"Cache already exists for {task_id_str} at {cache_path}, skipping generation.") + return cache_path + + task = OpenMLTaskWrapper( + task=task_id_or_object.load_local_openml_task(), + ) + print(f"Loaded {task_id_str}, with {len(task.X)} rows and {len(task.X.columns)} columns.") + preprocessing = TabArenaModelAgnosticPreprocessing( + enable_sematic_text_features=True, + enable_raw_text_features=False, + enable_text_special_features=False, + enable_statistical_text_features=False, + enable_text_ngram_features=False, + enable_datetime_features=False, + verbosity=4, + ) + preprocessing.fit_transform(X=task.X) + + cache_path = SemanticTextFeatureGenerator.get_text_cache_dir(task_id_str=str(task.task_id)) + SemanticTextFeatureGenerator.save_embedding_cache( + cache=SemanticTextFeatureGenerator._embedding_look_up, path=cache_path + ) + SemanticTextFeatureGenerator._embedding_look_up.clear() + print(f"Cache generated and saved to: {cache_path}") + return cache_path + + +if __name__ == "__main__": + import argparse + import logging + + logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(name)s %(levelname)s: %(message)s") + + # TODO: add support for setting the OpenML cache dir here as well. 
+ parser = argparse.ArgumentParser() + # Require tasks settings + parser.add_argument( + "--task_id_str", + type=str, + required=True, + help="User Task ID for a dataset with text.", + ) + args = parser.parse_args() + + pre_generate_text_cache(args.task_id_str) diff --git a/tabflow_slurm/run_tabarena_experiment.py b/tabflow_slurm/run_tabarena_experiment.py index f93a6aaec..994b4d943 100644 --- a/tabflow_slurm/run_tabarena_experiment.py +++ b/tabflow_slurm/run_tabarena_experiment.py @@ -48,6 +48,9 @@ def setup_slurm_job( import tempfile import ray + import os + + os.environ["RAY_DISABLE_RETRIES"] = "1" ray_dir = tempfile.mkdtemp() + "/ray" @@ -65,18 +68,22 @@ def setup_slurm_job( # Likely slower but runs at least. _plasma_directory = ray_dir - ray.init( - address="local", - _memory=ray_mem_in_b, - object_store_memory=int(ray_mem_in_b * 0.3), - _temp_dir=ray_dir, - include_dashboard=False, - logging_level=logging.INFO, - log_to_driver=True, - num_gpus=num_gpus, - num_cpus=num_cpus, - _plasma_directory=_plasma_directory, - ) + import warnings + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + ray.init( + address="local", + _memory=ray_mem_in_b, + object_store_memory=int(ray_mem_in_b * 0.3), + _temp_dir=ray_dir, + include_dashboard=False, + logging_level=logging.INFO, + log_to_driver=True, + num_gpus=num_gpus, + num_cpus=num_cpus, + _plasma_directory=_plasma_directory, + ) return ray_dir @@ -389,9 +396,17 @@ def _parse_int_or_none(s): parser.add_argument( "--num_gpus", type=int, - help="Number of GPUs to use for the experiment.", + help="Number of GPUs to use for the experiment (SLURM node allocation and Ray).", default=0, ) + parser.add_argument( + "--num_gpus_model", + type=_parse_int_or_none, + help="Number of GPUs passed to AutoGluon for model fitting. " + "If None, defaults to --num_gpus. Set to 0 to reserve the GPU " + "for preprocessing only (e.g. 
text embedding) while fitting models on CPU.", + default=None, + ) parser.add_argument( "--memory_limit", type=_parse_int_or_none, @@ -430,6 +445,9 @@ def _parse_int_or_none(s): f"Memory limit not provided, using detected memory size: {memory_limit} GB" ) + num_gpus_model = args.num_gpus_model if args.num_gpus_model is not None else args.num_gpus + print(f"GPUs for node/Ray: {args.num_gpus}, GPUs for model fitting: {num_gpus_model}") + ray_temp_dir = setup_slurm_job( openml_cache_dir=args.openml_cache_dir, setup_ray_for_slurm_shared_resources_environment=args.setup_ray_for_slurm_shared_resources_environment, @@ -447,7 +465,7 @@ def _parse_int_or_none(s): output_dir=args.output_dir, ignore_cache=args.ignore_cache, num_cpus=num_cpus, - num_gpus=args.num_gpus, + num_gpus=num_gpus_model, memory_limit=memory_limit, sequential_local_fold_fitting=args.sequential_local_fold_fitting, dynamic_tabarena_validation_protocol=args.dynamic_tabarena_validation_protocol, diff --git a/tabflow_slurm/setup_slurm_base_v2.py b/tabflow_slurm/setup_slurm_base_v2.py index 77d04cc89..4f44406ff 100644 --- a/tabflow_slurm/setup_slurm_base_v2.py +++ b/tabflow_slurm/setup_slurm_base_v2.py @@ -2,6 +2,7 @@ import json import re +import warnings from copy import deepcopy from dataclasses import dataclass, field from pathlib import Path @@ -70,10 +71,7 @@ def run_script_path(self) -> str: """Python script to run the benchmark. This should point to the script that runs the benchmark for TabArena. """ - return self.base_path + ( - f"code/{self.tabarena_repo_name}/tabarena" - f"/tabflow_slurm/run_tabarena_experiment.py" - ) + return self.base_path + (f"code/{self.tabarena_repo_name}/tabarena/tabflow_slurm/run_tabarena_experiment.py") @property def configs_base_path(self) -> str: @@ -82,9 +80,7 @@ def configs_base_path(self) -> str: File path is f"{self.base_path}{self.configs_path_from_base_path} {self._safe_benchmark_name}.yaml". 
""" - return ( - f"code/{self.tabarena_repo_name}/tabarena/tabflow_slurm/benchmark_configs_" - ) + return f"code/{self.tabarena_repo_name}/tabarena/tabflow_slurm/benchmark_configs_" def get_slurm_job_json_path(self, safe_benchmark_name: str) -> str: """JSON file with the job data to run used by SLURM. @@ -92,10 +88,7 @@ def get_slurm_job_json_path(self, safe_benchmark_name: str) -> str: """ # TODO: change UX for config and slurm paths. path_to_config_file = str(Path(self.configs_base_path).parent) + "/" - return ( - f"{self.base_path}{path_to_config_file}" - f"slurm_run_data_{safe_benchmark_name}.json" - ) + return f"{self.base_path}{path_to_config_file}slurm_run_data_{safe_benchmark_name}.json" def get_configs_path(self, safe_benchmark_name: str) -> str: """YAML file with the configs to run.""" @@ -111,10 +104,7 @@ def get_slurm_log_output_path(self, benchmark_name: str) -> str: def get_slurm_script_path(self, script_name: str) -> str: """Path to the SLURM script to run.""" - return ( - self.base_path - + f"code/{self.tabarena_repo_name}/tabarena/tabflow_slurm/{script_name}" - ) + return self.base_path + f"code/{self.tabarena_repo_name}/tabarena/tabflow_slurm/{script_name}" @dataclass @@ -187,6 +177,23 @@ class BenchmarkSetup2026: split_indices_to_run: list[str] | Literal["lite"] | None = None """Split indices to run in the benchmark. Adjust as needed to run only specific splits. If None, we run all splits. If "lite", we run only the first split.""" + required_dtypes_to_run: list[str] | None = None + """Adjust as needed to run only datasets with at least one column of data types. + Options: "numeric", "categorical", "text", "datetime". + If None, we do not require any data types. + """ + forbidden_dtypes_to_run: list[str] | None = None + """Adjust as needed to run only datasets without any columns of data types. + Options: "numeric", "categorical", "text", "datetime". + If None, we do not forbid any data types. 
+ """ + n_train_samples_to_run: tuple[int | None, int | None] | None = None + """Tuple of lower and upper limit for the number of training samples of datasets run in the benchmark. + Adjust as needed to run only datasets with a certain number of training samples. + If None, we run all datasets. + Lower limit is inclusive, upper limit is exclusive. For example, (0, 1000) runs only datasets with less + than 1000 training samples. If a tuple value is None, there is no limit in that direction. + """ path_setup: PathSetup = field(default_factory=PathSetup) """Contains all path related to the benchmark.""" @@ -209,7 +216,12 @@ class BenchmarkSetup2026: """Number of CPUs to use for the job. If None, use all available CPUs.""" num_gpus: int = 0 - """Number of GPUs to use for the jobs.""" + """Number of GPUs to use for the jobs (SLURM allocation and Ray).""" + num_gpus_model: int | None = None + """Number of GPUs passed to a model for fitting. + If None (default), uses the same value as ``num_gpus``. + Set to 0 to reserve the GPU for preprocessing (e.g. sentence-transformer + encoding) while fitting models on CPU only.""" memory_limit: int | None = 32 """Memory/RAM limit for the jobs in GB. If None, use all available memory.""" @@ -248,9 +260,7 @@ class BenchmarkSetup2026: This can be disabled by setting this to False. Warning: the model then needs to be able to handle this! """ - preprocessing_pipelines: list[str] = field( - default_factory=lambda: ["tabarena_default"] - ) + preprocessing_pipelines: list[str] = field(default_factory=lambda: ["tabarena_default"]) """EXPERIMENTAL! Preprocessing pipelines to add to the configurations we want to run. @@ -297,6 +307,11 @@ class BenchmarkSetup2026: } } """ + max_predict_batch_size: int | None = 50_000 + """Maximal batch size for the predict function of the models. + This is used at validation and test predict time. Thus, it trades off speed for memory usage. + If None, no limit is applied. 
+ """ # Misc Settings # ------------- @@ -416,9 +431,7 @@ def _get_slurm_base_command( # noqa: PLR0913 partition = "--partition=" + partition slurm_logs = f"--output={slurm_log_output}/%A/slurm-%A_%a.out" - time_in_h = ( - time_limit_per_config // 3600 * configs_per_job + time_limit_overhead - ) + time_in_h = time_limit_per_config // 3600 * configs_per_job + time_limit_overhead time_in_h = f"--time={time_in_h}:00:00" # Handle GPU (same for exclusive and non-exclusive) @@ -453,10 +466,7 @@ def _get_slurm_base_command( # noqa: PLR0913 @property def slurm_base_command(self): """SLURM command to run the benchmark.""" - p_bm = self._parallel_safe_benchmark_name - slurm_script_path = self.path_setup.get_slurm_script_path( - self.slurm_setup.script_name - ) + slurm_script_path = self.path_setup.get_slurm_script_path(self.slurm_setup.script_name) return self._get_slurm_base_command( num_cpus=self.num_cpus, @@ -465,7 +475,7 @@ def slurm_base_command(self): time_limit_per_config=self.time_limit_per_config, configs_per_job=self._max_configs_per_job, time_limit_overhead=self.slurm_setup.time_limit_overhead, - slurm_log_output=self.path_setup.get_slurm_log_output_path(p_bm), + slurm_log_output=self.path_setup.get_slurm_log_output_path(self.benchmark_name), slurm_script_path=slurm_script_path, slurm_extra_gres=self.slurm_setup.extra_gres, slurm_exclusive_node=self.slurm_setup.exclusive_node, @@ -504,17 +514,15 @@ def _load_task_metadata(self) -> list[TabArenaTaskMetadata]: for repeat_i in range(n_repeats): for fold_i in range(n_folds): - split_index = SplitMetadata.get_split_index(repeat_i=repeat_i, fold_i=fold_i) splits_metadata = { - split_index: - SplitMetadata( + split_index: SplitMetadata( repeat=repeat_i, fold=fold_i, - num_instances_train=num_instances * 2/3, - num_instances_test=num_instances * 1/3, - num_instance_groups_train=num_instances * 2/3, - num_instance_groups_test=num_instances * 1/3, + num_instances_train=num_instances * 2 / 3, + 
num_instances_test=num_instances * 1 / 3, + num_instance_groups_train=num_instances * 2 / 3, + num_instance_groups_test=num_instances * 1 / 3, num_classes_train=num_classes, num_classes_test=num_classes, num_features_train=num_features, @@ -553,53 +561,65 @@ def _load_task_metadata(self) -> list[TabArenaTaskMetadata]: task_metadata = pd.read_csv(task_metadata, index_col=False) if isinstance(task_metadata, pd.DataFrame): # Parse task_metadat - task_metadata = [ - TabArenaTaskMetadata.from_row(row) - for _, row in task_metadata.iterrows() - ] + task_metadata = [TabArenaTaskMetadata.from_row(row) for _, row in task_metadata.iterrows()] assert all(isinstance(x, TabArenaTaskMetadata) for x in task_metadata) n_rolled_up_tasks = len(task_metadata) # Unify format to be unrolled - task_metadata = [ - single_ttm for ttm in task_metadata for single_ttm in ttm.unroll_splits() - ] + task_metadata = [single_ttm for ttm in task_metadata for single_ttm in ttm.unroll_splits()] n_unrolled_tasks = len(task_metadata) # -- Perform general filters/slices - task_metadata = [ - ttm - for ttm in task_metadata - if ttm.problem_type in self.problem_types_to_run - ] + task_metadata = [ttm for ttm in task_metadata if ttm.problem_type in self.problem_types_to_run] n_problem_types_filtered_tasks = len(task_metadata) if self.split_indices_to_run is not None: if self.split_indices_to_run == "lite": - split_indices_to_run = [ - SplitMetadata.get_split_index(repeat_i=0, fold_i=0) - ] + split_indices_to_run = [SplitMetadata.get_split_index(repeat_i=0, fold_i=0)] else: split_indices_to_run = self.split_indices_to_run # Assert split indices are valid split_index_pattern = re.compile(r"^r\d+f\d+$") for split_index in split_indices_to_run: - assert ( - split_index_pattern.match(split_index) - ), f"Invalid SplitIndex format: {split_index!r}, expected 'r{{int}}f{{int}}'" + assert split_index_pattern.match(split_index), ( + f"Invalid SplitIndex format: {split_index!r}, expected 'r{{int}}f{{int}}'" + ) + + 
task_metadata = [ttm for ttm in task_metadata if ttm.split_index in split_indices_to_run] + n_splits_filtered_tasks = len(task_metadata) + # Filter based on dtypes if specified + if (self.forbidden_dtypes_to_run is not None) or (self.required_dtypes_to_run is not None): task_metadata = [ - ttm for ttm in task_metadata if ttm.split_index in split_indices_to_run + ttm + for ttm in task_metadata + if ttm.has_supported_dtypes( + required_dtypes=self.required_dtypes_to_run, + forbidden_dtypes=self.forbidden_dtypes_to_run, + ) ] - n_splits_filtered_tasks = len(task_metadata) + n_dtypes_filtered_tasks = len(task_metadata) + + # Filter based on training samples if specified + if self.n_train_samples_to_run is not None: + lb, ub = self.n_train_samples_to_run + lb = lb if lb is not None else 0 + ub = ub if ub is not None else float("inf") + task_metadata = [ + ttm + for ttm in task_metadata + if ( + (ttm.splits_metadata[ttm.split_index].num_instances_train < ub) + and (ttm.splits_metadata[ttm.split_index].num_instances_train >= lb) + ) + ] + n_sizes_filtered_tasks = len(task_metadata) # -- Sanity checks for ttm in task_metadata: if ttm.task_id_str is None: - raise ValueError( - f"Task metadata for task {ttm.tabarena_task_name} does not have a task_id_str!" - ) + raise ValueError(f"Task metadata for task {ttm.tabarena_task_name} does not have a task_id_str!") print( f"Found {len(task_metadata)} tasks from metadata." @@ -607,6 +627,8 @@ def _load_task_metadata(self) -> list[TabArenaTaskMetadata]: f"\n\t(1) {n_rolled_up_tasks} datasets -> {n_unrolled_tasks} Tasks." f"\n\t(2) Filter to problem types: {n_problem_types_filtered_tasks}" f"\n\t(3) Filter to splits: {n_splits_filtered_tasks}." + f"\n\t(4) Filter to dtypes: {n_dtypes_filtered_tasks}." + f"\n\t(5) Filter to dataset size: {n_sizes_filtered_tasks}." 
) return task_metadata @@ -616,12 +638,8 @@ def get_jobs_to_run(self): # noqa: C901 """ if self.path_setup.openml_cache_path != "auto": Path(self.path_setup.openml_cache_path).mkdir(parents=True, exist_ok=True) - Path(self.path_setup.get_output_path(self.benchmark_name)).mkdir( - parents=True, exist_ok=True - ) - Path(self.path_setup.get_slurm_log_output_path(self.benchmark_name)).mkdir( - parents=True, exist_ok=True - ) + Path(self.path_setup.get_output_path(self.benchmark_name)).mkdir(parents=True, exist_ok=True) + Path(self.path_setup.get_slurm_log_output_path(self.benchmark_name)).mkdir(parents=True, exist_ok=True) task_metadata_list = self._load_task_metadata() configs = self.generate_configs_yaml() @@ -629,9 +647,7 @@ def get_jobs_to_run(self): # noqa: C901 def yield_all_jobs(): for ta_task_metadata in task_metadata_list: task_id = ta_task_metadata.task_id_str - split_md = ta_task_metadata.splits_metadata[ - ta_task_metadata.split_index - ] + split_md = ta_task_metadata.splits_metadata[ta_task_metadata.split_index] for config_index, config in list(enumerate(configs)): yield { @@ -650,7 +666,9 @@ def yield_all_jobs(): # Check cache and filter invalid jobs in parallel using Ray if ray.is_initialized: ray.shutdown() - ray.init(num_cpus=self.num_ray_cpus) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + ray.init(num_cpus=self.num_ray_cpus) output = ray_map_list( list_to_map=list(to_batch_list(jobs_to_check, 10_000)), func=should_run_job_batch, @@ -665,9 +683,7 @@ def yield_all_jobs(): track_progress=True, tqdm_kwargs={"desc": "Checking Cache and Filter Invalid Jobs"}, ) - output = [ - item for sublist in output for item in sublist - ] # Flatten the batched list + output = [item for sublist in output for item in sublist] # Flatten the batched list to_run_job_map = {} for run_job, job_data in zip(output, jobs_to_check, strict=True): if run_job: @@ -686,9 +702,7 @@ def yield_all_jobs(): max_config_batch = 1 for job_key, 
config_indices in to_run_job_map.items(): to_run_jobs += len(config_indices) - for config_batch in to_batch_list( - config_indices, self.slurm_setup.configs_per_job - ): + for config_batch in to_batch_list(config_indices, self.slurm_setup.configs_per_job): max_config_batch = max(max_config_batch, len(config_batch)) jobs.append( { @@ -703,9 +717,7 @@ def yield_all_jobs(): print(f"Jobs with batching: {len(jobs)}") return jobs - def _generate_autogluon_config( - self, *, model_name: str, agexp_kwargs: dict, pipeline_method_kwargs: dict - ) -> list: + def _generate_autogluon_config(self, *, model_name: str, agexp_kwargs: dict, pipeline_method_kwargs: dict) -> list: """Parse the AutoGluon config from the models.""" from tabarena.benchmark.experiment.experiment_constructor import ( AGExperiment, @@ -741,8 +753,7 @@ def _generate_model_configs( n_configs = self.n_random_configs elif not isinstance(n_configs, int): raise ValueError( - f"Invalid number of configurations for model {model_name}: {n_configs}. " - "Must be an integer or 'all'." + f"Invalid number of configurations for model {model_name}: {n_configs}. Must be an integer or 'all'." 
) config_generator = get_configs_generator_from_name(model_name) # TODO: add model agnostic time limit here @@ -768,22 +779,22 @@ def generate_configs_yaml(self) -> list[dict]: "init_kwargs": {"verbosity": self.verbosity}, "shuffle_features": self.shuffle_features, "fit_kwargs": dict(), + "model_hyperparameters": dict(), } if self.model_artifacts_base_path is not None: - method_kwargs["init_kwargs"]["default_base_path"] = ( - self.model_artifacts_base_path - ) + method_kwargs["init_kwargs"]["default_base_path"] = self.model_artifacts_base_path if not self.model_agnostic_preprocessing: method_kwargs["fit_kwargs"]["feature_generator"] = None if self.adapt_num_folds_to_n_classes: method_kwargs["fit_kwargs"]["adapt_num_bag_folds_to_n_classes"] = True + if self.max_predict_batch_size is not None: + method_kwargs["model_hyperparameters"]["ag.max_batch_size"] = self.max_predict_batch_size print( "Generating experiments for models...", f"\n\t`all` := number of configs: {self.n_random_configs}", f"\n\t{len(self.models)} models: {self.models}", - f"\n\t{len(self.preprocessing_pipelines)} preprocessing pipelines: " - f"{self.preprocessing_pipelines}", + f"\n\t{len(self.preprocessing_pipelines)} preprocessing pipelines: {self.preprocessing_pipelines}", f"\n\tMethod kwargs: {method_kwargs}", ) for preprocessing_name in self.preprocessing_pipelines: @@ -819,9 +830,7 @@ def generate_configs_yaml(self) -> list[dict]: ) # Verify no duplicate names - experiments_all = [ - exp for exp_family_lst in experiments_lst for exp in exp_family_lst - ] + experiments_all = [exp for exp_family_lst in experiments_lst for exp in exp_family_lst] experiment_names = set() for experiment in experiments_all: if experiment.name not in experiment_names: @@ -832,9 +841,7 @@ def generate_configs_yaml(self) -> list[dict]: f"All experiment names must be unique!", ) - configs_path = self.path_setup.get_configs_path( - self._parallel_safe_benchmark_name - ) + configs_path = 
self.path_setup.get_configs_path(self._parallel_safe_benchmark_name) YamlExperimentSerializer.to_yaml( experiments=experiments_all, path=configs_path, @@ -857,12 +864,11 @@ def get_jobs_dict(self): "python": self.path_setup.python_path, "run_script": self.path_setup.run_script_path, "openml_cache_dir": self.path_setup.openml_cache_path, - "configs_yaml_file": self.path_setup.get_configs_path( - self._parallel_safe_benchmark_name - ), + "configs_yaml_file": self.path_setup.get_configs_path(self._parallel_safe_benchmark_name), "output_dir": self.path_setup.get_output_path(self.benchmark_name), "num_cpus": self.num_cpus, "num_gpus": self.num_gpus, + "num_gpus_model": self.num_gpus_model, "memory_limit": memory_limit, "setup_ray_for_slurm_shared_resources_environment": self.slurm_setup.setup_ray_for_slurm_shared_resources_environment, "ignore_cache": self.ignore_cache, @@ -871,7 +877,7 @@ def get_jobs_dict(self): } return {"defaults": default_args, "jobs": jobs} - def setup_jobs(self, array_job_limit: int = 100) -> str | list[str]: + def setup_jobs(self, array_job_limit: int = 100) -> list[str]: """Setup the jobs to run by generating the SLURM job JSON file(s). If the number of jobs exceeds `slurm_setup.max_array_size`, the jobs @@ -881,17 +887,13 @@ def setup_jobs(self, array_job_limit: int = 100) -> str | list[str]: strings if multiple batches are needed. 
""" jobs_dict = self.get_jobs_dict() - base_json_path = self.path_setup.get_slurm_job_json_path( - self._parallel_safe_benchmark_name - ) + base_json_path = self.path_setup.get_slurm_job_json_path(self._parallel_safe_benchmark_name) all_jobs = jobs_dict["jobs"] n_jobs = len(all_jobs) if n_jobs == 0: print("No jobs to run.") Path(base_json_path).unlink(missing_ok=True) - Path( - self.path_setup.get_configs_path(self._parallel_safe_benchmark_name) - ).unlink(missing_ok=True) + Path(self.path_setup.get_configs_path(self._parallel_safe_benchmark_name)).unlink(missing_ok=True) return "N/A" max_array_size = self.slurm_setup.max_array_size @@ -904,27 +906,19 @@ def setup_jobs(self, array_job_limit: int = 100) -> str | list[str]: if len(job_batches) == 1: json_path = base_json_path else: - json_path = base_json_path.replace( - ".json", f"_batch{batch_idx}.json" - ) + json_path = base_json_path.replace(".json", f"_batch{batch_idx}.json") batch_dict = {"defaults": jobs_dict["defaults"], "jobs": batch_jobs} with open(json_path, "w") as f: json.dump(batch_dict, f) batch_size = len(batch_jobs) - run_command = ( - f"sbatch --array=0-{batch_size - 1}%{array_job_limit}" - f" {self.slurm_base_command} {json_path}" - ) + run_command = f"sbatch --array=0-{batch_size - 1}%{array_job_limit} {self.slurm_base_command} {json_path}" run_commands.append(run_command) batch_info = "" if len(job_batches) > 1: - batch_info = ( - f"\nSplit into {len(job_batches)} array job batches" - f" (max {max_array_size} per batch)." - ) + batch_info = f"\nSplit into {len(job_batches)} array job batches (max {max_array_size} per batch)." 
print( f"##### Setup Jobs for {self._parallel_safe_benchmark_name}" f"{batch_info}" @@ -932,8 +926,6 @@ def setup_jobs(self, array_job_limit: int = 100) -> str | list[str]: f"\n" + "\n".join(run_commands) + "\n" ) - if len(run_commands) == 1: - return run_commands[0] return run_commands @property @@ -1023,20 +1015,12 @@ def are_model_constraints_valid( if (max_n_features is not None) and (n_features > max_n_features): return False - max_n_samples_train_per_fold = model_constraints.get( - "max_n_samples_train_per_fold", None - ) - if (max_n_samples_train_per_fold is not None) and ( - n_samples_train_per_fold > max_n_samples_train_per_fold - ): + max_n_samples_train_per_fold = model_constraints.get("max_n_samples_train_per_fold", None) + if (max_n_samples_train_per_fold is not None) and (n_samples_train_per_fold > max_n_samples_train_per_fold): return False - min_n_samples_train_per_fold = model_constraints.get( - "min_n_samples_train_per_fold", None - ) - if (min_n_samples_train_per_fold is not None) and ( - n_samples_train_per_fold < min_n_samples_train_per_fold - ): + min_n_samples_train_per_fold = model_constraints.get("min_n_samples_train_per_fold", None) + if (min_n_samples_train_per_fold is not None) and (n_samples_train_per_fold < min_n_samples_train_per_fold): return False max_n_classes = model_constraints.get("max_n_classes", None) diff --git a/tabflow_slurm/submit_template.sh b/tabflow_slurm/submit_template.sh index b0fb3d738..16572a6d2 100644 --- a/tabflow_slurm/submit_template.sh +++ b/tabflow_slurm/submit_template.sh @@ -43,6 +43,7 @@ CONFIGS_YAML_FILE=$(jq -r '.defaults.configs_yaml_file' "$JSON_FILE") OUTPUT_DIR=$(jq -r '.defaults.output_dir' "$JSON_FILE") NUM_CPUS=$(jq -r '.defaults.num_cpus' "$JSON_FILE") NUM_GPUS=$(jq -r '.defaults.num_gpus' "$JSON_FILE") +NUM_GPUS_MODEL=$(jq -r '.defaults.num_gpus_model' "$JSON_FILE") MEMORY_LIMIT=$(jq -r '.defaults.memory_limit' "$JSON_FILE") SETUP_RAY=$(jq -r 
'.defaults.setup_ray_for_slurm_shared_resources_environment' "$JSON_FILE") IGNORE_CACHE=$(jq -r '.defaults.ignore_cache' "$JSON_FILE") @@ -58,6 +59,7 @@ echo "Configs YAML File: $CONFIGS_YAML_FILE" echo "Output Directory: $OUTPUT_DIR" echo "Number of CPUs: $NUM_CPUS" echo "Number of GPUs: $NUM_GPUS" +echo "Number of GPUs for model fitting: $NUM_GPUS_MODEL" echo "Memory Limit: $MEMORY_LIMIT" echo "Setup Ray for SLURM Shared Resources Environment: $SETUP_RAY" echo "Ignore Cache: $IGNORE_CACHE" @@ -94,6 +96,7 @@ for CI in "${CONFIG_ARRAY[@]}"; do --output_dir $OUTPUT_DIR \ --num_cpus $NUM_CPUS \ --num_gpus $NUM_GPUS \ + --num_gpus_model $NUM_GPUS_MODEL \ --memory_limit $MEMORY_LIMIT \ --setup_ray_for_slurm_shared_resources_environment $SETUP_RAY \ --ignore_cache $IGNORE_CACHE \ diff --git a/tst/benchmark/experiment/test_validation_utils.py b/tst/benchmark/experiment/test_validation_utils.py index ca997d519..ccb96fabe 100644 --- a/tst/benchmark/experiment/test_validation_utils.py +++ b/tst/benchmark/experiment/test_validation_utils.py @@ -270,6 +270,29 @@ def test_get_num_group_instances_no_group(): assert v.get_num_group_instances(X) == 7 +@pytest.mark.skipif(not _DATA_FOUNDRY_AVAILABLE, reason="data_foundry not installed") +def test_resolve_validation_splits_group_on_with_num_repeats_none(): + """When group_on is set and num_repeats is None, num_repeats should default to 1. + + Regression test: previously num_repeats=None was passed through to + _resolve_group_splits which expects an integer, causing a crash. 
+ """ + n = 600 # > 500 so tiny-data path does not override num_repeats + groups = [f"g{i % 10}" for i in range(n)] + X = pd.DataFrame({"feature": np.arange(n, dtype=float), "grp": groups}) + y = pd.Series(np.zeros(n)) + v = _Validation( + use_task_specific_validation=True, + group_on="grp", + group_labels=GroupLabelTypes.PER_SAMPLE, + ) + custom_splits, folds, repeats = v.resolve_validation_splits( + X=X, y=y, num_folds=8, num_repeats=None, + ) + assert custom_splits is not None + assert repeats == 1 + + # =========================================================================== # Additional split_time_index_into_intervals tests # =========================================================================== diff --git a/tst/benchmark/preprocessing/test_preprocessing.py b/tst/benchmark/preprocessing/test_preprocessing.py index 8678789ee..60edf692c 100644 --- a/tst/benchmark/preprocessing/test_preprocessing.py +++ b/tst/benchmark/preprocessing/test_preprocessing.py @@ -19,6 +19,7 @@ from tabarena.benchmark.preprocessing.text_feature_generators import ( SemanticTextFeatureGenerator, StatisticalTextFeatureGenerator, + TabArenaDefaultTextEncoder, TextEmbeddingDimensionalityReductionFeatureGenerator, sanitize_text, ) @@ -295,6 +296,41 @@ def test_fit_transform_with_integer_columns(self): X_out = gen.fit_transform(X) assert len(X_out) == 3 + def test_fit_transform_renames_dot_columns(self): + gen = _make_no_text_gen() + X = pd.DataFrame({"a.b": [1.0, 2.0, 3.0], "c": [4.0, 5.0, 6.0]}) + X_out = gen.fit_transform(X.copy()) + assert "a_b" in X_out.columns + assert "a.b" not in X_out.columns + + def test_fit_transform_leaves_clean_columns_unchanged(self): + gen = _make_no_text_gen() + X = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b_c": [4.0, 5.0, 6.0]}) + X_out = gen.fit_transform(X.copy()) + assert "a" in X_out.columns + assert "b_c" in X_out.columns + + def test_fit_transform_multiple_dots_all_replaced(self): + gen = _make_no_text_gen() + X = pd.DataFrame({"a.b.c": [1.0, 2.0, 
3.0]}) + X_out = gen.fit_transform(X.copy()) + assert "a_b_c" in X_out.columns + + def test_transform_also_renames_dot_columns(self): + gen = _make_no_text_gen() + X_train = pd.DataFrame({"a.b": [1.0, 2.0, 3.0], "c": [4.0, 5.0, 6.0]}) + gen.fit_transform(X_train.copy()) + X_test = pd.DataFrame({"a.b": [7.0, 8.0], "c": [9.0, 10.0]}) + X_out = gen.transform(X_test) + assert "a_b" in X_out.columns + assert "a.b" not in X_out.columns + + def test_no_dot_columns_produces_empty_rename_map(self): + gen = _make_no_text_gen() + X = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}) + gen.fit_transform(X.copy()) + assert gen._dot_rename_map_ == {} + # =========================================================================== # NoCatAsStringCategoryFeatureGenerator @@ -444,41 +480,6 @@ def test_is_astype_feature_generator_subclass(self): assert issubclass(StringFixAsTypeFeatureGenerator, AsTypeFeatureGenerator) - def test_fit_transform_renames_dot_columns(self): - gen = StringFixAsTypeFeatureGenerator() - X = pd.DataFrame({"a.b": [1.0, 2.0, 3.0], "c": [4.0, 5.0, 6.0]}) - X_out = gen.fit_transform(X.copy()) - assert "a_b" in X_out.columns - assert "a.b" not in X_out.columns - - def test_fit_transform_leaves_clean_columns_unchanged(self): - gen = StringFixAsTypeFeatureGenerator() - X = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b_c": [4.0, 5.0, 6.0]}) - X_out = gen.fit_transform(X.copy()) - assert "a" in X_out.columns - assert "b_c" in X_out.columns - - def test_fit_transform_multiple_dots_all_replaced(self): - gen = StringFixAsTypeFeatureGenerator() - X = pd.DataFrame({"a.b.c": [1.0, 2.0, 3.0]}) - X_out = gen.fit_transform(X.copy()) - assert "a_b_c" in X_out.columns - - def test_transform_also_renames_dot_columns(self): - gen = StringFixAsTypeFeatureGenerator() - X_train = pd.DataFrame({"a.b": [1.0, 2.0, 3.0], "c": [4.0, 5.0, 6.0]}) - gen.fit_transform(X_train.copy()) - X_test = pd.DataFrame({"a.b": [7.0, 8.0], "c": [9.0, 10.0]}) - X_out = gen.transform(X_test) - assert 
"a_b" in X_out.columns - assert "a.b" not in X_out.columns - - def test_no_dot_columns_produces_empty_rename_map(self): - gen = StringFixAsTypeFeatureGenerator() - X = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}) - gen.fit_transform(X.copy()) - assert gen._dot_rename_map_ == {} - # =========================================================================== # StringFixAsTypeFeatureGenerator – categorical dtype special cases @@ -643,41 +644,106 @@ def test_float_column_unchanged_by_categorical_fix(self): # ------------------------------------------------------------------ # ------------------------------------------------------------------ - # Binary column gaining a third category at test time + # Bool columns with unseen values at test time + # + # Bool encoding always applies: true_val → 1, everything else → 0. + # Unseen values are mapped to 0 (False) and a warning is logged. + # The column stays in _bool_features and keeps its int8 dtype. # ------------------------------------------------------------------ - def test_binary_column_gaining_third_category_not_silently_mapped(self): - """A binary column (bool-encoded as int8 at fit time) that gains a third value - at test time must not silently map that value to 0 (false). - - With only 2 unique values at fit time the column is stored in _bool_features and - encoded as int8 via _convert_to_bool (== true_val → 1, else → 0). A 3rd value - that appears at test time would silently become 0 without our fix; instead we - convert the whole column to categorical so all values are preserved. 
- """ + def test_bool_col_unseen_value_mapped_to_false(self): + """An unseen value in a bool column must be mapped to 0 (False).""" X_train = pd.DataFrame({"col": pd.Categorical(["yes", "no", "yes", "no"])}) gen = StringFixAsTypeFeatureGenerator() gen.fit_transform(X_train.copy()) - assert "col" in gen._bool_features, "Expected binary column to be bool-encoded at fit time" + assert "col" in gen._bool_features X_test = pd.DataFrame({"col": pd.Categorical(["yes", "no", "maybe"])}) X_out = gen.transform(X_test.copy()) - assert X_out["col"].isna().sum() == 0, "'maybe' was converted to NaN" - values = set(X_out["col"].astype(object).tolist()) - assert "maybe" in values, "'maybe' was silently discarded / mapped to 0 or 1" + assert X_out["col"].dtype == np.int8 + assert X_out["col"].iloc[0] == 1 # 'yes' (true_val) → 1 + assert X_out["col"].iloc[1] == 0 # 'no' (false_val) → 0 + assert X_out["col"].iloc[2] == 0 # 'maybe' (unseen) → 0 + + def test_bool_int_col_unseen_value_mapped_to_false(self): + """An unseen integer in a bool int column (0/1) must be mapped to 0.""" + X_train = pd.DataFrame({"b": [0, 1, 0, 1]}) + gen = self._fit_gen(X_train) + X_out = gen.transform(pd.DataFrame({"b": [0, 1, 2]})) + assert X_out["b"].dtype == np.int8 + assert list(X_out["b"]) == [0, 1, 0] # 2 is unseen → 0 + + def test_bool_int_col_multiple_unseen_values_all_map_to_false(self): + """All unseen integer values must be mapped to 0.""" + X_train = pd.DataFrame({"b": [0, 1, 0, 1]}) + gen = self._fit_gen(X_train) + X_out = gen.transform(pd.DataFrame({"b": [0, 1, 5, 7, 9]})) + assert list(X_out["b"]) == [0, 1, 0, 0, 0] + + def test_bool_string_col_unseen_value_mapped_to_false(self): + """An unseen string in a bool string column must be mapped to 0.""" + X_train = pd.DataFrame({"b": pd.Categorical(["yes", "no", "yes", "no"])}) + gen = self._fit_gen(X_train) + X_out = gen.transform(pd.DataFrame({"b": pd.Categorical(["yes", "no", "maybe"])})) + assert X_out["b"].dtype == np.int8 + assert 
X_out["b"].iloc[0] == 1 # 'yes' → 1 + assert X_out["b"].iloc[1] == 0 # 'no' → 0 + assert X_out["b"].iloc[2] == 0 # 'maybe' → 0 + + def test_bool_col_stays_in_bool_features_after_unseen(self): + """A bool column with unseen values must remain in _bool_features.""" + X_train = pd.DataFrame({"b": [0, 1, 0, 1]}) + gen = self._fit_gen(X_train) + assert "b" in gen._bool_features + + gen.transform(pd.DataFrame({"b": [0, 1, 2]})) + assert "b" in gen._bool_features, "Column must remain a bool feature" + + def test_bool_col_second_transform_still_bool_encodes(self): + """A second transform call must still apply bool encoding.""" + X_train = pd.DataFrame({"b": [0, 1, 0, 1]}) + gen = self._fit_gen(X_train) + + gen.transform(pd.DataFrame({"b": [0, 1, 2]})) + + X_out = gen.transform(pd.DataFrame({"b": [0, 1, 3]})) + assert X_out["b"].dtype == np.int8 + assert list(X_out["b"]) == [0, 1, 0] # 3 is unseen → 0 + + def test_bool_col_unseen_other_bool_col_unaffected(self): + """A sibling bool column without unseen values must still be bool-encoded normally.""" + X_train = pd.DataFrame({"a": [0, 1, 0, 1], "b": [0, 1, 0, 1]}) + gen = self._fit_gen(X_train) - def test_binary_column_without_extra_categories_still_bool_encoded(self): + X_test = pd.DataFrame({"a": [0, 1, 2], "b": [0, 1, 0]}) + X_out = gen.transform(X_test.copy()) + # 'a' gained unseen → still int8, unseen mapped to 0 + assert X_out["a"].dtype == np.int8 + assert list(X_out["a"]) == [0, 1, 0] + # 'b' no unseen → normal bool-encoded 0/1 + assert X_out["b"].dtype == np.int8 + assert set(X_out["b"].tolist()).issubset({0, 1}) + + def test_bool_col_without_unseen_values_still_bool_encoded(self): """When no new categories appear the bool-encoding path must still run normally.""" X_train = pd.DataFrame({"col": pd.Categorical(["yes", "no", "yes", "no"])}) gen = StringFixAsTypeFeatureGenerator() gen.fit_transform(X_train.copy()) - # Only known values at test time → normal bool encoding expected X_test = pd.DataFrame({"col": 
pd.Categorical(["yes", "no", "yes"])}) X_out = gen.transform(X_test.copy()) assert X_out["col"].isna().sum() == 0 - # Values should be 0/1 (int8 bool encoding), not strings assert set(X_out["col"].tolist()).issubset({0, 1}) + def test_bool_col_with_nan_at_test_time(self): + """NaN in a bool column with unseen values is imputed to 0, consistent + with how all int columns handle test-time NaN. + """ + X_train = pd.DataFrame({"b": [0, 1, 0, 1]}) + gen = self._fit_gen(X_train) + X_out = gen.transform(pd.DataFrame({"b": [0, 1, 2, None]})) + assert X_out["b"].iloc[3] == 0 + assert X_out["b"].dtype == np.int8 + def test_int_column_with_nan_at_test_time_imputed_to_zero(self): """Int features that were never NaN at train time must be imputed to 0 at test time.""" X_train = pd.DataFrame({"val": [1, 2, 3, 4, 5]}) @@ -805,7 +871,7 @@ def test_transform_output_columns_match_fit(self): assert list(X_out_train.columns) == list(X_out_test.columns) def test_max_n_output_features_constant(self): - assert StatisticalTextFeatureGenerator.MAX_N_OUTPUT_FEATURES == 384 + assert StatisticalTextFeatureGenerator.MAX_N_OUTPUT_FEATURES == 32 def test_output_columns_prefixed_with_source_column(self): gen = StatisticalTextFeatureGenerator() @@ -902,6 +968,183 @@ def test_transform_empty_df_raises_value_error(self): gen._transform(X) +class TestSemanticTextFeatureGeneratorCacheRoundTrip: + """End-to-end: fit_transform → save cache to parquet → clear → load cache → transform → compare.""" + + EMB_DIM = 32 + + @pytest.fixture(autouse=True) + def _clean_embedding_cache(self): + """Isolate the class-level cache for each test.""" + saved = dict(SemanticTextFeatureGenerator._embedding_look_up) + SemanticTextFeatureGenerator._embedding_look_up.clear() + yield + SemanticTextFeatureGenerator._embedding_look_up.clear() + SemanticTextFeatureGenerator._embedding_look_up.update(saved) + + @staticmethod + def _deterministic_embeddings(texts: list[str]) -> np.ndarray: + """Hash-based deterministic 32-dim 
embeddings.""" + import hashlib + + embs = [] + for t in texts: + seed = int(hashlib.md5(t.encode()).hexdigest(), 16) % (2**31) + rng = np.random.RandomState(seed) + emb = rng.randn(32).astype(np.float32) + emb /= np.linalg.norm(emb) + embs.append(emb) + return np.vstack(embs) + + def test_save_load_roundtrip_produces_identical_output(self, tmp_path, monkeypatch): + """Full pipeline: fit_transform, save cache, clear, load cache, transform, compare.""" + monkeypatch.setattr(TabArenaDefaultTextEncoder, "get_default_encoder", lambda: None) + monkeypatch.setattr( + TabArenaDefaultTextEncoder, + "encode_texts", + lambda *, texts, encoder_model: self._deterministic_embeddings(texts), + ) + + X = _make_text_df(n_rows=20) + gen = SemanticTextFeatureGenerator() + + # 1. fit_transform populates _embedding_look_up and produces output + X_out_fit, _type_map = gen._fit_transform(X) + assert not X_out_fit.empty + cache = dict(SemanticTextFeatureGenerator._embedding_look_up) + assert len(cache) > 0 + + # 2. Save cache to parquet + cache_path = tmp_path / "text_cache.parquet" + SemanticTextFeatureGenerator.save_embedding_cache(cache=cache, path=cache_path) + assert cache_path.exists() + + # 3. Clear class-level cache + SemanticTextFeatureGenerator._embedding_look_up.clear() + assert len(SemanticTextFeatureGenerator._embedding_look_up) == 0 + + # 4. Load cache from disk + loaded = SemanticTextFeatureGenerator.load_embedding_cache(cache_path) + SemanticTextFeatureGenerator._embedding_look_up.update(loaded) + assert set(loaded.keys()) == set(cache.keys()) + + # 5. Transform with loaded cache (no encoding happens) + X_out_cached = gen._transform(X) + + # 6. 
Output must be identical + pd.testing.assert_frame_equal(X_out_fit, X_out_cached) + + def test_loaded_embeddings_match_original_values(self, tmp_path, monkeypatch): + """Verify that individual embedding vectors survive the parquet round-trip.""" + monkeypatch.setattr(TabArenaDefaultTextEncoder, "get_default_encoder", lambda: None) + monkeypatch.setattr( + TabArenaDefaultTextEncoder, + "encode_texts", + lambda *, texts, encoder_model: self._deterministic_embeddings(texts), + ) + + X = _make_text_df(n_rows=20) + gen = SemanticTextFeatureGenerator() + gen._fit_transform(X) + + original_cache = {k: v.copy() for k, v in SemanticTextFeatureGenerator._embedding_look_up.items()} + + cache_path = tmp_path / "emb_cache.parquet" + SemanticTextFeatureGenerator.save_embedding_cache(cache=original_cache, path=cache_path) + loaded_cache = SemanticTextFeatureGenerator.load_embedding_cache(cache_path) + + for key in original_cache: + np.testing.assert_array_almost_equal(loaded_cache[key], original_cache[key], decimal=5) + + def test_cache_roundtrip_with_unseen_data_at_transform(self, tmp_path, monkeypatch): + """Load cache from one fit, then transform data that includes new unseen text values.""" + monkeypatch.setattr(TabArenaDefaultTextEncoder, "get_default_encoder", lambda: None) + monkeypatch.setattr( + TabArenaDefaultTextEncoder, + "encode_texts", + lambda *, texts, encoder_model: self._deterministic_embeddings(texts), + ) + + X_train = _make_text_df(n_rows=20) + gen = SemanticTextFeatureGenerator() + gen._fit_transform(X_train) + + cache_path = tmp_path / "partial_cache.parquet" + SemanticTextFeatureGenerator.save_embedding_cache( + cache=SemanticTextFeatureGenerator._embedding_look_up, path=cache_path + ) + + # Clear and reload + SemanticTextFeatureGenerator._embedding_look_up.clear() + loaded = SemanticTextFeatureGenerator.load_embedding_cache(cache_path) + SemanticTextFeatureGenerator._embedding_look_up.update(loaded) + + # Transform with data that has a mix of seen and 
unseen text + X_new = pd.DataFrame({"text": ["hello world", "brand new text", "foo bar baz", "never seen before"]}) + X_out = gen._transform(X_new) + + assert X_out.shape == (4, self.EMB_DIM) + assert not X_out.isnull().any().any() + + def test_full_pipeline_cache_roundtrip_with_e5_model(self, tmp_path, monkeypatch): + """End-to-end via TabArenaModelAgnosticPreprocessing with intfloat/e5-small-v2.""" + from sentence_transformers import SentenceTransformer + + # Monkey patch Small model for tests + fast_model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L3-v2", truncate_dim=4) + monkeypatch.setattr(TabArenaDefaultTextEncoder, "get_default_encoder", lambda: fast_model) + + X = pd.DataFrame( + { + "description": [ + f"This is a detailed text description for sample number {i} with unique content" for i in range(50) + ] + } + ) + + preprocessing = TabArenaModelAgnosticPreprocessing( + enable_sematic_text_features=True, + enable_new_datetime_features=False, + enable_text_special_features=False, + enable_statistical_text_features=False, + enable_text_ngram_features=False, + enable_datetime_features=False, + verbosity=0, + ) + + # 1. fit_transform through the full pipeline + X_out_fit = preprocessing.fit_transform(X=X) + assert not X_out_fit.empty + + cache = dict(SemanticTextFeatureGenerator._embedding_look_up) + assert len(cache) > 0 + + # 2. Save cache to parquet + cache_path = tmp_path / "pipeline_cache.parquet" + SemanticTextFeatureGenerator.save_embedding_cache(cache=cache, path=cache_path) + + # 3. Clear and reload cache from disk + SemanticTextFeatureGenerator._embedding_look_up.clear() + loaded = SemanticTextFeatureGenerator.load_embedding_cache(cache_path) + SemanticTextFeatureGenerator._embedding_look_up.update(loaded) + assert set(loaded.keys()) == set(cache.keys()) + + # 4. 
Transform same data with the loaded cache + preprocessing = TabArenaModelAgnosticPreprocessing( + enable_sematic_text_features=True, + enable_new_datetime_features=False, + enable_text_special_features=False, + enable_statistical_text_features=False, + enable_text_ngram_features=False, + enable_datetime_features=False, + verbosity=0, + ) + X_out_cached = preprocessing.fit_transform(X) + + # 5. Output must be identical + pd.testing.assert_frame_equal(X_out_fit, X_out_cached) + + # =========================================================================== # TextEmbeddingDimensionalityReductionFeatureGenerator # =========================================================================== @@ -1374,7 +1617,7 @@ def test_all_possible_aggs_generated_when_budget_large(self): def test_n_top_features_limits_selection(self): X, y = _make_grouped_df() gen = GroupAggregationFeatureGenerator(group_col="gid", n_top_features=3) - X_out, meta = gen._fit_transform(X.copy(), y) + _X_out, meta = gen._fit_transform(X.copy(), y) assert len(meta[GROUP_INDEX_FEATURES]) == 3 def test_highest_variance_feature_selected(self): diff --git a/tst/benchmark/task/test_user_task.py b/tst/benchmark/task/test_user_task.py index be2e02ec3..a3c57428b 100644 --- a/tst/benchmark/task/test_user_task.py +++ b/tst/benchmark/task/test_user_task.py @@ -2,6 +2,7 @@ import functools import operator +import pickle from pathlib import Path import numpy as np @@ -27,9 +28,7 @@ def _isolate_openml_cache(tmp_path_factory): Path(openml.config._root_cache_directory).mkdir(parents=True, exist_ok=True) -def _make_dataset( - problem_type: str, *, n: int = 10 -) -> tuple[pd.DataFrame, str, list[str] | None, list[bool]]: +def _make_dataset(problem_type: str, *, n: int = 10) -> tuple[pd.DataFrame, str, list[str] | None, list[bool]]: dataset = pd.DataFrame( { "num": np.arange(n, dtype="int64"), @@ -59,9 +58,7 @@ def test_user_task_as_openml_task(problem_type, expected_cls, tmp_path): """Test that UserTask can be converted to 
an OpenML task for local use. This does not test the splits, which are tested in another test. """ - df_original, target_feature, _class_labels, cat_indicator = _make_dataset( - problem_type, n=10 - ) + df_original, target_feature, _class_labels, cat_indicator = _make_dataset(problem_type, n=10) splits = {0: {0: (list(range(8)), [8, 9])}} ut = UserTask( @@ -76,9 +73,7 @@ def test_user_task_as_openml_task(problem_type, expected_cls, tmp_path): ) # Check Task Metadata - assert isinstance(oml_task, expected_cls), ( - f"Expected {expected_cls}, got {type(oml_task)}" - ) + assert isinstance(oml_task, expected_cls), f"Expected {expected_cls}, got {type(oml_task)}" if problem_type == "classification": assert oml_task.task_type_id == TaskType.SUPERVISED_CLASSIFICATION assert oml_task.class_labels == ["neg", "pos"] @@ -94,14 +89,13 @@ def test_user_task_as_openml_task(problem_type, expected_cls, tmp_path): assert isinstance(oml_dataset, openml.datasets.OpenMLDataset) assert oml_dataset.name == ut.get_dataset_name() assert oml_dataset.default_target_attribute == target_feature - assert oml_dataset.parquet_file == (ut._local_cache_path / "data.pq") - assert (ut._local_cache_path / "data.pq").exists() + assert oml_dataset.data_pickle_file == (ut._local_cache_path / "data.pkl.py3") + assert oml_dataset.cache_format == "pickle" + assert (ut._local_cache_path / "data.pkl.py3").exists() assert oml_dataset.data_file == "ignored" # Check Dataset State - X, y, categorical_indicator, attribute_names = oml_dataset.get_data( - target=oml_task.target_name - ) + X, y, categorical_indicator, attribute_names = oml_dataset.get_data(target=oml_task.target_name) assert categorical_indicator == cat_indicator expected_a_names = list(df_original.columns) expected_a_names.remove(target_feature) @@ -118,13 +112,7 @@ def test_user_task_as_openml_task(problem_type, expected_cls, tmp_path): expected_split = OpenMLSplit( name="User-Splits", description="User-defined splits for a custom task.", - 
split={ - r: { - f: {0: (np.array(tr), np.array(te))} - for f, (tr, te) in splits[r].items() - } - for r in splits - }, + split={r: {f: {0: (np.array(tr), np.array(te))} for f, (tr, te) in splits[r].items()} for r in splits}, ) assert oml_task.split == expected_split @@ -352,9 +340,7 @@ def test_save_load_round_trip_classification(tmp_path): df, target, _, _ = _make_dataset("classification", n=10) splits = {0: {0: (list(range(8)), [8, 9])}} ut = UserTask(task_name="save-load-clf", task_cache_path=tmp_path) - task = ut.create_local_openml_task( - dataset=df, target_feature=target, problem_type="classification", splits=splits - ) + task = ut.create_local_openml_task(dataset=df, target_feature=target, problem_type="classification", splits=splits) ut.save_local_openml_task(task) assert ut.openml_task_path.exists() @@ -370,9 +356,7 @@ def test_save_load_round_trip_regression(tmp_path): df, target, _, _ = _make_dataset("regression", n=10) splits = {0: {0: (list(range(8)), [8, 9])}} ut = UserTask(task_name="save-load-reg", task_cache_path=tmp_path) - task = ut.create_local_openml_task( - dataset=df, target_feature=target, problem_type="regression", splits=splits - ) + task = ut.create_local_openml_task(dataset=df, target_feature=target, problem_type="regression", splits=splits) ut.save_local_openml_task(task) loaded = ut.load_local_openml_task() assert loaded.task_id == ut.task_id @@ -403,9 +387,7 @@ def test_create_local_openml_task_multi_repeat_multi_fold(tmp_path): }, } ut = UserTask(task_name="multi-fold", task_cache_path=tmp_path) - task = ut.create_local_openml_task( - dataset=df, target_feature=target, problem_type="classification", splits=splits - ) + task = ut.create_local_openml_task(dataset=df, target_feature=target, problem_type="classification", splits=splits) # Two repeats, two folds each assert set(task.split.split.keys()) == {0, 1} assert set(task.split.split[0].keys()) == {0, 1} @@ -511,9 +493,7 @@ def test_task_metadata_from_row_missing_field_raises(): 
meta = _make_task_metadata() df = meta.to_dataframe() row = df.iloc[0].drop("dataset_name") - with pytest.raises( - ValueError, match="missing required TabArenaTaskMetadata fields" - ): + with pytest.raises(ValueError, match="missing required TabArenaTaskMetadata fields"): TabArenaTaskMetadata.from_row(row) @@ -531,6 +511,70 @@ def test_task_metadata_unroll_splits(): assert unrolled[0].dataset_name == meta.dataset_name +def test_task_metadata_from_row_backward_compat_missing_optional_fields(): + """Old CSVs without the new dtype flag columns should load without error.""" + meta = _make_task_metadata() + df = meta.to_dataframe() + row = df.iloc[0] + # Simulate an old CSV that doesn't have the new columns + row = row.drop(["has_datetime", "has_text", "has_categorical", "has_numeric"]) + reconstructed = TabArenaTaskMetadata.from_row(row) + assert reconstructed.dataset_name == meta.dataset_name + # New fields default to None when absent + assert reconstructed.has_datetime is None + assert reconstructed.has_text is None + assert reconstructed.has_categorical is None + assert reconstructed.has_numeric is None + + +def test_task_metadata_dtype_flags_default_to_none(): + """New dtype flag fields default to None when not passed explicitly.""" + meta = _make_task_metadata() + assert meta.has_datetime is None + assert meta.has_text is None + assert meta.has_categorical is None + assert meta.has_numeric is None + + +def test_task_metadata_dtype_flags_round_trip(): + """Dtype flags survive a to_dataframe / from_row round trip.""" + s = _make_split_metadata() + meta = TabArenaTaskMetadata( + dataset_name="test", + problem_type="binary", + is_classification=True, + target_name="target", + eval_metric="roc_auc", + splits_metadata={s.split_index: s}, + split_time_horizon=None, + split_time_horizon_unit=None, + stratify_on=None, + time_on=None, + group_on=None, + group_time_on=None, + group_labels=None, + multiclass_min_n_classes_over_splits=2, + 
multiclass_max_n_classes_over_splits=2, + class_consistency_over_splits=True, + num_instances=10, + num_features=2, + num_classes=2, + num_instance_groups=10, + tabarena_task_name="test", + task_id_str=None, + has_datetime=False, + has_text=True, + has_categorical=True, + has_numeric=False, + ) + df = meta.to_dataframe() + reconstructed = TabArenaTaskMetadata.from_row(df.iloc[0]) + assert reconstructed.has_datetime is False + assert reconstructed.has_text is True + assert reconstructed.has_categorical is True + assert reconstructed.has_numeric is False + + # --------------------------------------------------------------------------- # from_sklearn_splits_to_user_task_splits # --------------------------------------------------------------------------- @@ -569,9 +613,7 @@ def test_from_sklearn_splits_multiple_repeats(): def test_get_num_instance_groups_no_group(): X = pd.DataFrame({"a": [1, 2, 3]}) - n = TabArenaTaskMetadataMixin.get_num_instance_groups( - X=X, group_on=None, group_labels=None - ) + n = TabArenaTaskMetadataMixin.get_num_instance_groups(X=X, group_on=None, group_labels=None) assert n == 3 @@ -586,9 +628,7 @@ def test_get_num_instance_groups_per_sample_label(): def test_get_num_instance_groups_per_group_label(): X = pd.DataFrame({"a": [1, 2, 3, 4], "group": ["x", "x", "y", "y"]}) - n = TabArenaTaskMetadataMixin.get_num_instance_groups( - X=X, group_on="group", group_labels=GroupLabelTypes.PER_GROUP - ) + n = TabArenaTaskMetadataMixin.get_num_instance_groups(X=X, group_on="group", group_labels=GroupLabelTypes.PER_GROUP) assert n == 2 @@ -609,9 +649,7 @@ def test_get_num_instance_groups_multi_column_group(): def _make_multiclass_dataset(n_per_class: int = 4) -> tuple[pd.DataFrame, str]: """Create a 3-class classification dataset with n_per_class samples per class.""" n = n_per_class * 3 - labels = ( - (["cls0"] * n_per_class) + (["cls1"] * n_per_class) + (["cls2"] * n_per_class) - ) + labels = (["cls0"] * n_per_class) + (["cls1"] * n_per_class) + 
(["cls2"] * n_per_class) df = pd.DataFrame( { "num": np.arange(n, dtype="int64"), @@ -625,9 +663,7 @@ def _make_multiclass_dataset(n_per_class: int = 4) -> tuple[pd.DataFrame, str]: def _make_4class_dataset(n_per_class: int = 3) -> tuple[pd.DataFrame, str]: """Create a 4-class dataset where fold splits can yield different class counts.""" n = n_per_class * 4 - labels = functools.reduce( - operator.iadd, ([f"cls{c}"] * n_per_class for c in range(4)), [] - ) + labels = functools.reduce(operator.iadd, ([f"cls{c}"] * n_per_class for c in range(4)), []) df = pd.DataFrame( { "num": np.arange(n, dtype="int64"), @@ -666,9 +702,7 @@ def _task_from_user_task( def test_get_dataset_stats_regression_basic(tmp_path): """Regression: num_classes=-1, num_features excludes target, num_instance_groups==len.""" df, target, _, _ = _make_dataset("regression", n=10) - task, _ = _task_from_user_task( - df, target, "regression", {0: {0: (list(range(8)), [8, 9])}}, tmp_path, "ds-reg" - ) + task, _ = _task_from_user_task(df, target, "regression", {0: {0: (list(range(8)), [8, 9])}}, tmp_path, "ds-reg") n_inst, n_feat, n_cls, n_groups = task._get_dataset_stats( oml_dataset=df, is_classification=False, target_name=target ) @@ -689,9 +723,7 @@ def test_get_dataset_stats_classification_class_count(tmp_path): tmp_path, "ds-clf", ) - _, _, n_cls, _ = task._get_dataset_stats( - oml_dataset=df, is_classification=True, target_name=target - ) + _, _, n_cls, _ = task._get_dataset_stats(oml_dataset=df, is_classification=True, target_name=target) assert n_cls == 2 @@ -715,9 +747,7 @@ def test_get_dataset_stats_num_features_excludes_target(tmp_path): tmp_path, "ds-5col", ) - _, n_feat, _, _ = task._get_dataset_stats( - oml_dataset=df, is_classification=True, target_name="target" - ) + _, n_feat, _, _ = task._get_dataset_stats(oml_dataset=df, is_classification=True, target_name="target") assert n_feat == 4 @@ -735,13 +765,10 @@ def test_get_dataset_stats_slice_reports_subset_class_count(tmp_path): tmp_path, 
"ds-slice", ) - _, _, n_cls, _ = task._get_dataset_stats( - oml_dataset=subset_one_class, is_classification=True, target_name=target - ) + _, _, n_cls, _ = task._get_dataset_stats(oml_dataset=subset_one_class, is_classification=True, target_name=target) assert n_cls == 1 - # --------------------------------------------------------------------------- # compute_metadata — regression # --------------------------------------------------------------------------- @@ -749,12 +776,8 @@ def test_get_dataset_stats_slice_reports_subset_class_count(tmp_path): def test_compute_metadata_regression(tmp_path): df, target, _, _ = _make_dataset("regression", n=10) - task, ut = _task_from_user_task( - df, target, "regression", {0: {0: (list(range(8)), [8, 9])}}, tmp_path, "cm-reg" - ) - meta = task.compute_metadata( - tabarena_task_name="my-task", task_id_str=ut.task_id_str - ) + task, ut = _task_from_user_task(df, target, "regression", {0: {0: (list(range(8)), [8, 9])}}, tmp_path, "cm-reg") + meta = task.compute_metadata(tabarena_task_name="my-task", task_id_str=ut.task_id_str) assert meta.problem_type == "regression" assert meta.is_classification is False @@ -832,6 +855,50 @@ def test_compute_metadata_binary_dataset_level_stats(tmp_path): assert meta.num_instance_groups == 10 # no group_on +def test_compute_metadata_dtype_flags(tmp_path): + """_make_dataset produces int64 'num' and category 'cat' feature columns.""" + df, target, _, _ = _make_dataset("classification", n=10) + task, _ = _task_from_user_task( + df, + target, + "classification", + {0: {0: (list(range(8)), [8, 9])}}, + tmp_path, + "cm-dtype-flags", + ) + meta = task.compute_metadata() + + assert meta.has_numeric is True + assert meta.has_categorical is True + assert meta.has_datetime is False + assert meta.has_text is False + + +def test_compute_metadata_dtype_flags_with_text_and_datetime(tmp_path): + df = pd.DataFrame( + { + "num": np.arange(10, dtype="float64"), + "txt": pd.array(["hello"] * 10, dtype="string"), + 
"dt": pd.date_range("2020-01-01", periods=10, freq="D"), + "target": [0, 1] * 5, + } + ) + task, _ = _task_from_user_task( + df, + "target", + "classification", + {0: {0: (list(range(8)), [8, 9])}}, + tmp_path, + "cm-dtype-all", + ) + meta = task.compute_metadata() + + assert meta.has_numeric is True + assert meta.has_text is True + assert meta.has_datetime is True + assert meta.has_categorical is False + + # --------------------------------------------------------------------------- # compute_metadata — multiclass classification # --------------------------------------------------------------------------- @@ -876,22 +943,13 @@ def test_compute_metadata_class_consistency_false(tmp_path): """ df, target = _make_4class_dataset(n_per_class=8) # 32 samples # Repeat 1 train/test: alternate halves of each 8-sample class block. - r1_train = ( - list(range(4)) + list(range(8, 12)) + list(range(16, 20)) + list(range(24, 28)) - ) - r1_test = ( - list(range(4, 8)) - + list(range(12, 16)) - + list(range(20, 24)) - + list(range(28, 32)) - ) + r1_train = list(range(4)) + list(range(8, 12)) + list(range(16, 20)) + list(range(24, 28)) + r1_test = list(range(4, 8)) + list(range(12, 16)) + list(range(20, 24)) + list(range(28, 32)) splits = { 0: {0: (list(range(24)), list(range(24, 32)))}, 1: {0: (r1_train, r1_test)}, } - task, _ = _task_from_user_task( - df, target, "classification", splits, tmp_path, "cm-inconsistent" - ) + task, _ = _task_from_user_task(df, target, "classification", splits, tmp_path, "cm-inconsistent") meta = task.compute_metadata() assert meta.problem_type == "multiclass" @@ -913,9 +971,7 @@ def test_compute_metadata_multi_fold_split_indices(tmp_path): 1: (list(range(5, 20)), list(range(5))), } } - task, _ = _task_from_user_task( - df, target, "classification", splits, tmp_path, "cm-mf" - ) + task, _ = _task_from_user_task(df, target, "classification", splits, tmp_path, "cm-mf") meta = task.compute_metadata() assert meta.n_splits == 2 @@ -930,9 +986,7 @@ def 
test_compute_metadata_multi_fold_per_split_counts(tmp_path): 1: (list(range(5, 20)), list(range(5))), } } - task, _ = _task_from_user_task( - df, target, "classification", splits, tmp_path, "cm-mf-cnt" - ) + task, _ = _task_from_user_task(df, target, "classification", splits, tmp_path, "cm-mf-cnt") meta = task.compute_metadata() s0 = meta.splits_metadata["r0f0"] @@ -949,9 +1003,7 @@ def test_compute_metadata_multi_repeat_split_indices(tmp_path): 0: {0: (list(range(15)), list(range(15, 20)))}, 1: {0: (list(range(5, 20)), list(range(5)))}, } - task, _ = _task_from_user_task( - df, target, "classification", splits, tmp_path, "cm-mr" - ) + task, _ = _task_from_user_task(df, target, "classification", splits, tmp_path, "cm-mr") meta = task.compute_metadata() assert meta.n_splits == 2 @@ -966,9 +1018,7 @@ def test_compute_metadata_multi_repeat_split_indices(tmp_path): def test_compute_metadata_optional_fields_default_none(tmp_path): df, target, _, _ = _make_dataset("regression", n=10) - task, _ = _task_from_user_task( - df, target, "regression", {0: {0: (list(range(8)), [8, 9])}}, tmp_path, "cm-opt" - ) + task, _ = _task_from_user_task(df, target, "regression", {0: {0: (list(range(8)), [8, 9])}}, tmp_path, "cm-opt") meta = task.compute_metadata() assert meta.tabarena_task_name is None @@ -1046,17 +1096,14 @@ def test_compute_metadata_split_time_horizon_passthrough(tmp_path): pd.arrays.IntervalArray.from_breaks(range(11)), pd.IntervalDtype(subtype="int64", closed="right"), ), - # Note: complex128 is also unsupported by liac-arff, but pyarrow (parquet) - # cannot serialize it either, so it fails at a later stage and is excluded here. + # Note: complex128 is also unsupported by liac-arff and excluded here. 
], ids=["datetime64", "timedelta64", "period", "interval"], ) -def test_create_local_openml_task_unsupported_arff_dtype_does_not_raise( - col_name, col_values, dtype, tmp_path -): +def test_create_local_openml_task_unsupported_arff_dtype_does_not_raise(col_name, col_values, dtype, tmp_path): """Columns with dtypes unsupported by liac-arff (datetime64, timedelta64, complex) must not prevent task creation — they are cast to string only for ARFF attribute - inference and do not affect the data persisted to parquet. + inference and do not affect the data persisted to pickle. """ n = 10 df = pd.DataFrame( @@ -1071,11 +1118,53 @@ def test_create_local_openml_task_unsupported_arff_dtype_does_not_raise( splits = {0: {0: (list(range(8)), [8, 9])}} ut = UserTask(task_name=f"unsupported-dtype-{col_name}", task_cache_path=tmp_path) # Must not raise - ut.create_local_openml_task( - dataset=df, target_feature="target", problem_type="regression", splits=splits - ) + ut.create_local_openml_task(dataset=df, target_feature="target", problem_type="regression", splits=splits) - # The parquet file must store the original dtype — the workaround must not + # The pickle file must store the original dtype — the workaround must not # modify the persisted data. 
- stored = pd.read_parquet(ut._local_cache_path / "data.pq") + with (ut._local_cache_path / "data.pkl.py3").open("rb") as fh: + stored, _, _ = pickle.load(fh) assert stored[col_name].dtype == df[col_name].dtype + + +@pytest.mark.parametrize( + ("cat_values", "cat_dtype", "test_id"), + [ + ( + pd.Categorical([0, 1, 2, 1, 0, 2, 1, 0, 2, 1]), + "int64", + "int_categories", + ), + ( + pd.Categorical([1.5, 2.5, 1.5, 2.5, 1.5, 2.5, 1.5, 2.5, 1.5, 2.5]), + "float64", + "float_categories", + ), + ], + ids=lambda x: x if isinstance(x, str) and "_categories" in x else "", +) +def test_create_local_openml_task_non_string_categorical_does_not_raise(cat_values, cat_dtype, test_id, tmp_path): + """Categorical columns whose categories have a non-string dtype (e.g. int, float) + must not break ARFF attribute inference — categories are cast to string only for + metadata and do not affect the data persisted to pickle. + """ + n = 10 + df = pd.DataFrame( + { + "num": np.arange(n, dtype="int64"), + "cat_col": cat_values, + "target": np.linspace(0.0, 1.0, num=n), + } + ) + assert df["cat_col"].dtype.name == "category" + assert df["cat_col"].cat.categories.dtype == cat_dtype + + splits = {0: {0: (list(range(8)), [8, 9])}} + ut = UserTask(task_name=f"non-str-cat-{test_id}", task_cache_path=tmp_path) + # Must not raise + ut.create_local_openml_task(dataset=df, target_feature="target", problem_type="regression", splits=splits) + + # The pickle file must store the original data values unchanged. + with (ut._local_cache_path / "data.pkl.py3").open("rb") as fh: + stored, _, _ = pickle.load(fh) + pd.testing.assert_series_equal(stored["cat_col"].astype(str), df["cat_col"].astype(str), check_names=True)