diff --git a/experimental/feature_selection_benchmark/extra_benchmark/feature_selection_benchmark_runner.py b/experimental/feature_selection_benchmark/extra_benchmark/feature_selection_benchmark_runner.py index 2be7adc78..362e11e25 100644 --- a/experimental/feature_selection_benchmark/extra_benchmark/feature_selection_benchmark_runner.py +++ b/experimental/feature_selection_benchmark/extra_benchmark/feature_selection_benchmark_runner.py @@ -1,8 +1,21 @@ """Shared infrastructure and entry point for feature selection benchmark evaluation. Usage: - python fs_benchmark_runner.py --mode validity --seed 42 - python fs_benchmark_runner.py --mode stability --seed 42 + python feature_selection_benchmark_runner.py \ + --mode validity \ + --method_name FSBench__RandomFeatureSelector__5__0__lgbm__3600 \ + --data_foundry_task_id "UserTask|1386903908|anneal/019d3f7b-494a-71fa-8eb2-25d01dfb7792|/work/dlclarge1/purucker-fs_benchmark/.openml/tabarena_tasks" \ + --repeat 0 \ + --noise 1.0 \ + --noise_type gaussian + + python feature_selection_benchmark_runner.py \ + --mode stability \ + --method_name FSBench__RandomFeatureSelector__5__0__lgbm__3600 \ + --data_foundry_task_id "UserTask|1386903908|anneal/019d3f7b-494a-71fa-8eb2-25d01dfb7792|/work/dlclarge1/purucker-fs_benchmark/.openml/tabarena_tasks" \ + --repeat 0 \ + --noise 1.0 \ + --noise_type gaussian """ from __future__ import annotations @@ -10,18 +23,17 @@ import argparse import time from dataclasses import dataclass -from typing import TYPE_CHECKING, Any +from pathlib import Path +from typing import Any import numpy as np +import pandas as pd from tabarena.benchmark.feature_selection_methods.feature_selection_benchmark_utils import ( selector_and_config_from_string, ) from tabarena.benchmark.task.openml import OpenMLTaskWrapper from tabflow_slurm.run_tabarena_experiment import _parse_task_id -if TYPE_CHECKING: - import pandas as pd - @dataclass class FeatureSelectionResult: @@ -36,6 +48,9 @@ class FeatureSelectionResult: 
repeat: Repeat number for the FS metric. selected_features: Names of selected features from the original_features. + num_classes: Number of classes in the target variable (for classification tasks). + num_samples: Number of samples in the dataset. + elapsed_time_fs: Runtime measurement (seconds). mode: Evaluation mode ("validity" or "stability"). @@ -50,6 +65,9 @@ class FeatureSelectionResult: repeat: int selected_features: list[int] + num_classes: int + num_samples: int + elapsed_time_fs: float mode: str @@ -62,11 +80,11 @@ def _augment_dataset( original_features = list(X.columns) if mode == "validity": - from validity_fs_metric import get_dataset_for_validity + from validity_fs_metric import get_dataset_for_validity # noqa: PLC0415 X = get_dataset_for_validity(X=X, rng=rng, **kwargs) elif mode == "stability": - from stability_fs_metric import get_dataset_for_stability + from stability_fs_metric import get_dataset_for_stability # noqa: PLC0415 X, y = get_dataset_for_stability(X=X, y=y, rng=rng, **kwargs) else: @@ -79,7 +97,7 @@ def _augment_dataset( return X, y, original_features -def run_benchmark( +def run_benchmark( # noqa: D417 *, data_foundry_task_id: str, mode: str, @@ -109,7 +127,7 @@ def run_benchmark( feature_selector, config = selector_and_config_from_string(preprocessing_name=method_name) # Augment dataset with new feature based on mode. 
- X, y, original_features = _augment_dataset(mode=mode, X=X, rng=rng, **kwargs) + X, y, original_features = _augment_dataset(mode=mode, X=X, y=y, rng=rng, **kwargs) # Run Feature Selection start_time = time.monotonic() @@ -135,6 +153,13 @@ def run_benchmark( mode_kwargs=kwargs, ) +def get_cache_path(args) -> Path: + """Generate the cache path based on arguments.""" + cache_dir = Path(__file__).parent / "results" + cache_dir.mkdir(parents=True, exist_ok=True) # Ensure the directory exists + cache_path = cache_dir / f"{args.mode}_{args.method_name}_{args.data_foundry_task_id.split('|')[3].split('/')[0]}_{args.repeat}.csv" + return cache_path + def parse_args() -> argparse.Namespace: """Parse CLI arguments for the FS benchmark runner.""" @@ -149,14 +174,15 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--method_name", type=str, - default="FSBench__AccuracyFeatureSelector__5__0__lgbm__3600", + default="FSBench__RandomFeatureSelector__5__0__lgbm__3600", - help="Feature Selection Method name [default: FSBench__AccuracyFeatureSelector__5__0__lgbm__3600]", + help="Feature Selection Method name [default: FSBench__RandomFeatureSelector__5__0__lgbm__3600]", ) parser.add_argument( "--data_foundry_task_id", type=str, - default="anneal/019d3f7b-494a-71fa-8eb2-25d01dfb7792", - help="TabArena/OpenML task identifier [default: anneal/019d3f7b-494a-71fa-8eb2-25d01dfb7792]", + default="UserTask|1386903908|anneal/019d3f7b-494a-71fa-8eb2-25d01dfb7792|/Users/schaefer.bastian/.openml/tabarena_tasks", + help="TabArena/OpenML task metadata identifier [default: UserTask|1386903908|anneal/019d3f7b-494a-71fa-8eb2-25d" + "01dfb7792|/Users/schaefer.bastian/.openml/tabarena_tasks]", ) parser.add_argument("--repeat", type=int, default=0, help="Repeat [default: 0]") @@ -165,27 +191,41 @@ def parse_args() -> argparse.Namespace: "--noise", type=float, default=1.0, - nargs="+", help="Noise features relative to original count (validity mode only) [default: 1.0]", ) parser.add_argument( - "--nose_type", + "--noise_type", type=str, choices=["gaussian", "uniform"], default="gaussian", 
- help="Type of noise features to add (validity mode only) [default: random]", + help="Type of noise features to add (validity mode only) [default: gaussian]", ) + parser.add_argument( + "--ignore_cache", + action="store_true", + default=False, + help="Whether to ignore existing cache and rerun the benchmark (useful for debugging) [default: False]", + ) return parser.parse_args() if __name__ == "__main__": args = parse_args() - run_benchmark( - data_foundry_task_id=args.data_foundry_task_id, - mode=args.mode, - method_name=args.method_name, - repeat=args.repeat, - noise=args.noise, - nose_type=args.nose_type, - ) \ No newline at end of file + cache_path = get_cache_path(args) + + if cache_path.exists() and not args.ignore_cache: + print(f"Cache exists at {cache_path}. Skipping operation.") + else: + result = run_benchmark( + data_foundry_task_id=args.data_foundry_task_id, + mode=args.mode, + method_name=args.method_name, + repeat=args.repeat, + noise=args.noise, + noise_type=args.noise_type, + ) + + print(result) + result = pd.DataFrame([result.__dict__]) + result.to_csv(cache_path, index=False) diff --git a/experimental/feature_selection_benchmark/extra_benchmark/run_setup_slurm_jobs_extra_benchmark.py b/experimental/feature_selection_benchmark/extra_benchmark/run_setup_slurm_jobs_extra_benchmark.py new file mode 100644 index 000000000..1e690aeea --- /dev/null +++ b/experimental/feature_selection_benchmark/extra_benchmark/run_setup_slurm_jobs_extra_benchmark.py @@ -0,0 +1,81 @@ +from pathlib import Path + +import pandas as pd +import submitit +from experimental.feature_selection_benchmark.tabarena_setup.fr_cluster_setup import ( + ALL_TASK_METADATA, + FS_TIME_LIMIT, + FSBenchmarkConfig, +) + + +def run_extra_pipeline(mode, method_name, task_id, noise, noise_type): + """Function to run the extra pipeline.""" + print( + f"Running extra pipeline with mode={mode}, method_name={method_name}, task_id={task_id}, noise={noise}, noise_type={noise_type}") + # Add your pipeline logic here + + +if __name__ == "__main__": + method_names = 
FSBenchmarkConfig().get_default_preprocessing_configs( + fs_methods=[ + "AccuracyFeatureSelector", + "RandomFeatureSelector", + "ANOVAFeatureSelector", + "CFSFeatureSelector", + "Chi2FeatureSelector", + "DISRFeatureSelector", + "GainRatioFeatureSelector", + "GiniFeatureSelector", + "ImpurityFeatureSelector", + "InformationGainFeatureSelector", + "INTERACTFeatureSelector", + "MarkovBlanketFeatureSelector", + "MIFeatureSelector", + "mRMRFeatureSelector", + "PearsonCorrelationFeatureSelector", + "ReliefFFeatureSelector", + "RFImportanceFeatureSelector", + "SequentialBackwardEliminationFeatureSelector", + "SequentialForwardSelectionFeatureSelector", + "SymmetricalUncertaintyFeatureSelector", + "LassoFeatureSelector", # just for regression but with label encoder for classification? + "LaplacianScoreFeatureSelector", # OOM, Segmentation fault issues + "ConsistencyFeatureSelector", + # selected_indices = np.where(S)[0].tolist(), UnboundLocalError: cannot access local variable 'S' where it is not associated with a value + "JMIFeatureSelector", + # time limit computed incorrectly, and error at remaining.remove(best_idx), ValueError: list.remove(x): x not in list + "OneRFeatureSelector", + # major OOM errors (tries to allocate one major array), wrong time limit computation, max(accuracies, key=accuracies.get) -> max() iterable argument is empty + "ElasticNetFeatureSelector", # Only for classification + "CMIMFeatureSelector", # problems with time limit and fallback of features + # "tTestFeatureSelector", # Does not work for regression + "CARTFeatureSelector", # Only implemented for classification, OOM problems as well + ] + ) + task_ids = pd.read_csv(ALL_TASK_METADATA)["task_id_str"].tolist() + # Define the parameter grid + modes = ["validity", "stability"] + noises = [0.5, 0.75, 1.0] + noise_types = ["gaussian"] + + # Create a SLURM executor + executor = submitit.AutoExecutor(folder=Path("slurm_logs")) + executor.update_parameters( + timeout_min=FS_TIME_LIMIT, # Job timeout 
in minutes + slurm_partition="default", # SLURM partition + cpus_per_task=8, # Number of CPUs per task + mem_gb=32, # Memory in GB + ) + + # Submit jobs + with executor.batch(): + for mode in modes: + for method_name in method_names: + for task_id in task_ids: + if mode == "validity": + for noise in noises: + for noise_type in noise_types: + executor.submit(run_extra_pipeline, mode, method_name, task_id, noise, noise_type) + else: + executor.submit(run_extra_pipeline, mode, method_name, task_id, None, None) diff --git a/tabarena/tabarena/benchmark/experiment/experiment_runner.py b/tabarena/tabarena/benchmark/experiment/experiment_runner.py index be473f908..75efe3430 100644 --- a/tabarena/tabarena/benchmark/experiment/experiment_runner.py +++ b/tabarena/tabarena/benchmark/experiment/experiment_runner.py @@ -82,11 +82,19 @@ def __init__( self.eval_metric: Scorer = get_metric(metric=self.eval_metric_name, problem_type=self.task.problem_type) self.model: AbstractExecModel | None = None self.task_split_idx = self.task.get_split_idx(fold=self.fold, repeat=self.repeat, sample=self.sample) - self.X, self.y, self.X_test, self.y_test = self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample) + + if self.task.lazy_load_data: + assert input_format == "openml", "Lazy load data only works with input_format='openml'" + self.X, self.y, self.X_test, self.y_test = None, None, None, None + _, y, _, _ = self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample) + else: + self.X, self.y, self.X_test, self.y_test = self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample) + y = self.y + if input_format == "csv": self.X = self.task.to_csv_format(X=self.X) self.X_test = self.task.to_csv_format(X=self.X_test) - self.label_cleaner = LabelCleaner.construct(problem_type=self.task.problem_type, y=self.y) + self.label_cleaner = LabelCleaner.construct(problem_type=self.task.problem_type, y=y) if cacher 
is None: cacher = CacheFunctionDummy() self.cacher = cacher @@ -105,8 +113,19 @@ def split_seed(self): """We use the split index as a source for a seed that creates different randomness per split.""" return self.task_split_idx + def _lazy_load_for_run_model_fit(self): + X, y, X_test, _ = self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample) + return X, y, X_test + + def run_model_fit(self) -> dict: - return self.model.fit_custom(X=self.X, y=self.y, X_test=self.X_test, split_seed=self.split_seed) + if self.task.lazy_load_data: + lazy_load_function = self._lazy_load_for_run_model_fit + X, y, X_test = None, None, None + else: + lazy_load_function = None + X, y, X_test = self.X, self.y, self.X_test + return self.model.fit_custom(X=X, y=y, X_test=X_test, split_seed=self.split_seed, lazy_load_function=lazy_load_function) def run(self) -> dict: out = self._run() @@ -157,11 +176,19 @@ def _run(self) -> dict: self.handle_failure(exc=exc) raise out = self.post_fit(out=out) + + if self.task.lazy_load_data: + _, _, _, y_test = self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample) + else: + y_test = self.y_test out["metric_error"] = self.evaluate( - y_true=self.y_test, + y_true=y_test, y_pred=out["predictions"], y_pred_proba=out["probabilities"], ) + if self.task.lazy_load_data: + del y_test + out = self.post_evaluate(out=out) out["experiment_metadata"] = self._experiment_metadata(time_start=time_start, time_start_str=time_start_str) out = self.convert_to_output(out=out) @@ -269,7 +296,12 @@ def post_evaluate(self, out: dict) -> dict: simulation_artifact["pred_proba_dict_test"] = self.label_cleaner.transform_proba(out["probabilities"], as_pandas=True) if self.task.problem_type == "binary": simulation_artifact["pred_proba_dict_test"] = simulation_artifact["pred_proba_dict_test"].iloc[:, 1] - simulation_artifact["y_test"] = self.label_cleaner.transform(self.y_test) + + if self.task.lazy_load_data: + _, _, _, y_test 
= self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample) + else: + y_test = self.y_test + simulation_artifact["y_test"] = self.label_cleaner.transform(y_test) if self.optimize_simulation_artifacts_memory: # optimize memory @@ -296,7 +328,12 @@ def post_evaluate(self, out: dict) -> dict: simulation_artifact["metric"] = self.eval_metric_name if self.compute_bag_info and (self.model.can_get_per_child_oof and self.model.can_get_per_child_val_idx): - simulation_artifact["bag_info"] = self.model.bag_artifact(X_test=self.X_test) + if self.task.lazy_load_data: + _, _, X_test, _ = self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample) + else: + X_test = self.X_test + + simulation_artifact["bag_info"] = self.model.bag_artifact(X_test=X_test) simulation_artifact["pred_proba_dict_val"] = {self.method: simulation_artifact["pred_proba_dict_val"]} diff --git a/tabarena/tabarena/benchmark/experiment/experiment_runner_api.py b/tabarena/tabarena/benchmark/experiment/experiment_runner_api.py index 216f9e3a6..2b7055182 100644 --- a/tabarena/tabarena/benchmark/experiment/experiment_runner_api.py +++ b/tabarena/tabarena/benchmark/experiment/experiment_runner_api.py @@ -85,16 +85,12 @@ def _parse_repetitions_mode_and_args( metadata_task_ids = tasks_metadata["task_id"].astype(str).tolist() for task in tasks: t_id = task.task_id_str if isinstance(task, UserTask) else str(task) - assert t_id in metadata_task_ids, ( - f"Task ID '{t_id}' from `tasks` not found in `tasks_metadata`" - ) + assert t_id in metadata_task_ids, f"Task ID '{t_id}' from `tasks` not found in `tasks_metadata`" task_meta = tasks_metadata[metadata_task_ids == t_id].iloc[0] n_folds = int(task_meta["num_folds"]) n_repeats = int(task_meta["tabarena_num_repeats"]) - fold_repeat_pairs = [ - (f, r) for r in range(n_repeats) for f in range(n_folds) - ] + fold_repeat_pairs = [(f, r) for r in range(n_repeats) for f in range(n_folds)] 
fold_repeat_pairs_per_task.append(fold_repeat_pairs) return fold_repeat_pairs_per_task @@ -113,17 +109,12 @@ def _parse_repetitions_mode_and_args( assert all(isinstance(rep, tuple) for rep in repetitions_mode_args), ( "If `repetitions_mode_args` for 'matrix' is a list, all elements must be tuples" ) - repetitions_mode_args = [ - _clean_repetitions_mode_args_for_matrix(rep) - for rep in repetitions_mode_args - ] + repetitions_mode_args = [_clean_repetitions_mode_args_for_matrix(rep) for rep in repetitions_mode_args] else: assert isinstance(repetitions_mode_args, tuple), ( "If `repetitions_mode_args` for 'matrix' is not a list, it must be a tuple" ) - repetitions_mode_args = [ - _clean_repetitions_mode_args_for_matrix(repetitions_mode_args) - ] * len(tasks) + repetitions_mode_args = [_clean_repetitions_mode_args_for_matrix(repetitions_mode_args)] * len(tasks) return [[(f, r) for f in e[0] for r in e[1]] for e in repetitions_mode_args] if repetitions_mode == "individual": @@ -133,15 +124,11 @@ def _parse_repetitions_mode_and_args( assert isinstance(repetitions_mode_args, list), ( "If `repetitions_mode` is 'individual', `repetitions_mode_args` must be a list" ) - assert len(repetitions_mode_args) > 0, ( - "`repetitions_mode_args` for 'individual' must not be empty" - ) + assert len(repetitions_mode_args) > 0, "`repetitions_mode_args` for 'individual' must not be empty" if isinstance(repetitions_mode_args[0], tuple): assert all( - isinstance(rep, tuple) - and (len(rep) == 2) - and all(isinstance(i, int) for i in rep) + isinstance(rep, tuple) and (len(rep) == 2) and all(isinstance(i, int) for i in rep) for rep in repetitions_mode_args ), ( "If `repetitions_mode_args` for 'individual' is a list of tuples, all elements must be tuples of integers of (fold_index, repeat_index) pairs" @@ -323,9 +310,7 @@ def run_experiments_new( """ if run_mode == "aws": if s3_kwargs is None: - raise ValueError( - f"s3_kwargs parameter is required when mode is 'aws', got {s3_kwargs}" - ) + 
raise ValueError(f"s3_kwargs parameter is required when mode is 'aws', got {s3_kwargs}") if s3_kwargs.get("bucket") is None or s3_kwargs.get("bucket") == "": raise ValueError( f"bucket parameter in s3_kwargs is required when mode is 'aws', got {s3_kwargs.get('bucket')}" @@ -334,9 +319,7 @@ def run_experiments_new( elif run_mode == "local": base_cache_path = output_dir else: - raise ValueError( - f"Invalid mode: {run_mode}. Supported modes are 'local' and 'aws'." - ) + raise ValueError(f"Invalid mode: {run_mode}. Supported modes are 'local' and 'aws'.") assert all(isinstance(exp, Experiment) for exp in model_experiments), ( "All `model_experiments` elements must be instances of Experiment class" @@ -373,23 +356,15 @@ def run_experiments_new( task, tabarena_task_name, eval_metric_name = None, None, None print(f"Starting Dataset {dataset_index + 1}/{len(tasks)}...") - for split_index, (fold, repeat) in enumerate( - fold_repeat_pairs_per_task[dataset_index], start=1 - ): - subtask_cache_name = ExperimentBatchRunner._subtask_name( - fold=fold, repeat=repeat - ) + for split_index, (fold, repeat) in enumerate(fold_repeat_pairs_per_task[dataset_index], start=1): + subtask_cache_name = ExperimentBatchRunner._subtask_name(fold=fold, repeat=repeat) print( f"Starting Split {split_index}/{len(fold_repeat_pairs_per_task[dataset_index])} (Fold {fold}, Repeat {repeat})..." ) for me_index, model_experiment in enumerate(model_experiments, start=1): cur_experiment_idx += 1 - cache_task_key = ( - task_id_or_object - if isinstance(task_id_or_object, int) - else task_id_or_object.task_id - ) + cache_task_key = task_id_or_object if isinstance(task_id_or_object, int) else task_id_or_object.task_id print( f"Starting Model {me_index}/{len(model_experiments)}..." 
f"\n\t" @@ -405,9 +380,7 @@ def run_experiments_new( cache_name = "results" cache_prefix = f"data/{model_experiment.name}/{cache_task_key}/{subtask_cache_name}" cache_path = f"{base_cache_path}/{cache_prefix}" - cacher = CacheFunctionPickle( - cache_name=cache_name, cache_path=cache_path - ) + cacher = CacheFunctionPickle(cache_name=cache_name, cache_path=cache_path) cache_exists = cacher.exists # Check cache state @@ -420,13 +393,9 @@ def run_experiments_new( if cache_mode == "only": out = cacher.load_cache() else: - if (task is None) and ( - (cache_mode == "ignore") or (not cache_exists) - ): + if (task is None) and ((cache_mode == "ignore") or (not cache_exists)): if isinstance(task_id_or_object, int): - if (s3_kwargs is not None) and ( - "dataset_cache" in s3_kwargs - ): + if (s3_kwargs is not None) and ("dataset_cache" in s3_kwargs): assert isinstance(s3_kwargs["dataset_cache"], str), ( "'s3_kwargs `dataset_cache` must be a str!" ) @@ -435,9 +404,7 @@ def run_experiments_new( s3_dataset_cache=s3_kwargs["dataset_cache"], ) else: - task = OpenMLTaskWrapper.from_task_id( - task_id=task_id_or_object - ) + task = OpenMLTaskWrapper.from_task_id(task_id=task_id_or_object) # TODO: maybe add a prefix to this. tabarena_task_name = task.task.get_dataset().name else: @@ -445,6 +412,7 @@ def run_experiments_new( task = OpenMLTaskWrapper( task=task_id_or_object.load_local_openml_task(), use_task_eval_metric=True, + lazy_load_data=True, ) eval_metric_name = task.eval_metric @@ -455,6 +423,13 @@ def run_experiments_new( from tabarena.benchmark.experiment.experiment_constructor import ( AGModelBagExperiment, ) + from tabarena.benchmark.task.user_task import TabArenaOpenMLSupervisedTask + + if not isinstance(task.task, TabArenaOpenMLSupervisedTask): + raise ValueError( + "`dynamic_tabarena_validation_protocol` is only " + "implemented for `TabArenaOpenMLSupervisedTask`!" 
+ ) if not isinstance(model_experiment, AGModelBagExperiment): # TODO: add support @@ -469,6 +444,20 @@ def run_experiments_new( **task.get_validation_split_kwargs(), ) + # FIXME: move this somewhere else and allow to enable/disable this. + # Load text cache into memory for the current task + from tabarena.benchmark.preprocessing.text_feature_generators import ( + SemanticTextFeatureGenerator, + ) + + cache_path = SemanticTextFeatureGenerator.get_text_cache_dir(task_id_str=str(task.task_id)) + if cache_path.exists(): + print("[LOADING TEXT CACHE] Loading text embedding cache into memory...") + SemanticTextFeatureGenerator._embedding_look_up = ( + SemanticTextFeatureGenerator.load_embedding_cache(path=cache_path) + ) + SemanticTextFeatureGenerator.only_load_from_cache = True + try: out = model_experiment.run( task=task, @@ -491,10 +480,7 @@ def run_experiments_new( # Safety check for results with non-finite metric errors if (out is not None) and ( - not ( - np.isfinite(out["metric_error"]) - and np.isfinite(out["metric_error_val"]) - ) + not (np.isfinite(out["metric_error"]) and np.isfinite(out["metric_error_val"])) ): print( "Non-finite final metric error detected: " diff --git a/tabarena/tabarena/benchmark/models/ag/ebm/ebm_model.py b/tabarena/tabarena/benchmark/models/ag/ebm/ebm_model.py index 4928868a6..6f91604a3 100644 --- a/tabarena/tabarena/benchmark/models/ag/ebm/ebm_model.py +++ b/tabarena/tabarena/benchmark/models/ag/ebm/ebm_model.py @@ -113,7 +113,6 @@ def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int: problem_type=self.problem_type, num_classes=self.num_classes, hyperparameters=self._get_model_params(), - features=self._features, **kwargs, ) diff --git a/tabarena/tabarena/benchmark/models/ag/knn_new/knn_model.py b/tabarena/tabarena/benchmark/models/ag/knn_new/knn_model.py index 290d805fd..4449a6393 100644 --- a/tabarena/tabarena/benchmark/models/ag/knn_new/knn_model.py +++ b/tabarena/tabarena/benchmark/models/ag/knn_new/knn_model.py @@ 
-86,3 +86,18 @@ def _fit(self, X, y, num_cpus=-1, time_limit=None, sample_weight=None, **kwargs) self.model = self._get_model_type()(**params).fit(X, y) else: self.model = self._fit_with_samples(X=X, y=y, model_params=params, time_limit=time_limit - (time.time() - time_start)) + + # Higher mem_error_threshold of 0.4 for TabArena. + def _validate_fit_memory_usage( + self, + mem_error_threshold: float = 0.4, + mem_warning_threshold: float = 0.35, + mem_size_threshold: int = 1e7, + **kwargs, + ): + return super()._validate_fit_memory_usage( + mem_error_threshold=mem_error_threshold, + mem_warning_threshold=mem_warning_threshold, + mem_size_threshold=mem_size_threshold, + **kwargs, + ) \ No newline at end of file diff --git a/tabarena/tabarena/benchmark/models/ag/realmlp/realmlp_model.py b/tabarena/tabarena/benchmark/models/ag/realmlp/realmlp_model.py index b5c720bdf..65d33aaf9 100644 --- a/tabarena/tabarena/benchmark/models/ag/realmlp/realmlp_model.py +++ b/tabarena/tabarena/benchmark/models/ag/realmlp/realmlp_model.py @@ -4,17 +4,16 @@ import logging import time from contextlib import contextmanager -from typing import TYPE_CHECKING, Literal +from typing import Literal import pandas as pd +import numpy as np from autogluon.common.utils.resource_utils import ResourceManager -from autogluon.core.models import AbstractModel +from autogluon.tabular.models.abstract.abstract_torch_model import AbstractTorchModel from sklearn.impute import SimpleImputer from autogluon.tabular import __version__ -if TYPE_CHECKING: - import numpy as np logger = logging.getLogger(__name__) @@ -31,7 +30,7 @@ def set_logger_level(logger_name: str, level: int): # pip install pytabkit -class RealMLPModel(AbstractModel): +class RealMLPModel(AbstractTorchModel): """RealMLP is an improved multilayer perception (MLP) model through a bag of tricks and better default hyperparameters. 
@@ -56,6 +55,8 @@ def __init__(self, **kwargs): self._indicator_columns = None self._features_bool = None self._bool_to_cat = None + self._cat_col_names = None + self._category_mapping = None def get_model_cls(self, default_hyperparameters: Literal["td", "td_s"] = "td"): from pytabkit import ( @@ -77,6 +78,12 @@ def get_model_cls(self, default_hyperparameters: Literal["td", "td_s"] = "td"): model_cls = RealMLP_TD_S_Regressor return model_cls + def get_device(self) -> str: + return self.model.device + + def _set_device(self, device: str): + self.model.to(device) + def _fit( self, X: pd.DataFrame, @@ -190,8 +197,7 @@ def _fit( # FIXME: In rare cases can cause exceptions if name_categories=False, unknown why extra_fit_kwargs = {} if name_categories: - cat_col_names = X.select_dtypes(include="category").columns.tolist() - extra_fit_kwargs["cat_col_names"] = cat_col_names + extra_fit_kwargs["cat_col_names"] = self._cat_col_names if X_val is not None: X_val = self.preprocess(X_val) @@ -274,6 +280,28 @@ def _preprocess( if self._bool_to_cat and self._features_bool: # FIXME: Use CategoryFeatureGenerator? Or tell the model which is category X[self._features_bool] = X[self._features_bool].astype("category") + + if is_train: + self._cat_col_names = X.select_dtypes(include="category").columns.tolist() + + # Avoid bad dtype for cat categories in later ordinal encoding. + # Maps unseen categories to a new high integer. 
+ if self._cat_col_names is not None: + if self._category_mapping is None: + self._category_mapping = {} + for col in self._cat_col_names: + cats = X[col].cat.categories + self._category_mapping[col] = {cat: code for code, cat in enumerate(cats)} + + if self._category_mapping is not None: + for col in self._cat_col_names: + mapping = self._category_mapping[col] + unseen_code = len(mapping) + nan_mask = X[col].isna() + X[col] = X[col].astype(object) + X[col] = X[col].map(mapping).fillna(unseen_code).astype(int).astype("category") + X.loc[nan_mask, col] = np.nan + return X def _set_default_params(self): @@ -330,6 +358,7 @@ def _estimate_memory_usage_static( X: pd.DataFrame, hyperparameters: dict | None = None, num_classes: int = 1, + overhead_for_large_data: float = 1.5, **kwargs, ) -> int: """RealMLP memory estimation logic.""" @@ -373,7 +402,13 @@ def _estimate_memory_usage_static( res = alg_interface.get_required_resources( ds, n_cv=1, n_refit=0, n_splits=1, split_seeds=[0], n_train=n_samples ) - return int(res.gpu_ram_gb * 1e9) + + est = int(res.gpu_ram_gb * 1e9) + + if n_samples > 250_000: + est = int(est * overhead_for_large_data) + + return est def _validate_fit_memory_usage(self, mem_error_threshold: float = 1, **kwargs): return super()._validate_fit_memory_usage( diff --git a/tabarena/tabarena/benchmark/models/ag/sap_rpt_oss/sap_rpt_oss_model.py b/tabarena/tabarena/benchmark/models/ag/sap_rpt_oss/sap_rpt_oss_model.py index 9e94b89cd..074b6845e 100644 --- a/tabarena/tabarena/benchmark/models/ag/sap_rpt_oss/sap_rpt_oss_model.py +++ b/tabarena/tabarena/benchmark/models/ag/sap_rpt_oss/sap_rpt_oss_model.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING from autogluon.common.utils.resource_utils import ResourceManager -from autogluon.core.models import AbstractModel +from autogluon.tabular.models.abstract.abstract_torch_model import AbstractTorchModel if TYPE_CHECKING: import pandas as pd @@ -14,7 +14,7 @@ # FIXME: model is for some reason super slow for 200 
features and 50k samples (363616) -class SAPRPTOSSModel(AbstractModel): +class SAPRPTOSSModel(AbstractTorchModel): """ConTextTab Model: https://github.com/SAP-samples/sap-rpt-1-oss.""" ag_key = "SAP-RPT-OSS" @@ -69,11 +69,17 @@ def _set_default_params(self): "checkpoint": "2025-11-04_sap-rpt-one-oss.pt", "max_context_size": 8192, "bagging": 8, - "test_chunk_size": 1000, # TODO, optimize based on dataset/VRAM? + "test_chunk_size": 4000, # TODO, optimize based on dataset/VRAM? } for param, val in default_params.items(): self._set_default_param_value(param, val) + def get_device(self) -> str: + return self.model.model.device + + def _set_device(self, device: str): + self.model.model.to(device) + @classmethod def supported_problem_types(cls) -> list[str] | None: return ["binary", "multiclass", "regression"] @@ -90,7 +96,7 @@ def get_minimum_resources( ) -> dict[str, int | float]: return { "num_cpus": 1, - "num_gpus": 1 if is_gpu_available else 0, + "num_gpus": 0.5 if is_gpu_available else 0, } @classmethod @@ -100,7 +106,6 @@ def _get_default_ag_args_ensemble(cls, **kwargs) -> dict: """ default_ag_args_ensemble = super()._get_default_ag_args_ensemble(**kwargs) extra_ag_args_ensemble = { - "fold_fitting_strategy": "sequential_local", "refit_folds": True, } default_ag_args_ensemble.update(extra_ag_args_ensemble) diff --git a/tabarena/tabarena/benchmark/models/ag/tabdpt/tabdpt_model.py b/tabarena/tabarena/benchmark/models/ag/tabdpt/tabdpt_model.py index dea8d2e23..c2c26cafe 100644 --- a/tabarena/tabarena/benchmark/models/ag/tabdpt/tabdpt_model.py +++ b/tabarena/tabarena/benchmark/models/ag/tabdpt/tabdpt_model.py @@ -4,7 +4,7 @@ from autogluon.common.utils.resource_utils import ResourceManager from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION -from autogluon.core.models import AbstractModel +from autogluon.tabular.models.abstract.abstract_torch_model import AbstractTorchModel from autogluon.features.generators import LabelEncoderFeatureGenerator if 
TYPE_CHECKING: @@ -12,8 +12,7 @@ import pandas as pd -# FIXME: Add CPU loading support (.to(device)) -class TabDPTModel(AbstractModel): +class TabDPTModel(AbstractTorchModel): ag_key = "TA-TABDPT" ag_name = "TA-TabDPT" seed_name = "seed" @@ -23,6 +22,8 @@ def __init__(self, **kwargs): super().__init__(**kwargs) self._feature_generator = None self._predict_hps = None + self._use_flash_og = None + def _fit( self, @@ -84,6 +85,23 @@ def _use_flash() -> bool: return capability != (7, 5) + def _post_fit(self, **kwargs): + super()._post_fit(**kwargs) + self._use_flash_og = self.model.use_flash + return self + + def get_device(self) -> str: + return self.model.device + + def _set_device(self, device: str): + self.model.to(device) + if device == "cpu": + self.model.use_flash = False + self.model.model.use_flash = False + else: + self.model.use_flash = self._use_flash_og + self.model.model.use_flash = self._use_flash_og + def _get_default_resources(self) -> tuple[int, int]: # Use only physical cores for better performance based on benchmarks num_cpus = ResourceManager.get_cpu_count(only_physical_cores=True) @@ -97,7 +115,7 @@ def get_minimum_resources( ) -> dict[str, int | float]: return { "num_cpus": 1, - "num_gpus": 1 if is_gpu_available else 0, + "num_gpus": 0.5 if is_gpu_available else 0, } def _predict_proba(self, X, **kwargs) -> np.ndarray: @@ -133,8 +151,46 @@ def _more_tags(self) -> dict: def _get_default_ag_args_ensemble(cls, **kwargs) -> dict: default_ag_args_ensemble = super()._get_default_ag_args_ensemble(**kwargs) extra_ag_args_ensemble = { - "fold_fitting_strategy": "sequential_local", "refit_folds": True, } default_ag_args_ensemble.update(extra_ag_args_ensemble) return default_ag_args_ensemble + + # FIXME: This is copied from TabPFN, but TabDPT is not the same + @classmethod + def _estimate_memory_usage_static( + cls, + *, + X: pd.DataFrame, + hyperparameters: dict | None = None, + **kwargs, + ) -> int: + """Heuristic memory estimate based on TabPFN's memory 
estimate logic in: + https://github.com/PriorLabs/TabPFN/blob/57a2efd3ebdb3886245e4d097cefa73a5261a969/src/tabpfn/model/memory.py#L147. + + This is based on GPU memory usage, but hopefully with overheads it also approximates CPU memory usage. + """ + # TODO: update, this is not correct anymore, consider using internal TabPFN functions directly. + features_per_group = 3 # Based on TabPFNv2 default (unused) + n_layers = 12 # Based on TabPFNv2 default + embedding_size = 192 # Based on TabPFNv2 default + dtype_byte_size = 2 # Based on TabPFNv2 default + + model_mem = 14489108 # Based on TabPFNv2 default + + n_samples, n_features = X.shape[0], min(X.shape[1], 500) + n_feature_groups = (n_features) / features_per_group + 1 # TODO: Unsure how to calculate this + + X_mem = n_samples * n_feature_groups * dtype_byte_size + activation_mem = n_samples * n_feature_groups * embedding_size * n_layers * dtype_byte_size + + baseline_overhead_mem_est = 1e9 # 1 GB generic overhead + + # Add some buffer to each term + 1 GB overhead to be safe + memory_estimate = model_mem + 4 * X_mem + 2 * activation_mem + baseline_overhead_mem_est + + # TabDPT memory estimation is very inaccurate because it is using TabPFN memory estimate. Double it to be safe. 
+ memory_estimate = memory_estimate * 2 + + # Note: This memory estimate is way off if `context_size` is not None + return int(memory_estimate) diff --git a/tabarena/tabarena/benchmark/models/ag/tabicl/tabicl_model.py b/tabarena/tabarena/benchmark/models/ag/tabicl/tabicl_model.py index d6e9dea21..8c87c6770 100644 --- a/tabarena/tabarena/benchmark/models/ag/tabicl/tabicl_model.py +++ b/tabarena/tabarena/benchmark/models/ag/tabicl/tabicl_model.py @@ -5,7 +5,7 @@ from autogluon.common.utils.pandas_utils import get_approximate_df_mem_usage from autogluon.common.utils.resource_utils import ResourceManager -from autogluon.core.models import AbstractModel +from autogluon.tabular.models.abstract.abstract_torch_model import AbstractTorchModel from autogluon.tabular import __version__ if TYPE_CHECKING: @@ -14,7 +14,7 @@ logger = logging.getLogger(__name__) -class TabICLModelBase(AbstractModel): +class TabICLModelBase(AbstractTorchModel): """TabICL is a foundation model for tabular data using in-context learning that is scalable to larger datasets than TabPFNv2. It is pretrained purely on synthetic data. TabICL currently only supports classification tasks. 
@@ -238,6 +238,19 @@ def _set_default_params(self): for param, val in default_params.items(): self._set_default_param_value(param, val) + def get_device(self) -> str: + return self.model.device_.type + + # TODO: Better to have an official TabICL method for this + def _set_device(self, device: str): + device = self.to_torch_device(device) + self.model.device_ = device + self.model.device = self.model.device_.type + self.model.model_ = self.model.model_.to(self.model.device_) + self.model.inference_config_.COL_CONFIG.device = self.model.device_ + self.model.inference_config_.ROW_CONFIG.device = self.model.device_ + self.model.inference_config_.ICL_CONFIG.device = self.model.device_ + class TabICLv2Model(TabICLModelBase): """TabICLv2 model as used on TabArena.""" diff --git a/tabarena/tabarena/benchmark/models/ag/tabm/_tabm_internal.py b/tabarena/tabarena/benchmark/models/ag/tabm/_tabm_internal.py index 58e164165..609aa7258 100644 --- a/tabarena/tabarena/benchmark/models/ag/tabm/_tabm_internal.py +++ b/tabarena/tabarena/benchmark/models/ag/tabm/_tabm_internal.py @@ -43,7 +43,7 @@ def get_tabm_auto_batch_size(n_train: int) -> int: return 256 if n_train < 108_000: return 512 - return 1024 + return 768 # Adjust to be lower to fit on 80 GB for very large datasets. 
class RTDLQuantileTransformer(BaseEstimator, TransformerMixin): diff --git a/tabarena/tabarena/benchmark/models/ag/tabm/tabm_model.py b/tabarena/tabarena/benchmark/models/ag/tabm/tabm_model.py index 3f974d5d3..2895abff9 100644 --- a/tabarena/tabarena/benchmark/models/ag/tabm/tabm_model.py +++ b/tabarena/tabarena/benchmark/models/ag/tabm/tabm_model.py @@ -10,14 +10,14 @@ import pandas as pd from autogluon.common.utils.resource_utils import ResourceManager -from autogluon.core.models import AbstractModel +from autogluon.tabular.models.abstract.abstract_torch_model import AbstractTorchModel from autogluon.tabular import __version__ logger = logging.getLogger(__name__) -class TabMModel(AbstractModel): +class TabMModel(AbstractTorchModel): """TabM is an efficient ensemble of MLPs that is trained simultaneously with mostly shared parameters. TabM is one of the top performing methods overall on TabArena-v0.1: https://tabarena.ai @@ -143,6 +143,14 @@ def _preprocess( return X + def get_device(self) -> str: + return self.model.device_.type + + def _set_device(self, device: str): + device = self.to_torch_device(device) + self.model.device_ = device + self.model.model_ = self.model.model_.to(device) + def _set_default_params(self): default_params = dict( random_state=0, @@ -260,9 +268,16 @@ def _estimate_tabm_ram( mem_ds = n_samples * (4 * n_numerical + 8 * len(cat_sizes)) # some safety constants and offsets (the 5 is probably excessive) - return ( + res = ( 5 * mem_ds + 1.2 * mem_forward_backward + 1.2 * mem_params + 0.3 * (1024**3) ) + # Safety overhead + res = res * 1.5 + logger.log( + 40, + f"\tEstimated memory usage {res/1e9:4}.", + ) + return res @classmethod def get_tabm_auto_batch_size(cls, n_samples: int) -> int: @@ -277,7 +292,7 @@ def get_tabm_auto_batch_size(cls, n_samples: int) -> int: return 256 if n_samples < 108_000: return 512 - return 1024 + return 768 # Adjust to be lower to fit on 80 GB for very large datasets. 
@classmethod def _class_tags(cls): diff --git a/tabarena/tabarena/benchmark/models/ag/tabpfnv2_5/tabpfnv2_5_model.py b/tabarena/tabarena/benchmark/models/ag/tabpfnv2_5/tabpfnv2_5_model.py index 688e8c2d9..7404f023a 100644 --- a/tabarena/tabarena/benchmark/models/ag/tabpfnv2_5/tabpfnv2_5_model.py +++ b/tabarena/tabarena/benchmark/models/ag/tabpfnv2_5/tabpfnv2_5_model.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING from autogluon.common.utils.resource_utils import ResourceManager -from autogluon.core.models import AbstractModel +from autogluon.tabular.models.abstract.abstract_torch_model import AbstractTorchModel from autogluon.features.generators import LabelEncoderFeatureGenerator if TYPE_CHECKING: @@ -17,7 +17,7 @@ _HAS_LOGGED_TABPFN_LICENSE: bool = False -class TabPFNModel(AbstractModel): +class TabPFNModel(AbstractTorchModel): """TabPFN-2.5 is a tabular foundation model that is developed and maintained by PriorLabs: https://priorlabs.ai/. This class is an abstract template for various TabPFN versions as subclasses. 
@@ -267,6 +267,12 @@ def _set_default_params(self): for param, val in default_params.items(): self._set_default_param_value(param, val) + def get_device(self) -> str: + return self.model.devices_[0].type + + def _set_device(self, device: str): + self.model.to(device) + @classmethod def supported_problem_types(cls) -> list[str] | None: return ["binary", "multiclass", "regression"] diff --git a/tabarena/tabarena/benchmark/models/wrapper/AutoGluon_class.py b/tabarena/tabarena/benchmark/models/wrapper/AutoGluon_class.py index 9efc1c992..928b2c641 100644 --- a/tabarena/tabarena/benchmark/models/wrapper/AutoGluon_class.py +++ b/tabarena/tabarena/benchmark/models/wrapper/AutoGluon_class.py @@ -50,14 +50,11 @@ def _resolve_validation_protocol( init_kwargs = copy.deepcopy(self.init_kwargs) fit_kwargs = copy.deepcopy(self.fit_kwargs) - # TODO: think about if we can reset the index here without breaking simulation artifacts - train_data = X.copy() - num_folds = fit_kwargs.pop("num_bag_folds", None) num_repeats = fit_kwargs.pop("num_bag_folds", None) custom_splits, num_folds, num_repeats = self.resolve_validation_splits( - X=train_data.reset_index(drop=True), + X=X.reset_index(drop=True), y=y.reset_index(drop=True), num_folds=num_folds, num_repeats=num_repeats @@ -89,9 +86,18 @@ def _resolve_validation_protocol( feature_generator_kwargs[param] = value fit_kwargs["feature_generator"] = feature_generator_cls(**feature_generator_kwargs) + # TODO: think about if we can reset the index here without breaking simulation artifacts + if self._can_use_data_in_place: + train_data = X + if X_val is not None: + tuning_data = X_val + else: + train_data = X.copy() + if X_val is not None: + tuning_data = X_val.copy() + train_data[self.label] = y if X_val is not None: - tuning_data = X_val.copy() tuning_data[self.label] = y_val fit_kwargs["tuning_data"] = tuning_data @@ -115,7 +121,6 @@ def _fit( X_val=X_val, y_val=y_val, ) - del X, y, X_val, y_val # encourage memory release self.predictor = 
TabularPredictor( label=self.label, diff --git a/tabarena/tabarena/benchmark/models/wrapper/abstract_class.py b/tabarena/tabarena/benchmark/models/wrapper/abstract_class.py index a86eba329..900e37af3 100644 --- a/tabarena/tabarena/benchmark/models/wrapper/abstract_class.py +++ b/tabarena/tabarena/benchmark/models/wrapper/abstract_class.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import Callable + import numpy as np import pandas as pd from autogluon.core.data.label_cleaner import LabelCleaner, LabelCleanerDummy @@ -9,7 +11,6 @@ from tabarena.utils.time_utils import Timer from tabarena.benchmark.models.wrapper.validation_utils import TabArenaValidationProtocolExecMixin - class AbstractExecModel(TabArenaValidationProtocolExecMixin): can_get_error_val = False can_get_oof = False @@ -44,6 +45,7 @@ def __init__( self._feature_generator = None self.failure_artifact = None self.shuffle_features = shuffle_features + self._can_use_data_in_place = False def transform_y(self, y: pd.Series) -> pd.Series: return self.label_cleaner.transform(y) @@ -76,42 +78,28 @@ def _preprocess_fit_transform(self, X: pd.DataFrame, y: pd.Series): def post_fit(self, X: pd.DataFrame, y: pd.Series, X_test: pd.DataFrame): pass - def fit_custom(self, X: pd.DataFrame, y: pd.Series, X_test: pd.DataFrame, *, split_seed: int | None = None) -> dict: - og_index = X_test.index - inv_perm = None - - if self.shuffle_test: - perm, inv_perm = _make_perm(len(X_test), seed=self.shuffle_seed) - X_test = X_test.iloc[perm] - if self.reset_index_test: - X_test = X_test.reset_index(drop=True) - if self.shuffle_features: - assert split_seed is not None, "If shuffle_features is True, split_seed must not be None!" 
- features = list(X.columns) - rng = np.random.default_rng(seed=split_seed) - rng.shuffle(features) - X, X_test = X[features], X_test[features] - - out = self._fit_custom(X=X, y=y, X_test=X_test) - - if self.shuffle_test: - # Inverse-permute outputs back to original X_test order - out["predictions"] = _apply_inv_perm(out["predictions"], inv_perm, index=og_index) - if out["probabilities"] is not None: - out["probabilities"] = _apply_inv_perm(out["probabilities"], inv_perm, index=og_index) - elif self.reset_index_test: - out["predictions"].index = og_index - if out["probabilities"] is not None: - out["probabilities"].index = og_index - - return out - # TODO: Prateek, Add a toggle here to see if user wants to fit or fit and predict, also add model saving functionality # TODO: Nick: Temporary name - def _fit_custom(self, X: pd.DataFrame, y: pd.Series, X_test: pd.DataFrame) -> dict: + def fit_custom( + self, + X: pd.DataFrame | None, + y: pd.Series | None, + X_test: pd.DataFrame | None, + *, + split_seed: int | None = None, + lazy_load_function: Callable | None = None + ) -> dict: """ Calls the fit function of the inheriting class and proceeds to perform predictions based on the problem type + Arguments + --------- + split_seed: + If not None, the seed that is different per split to use for shuffling features. + lazy_load_function: + If not None, a function that one can call to load X, y, X_test lazily (e.g. to save memory by not + loading them until needed). If provided, X, y, and X_test arguments must be None. 
+ Returns ------- dict @@ -119,9 +107,38 @@ def _fit_custom(self, X: pd.DataFrame, y: pd.Series, X_test: pd.DataFrame) -> di """ from tabarena.utils.memory_utils import CpuMemoryTracker, GpuMemoryTracker + if lazy_load_function is not None: + assert X is None and y is None and X_test is None, "If lazy_load_function is provided, X and y must be None" + X, y, _ = lazy_load_function() + self._can_use_data_in_place = True + + shuffled_features = None + if self.shuffle_features: + assert split_seed is not None, "If shuffle_features is True, split_seed must not be None!" + shuffled_features = list(X.columns) + rng = np.random.default_rng(seed=split_seed) + rng.shuffle(shuffled_features) + X = X[shuffled_features] + with CpuMemoryTracker() as cpu_tracker, GpuMemoryTracker(device=0) as gpu_tracker, Timer() as timer_fit: self.fit(X, y) + # Reload all, allows X,y to be used in-place + if lazy_load_function is not None: + del X, y, X_test # Free memory from previous load + X, y, X_test = lazy_load_function() + + og_index = X_test.index + inv_perm = None + if self.shuffle_test: + perm, inv_perm = _make_perm(len(X_test), seed=self.shuffle_seed) + X_test = X_test.iloc[perm] + if self.reset_index_test: + X_test = X_test.reset_index(drop=True) + if shuffled_features is not None: + X_test = X_test[shuffled_features] + X = X[shuffled_features] + self.post_fit(X=X, y=y, X_test=X_test) if self.problem_type in ["binary", "multiclass"]: @@ -152,6 +169,16 @@ def _fit_custom(self, X: pd.DataFrame, y: pd.Series, X_test: pd.DataFrame) -> di gpu_tracking_enabled=gpu_tracker.enabled, ) + if self.shuffle_test: + # Inverse-permute outputs back to original X_test order + out["predictions"] = _apply_inv_perm(out["predictions"], inv_perm, index=og_index) + if out["probabilities"] is not None: + out["probabilities"] = _apply_inv_perm(out["probabilities"], inv_perm, index=og_index) + elif self.reset_index_test: + out["predictions"].index = og_index + if out["probabilities"] is not None: + 
out["probabilities"].index = og_index + return out def fit(self, X: pd.DataFrame, y: pd.Series, X_val=None, y_val=None): diff --git a/tabarena/tabarena/benchmark/models/wrapper/validation_utils.py b/tabarena/tabarena/benchmark/models/wrapper/validation_utils.py index e6b24d841..54b752781 100644 --- a/tabarena/tabarena/benchmark/models/wrapper/validation_utils.py +++ b/tabarena/tabarena/benchmark/models/wrapper/validation_utils.py @@ -139,9 +139,9 @@ def resolve_validation_splits( stratify_on_data = None if self.stratify_on is not None: - stratify_on_data = ( - X[self.stratify_on] if self.stratify_on in X.columns else y - ) + stratify_on_data = X[self.stratify_on] if self.stratify_on in X.columns else y + # Enforce categorical dtype for stratification column, as some splitting logic relies on it. + stratify_on_data = stratify_on_data.astype("category") groups_data = None group_labels = None @@ -149,14 +149,10 @@ def resolve_validation_splits( raise NotImplementedError if self.time_on is not None: - groups_data, num_folds_new = self.time_on_to_groups_data( - X=X, time_on=self.time_on, num_folds=num_folds - ) + groups_data, num_folds_new = self.time_on_to_groups_data(X=X, time_on=self.time_on, num_folds=num_folds) num_repeats = 1 logger.info( - f"\n\tFolds time-based grouping: before={num_folds}; " - f"after={num_folds_new}" - f"\n\tnum_repeats set to 1!" + f"\n\tFolds time-based grouping: before={num_folds}; after={num_folds_new}\n\tnum_repeats set to 1!" ) num_folds = num_folds_new # Set group labels as needed for time split @@ -167,6 +163,19 @@ def resolve_validation_splits( group_labels = self.group_labels if groups_data is not None: + if num_repeats is None: + num_repeats = 1 + + n_groups = groups_data.nunique() + if n_groups < num_folds: + logger.info( + f"Number of unique groups in the data ({n_groups}) is less than the " + f"number of folds ({num_folds})! 
Adjusting the number of folds to be equal to the number of " + f"unique groups, and setting num_repeats to 1." + ) + num_folds = n_groups + num_repeats = 1 + custom_splits = self._resolve_group_splits( X=X, num_folds=num_folds, @@ -179,28 +188,54 @@ def resolve_validation_splits( # Sanity checks for custom splits if custom_splits is not None: for train_idx, test_idx in custom_splits: + + assert len(train_idx) > 0, "Train split is empty!" + assert len(test_idx) > 0, "Test split is empty!" + if stratify_on_data is not None: - stratify_values = stratify_on_data.unique() - train_stratify_values = set( - stratify_on_data.iloc[train_idx].unique() - ) + stratify_values = set(stratify_on_data.unique()) + train_stratify_values = set(stratify_on_data.iloc[train_idx].unique()) test_stratify_values = set(stratify_on_data.iloc[test_idx].unique()) - assert ( - train_stratify_values - == test_stratify_values - == set(stratify_values) - ), ( - f"Stratification values in train and test splits do not match!" + + assert train_stratify_values == stratify_values, ( + "[Missing Train Stratification Values] " + "Stratification values in train split do not match overall stratification values!" f"\n\tOverall stratification values: {stratify_values}" f"\n\tTrain stratification values: {train_stratify_values}" + ) + assert test_stratify_values.issubset(train_stratify_values), ( + "[Unseen Test Stratification Values] " + "Stratification values in test split are not a subset of train stratification values!" + f"\n\tTrain stratification values: {train_stratify_values}" f"\n\tTest stratification values: {test_stratify_values}" ) + if train_stratify_values != stratify_values: + # Check if test has all labels for binary, as metrics require it. + if len(stratify_values) == 2: + raise ValueError( + "[Binary Metric Missing Stratification Values in Test] " + "Stratification values in train and test splits do not match!" 
+                        f"\n\tOverall stratification values: {stratify_values}"
+                        f"\n\tTrain stratification values: {train_stratify_values}"
+                        f"\n\tTest stratification values: {test_stratify_values}"
+                    )
+
+                    # For multi-stratify values, a stratify value missing from the test split is
+                    # tolerated, but we warn about it.
+                    logger.warning(
+                        "[Stratification Value Missing in Test Data] "
+                        "Stratification values in train and test splits are not identical. "
+                        "This means the validation data is likely missing some classes."
+                        f"\n\tOverall stratification values: {stratify_values}"
+                        f"\n\tTrain stratification values: {train_stratify_values}"
+                        f"\n\tTest stratification values: {test_stratify_values}"
+                    )
+
         return custom_splits, num_folds, num_repeats

     def _resolve_number_of_splits(
-        self, *, num_folds: int, num_repeats: int, num_group_instances: int
-    ) -> tuple[int, int]:
+        self, *, num_folds: int, num_repeats: int | None, num_group_instances: int
+    ) -> tuple[int, int | None]:
         """Determine the number of splits we want to use.

         Parameters
@@ -213,25 +248,30 @@ def _resolve_number_of_splits(
             The number of group instances in the data.
         """
         new_num_folds, new_num_repeats = None, None
+        new_num_folds_reason, new_num_repeats_reason = "", ""

         if num_group_instances <= self.max_samples_for_tiny_data:
             new_num_folds = self.tiny_data_num_folds
             new_num_repeats = self.tiny_data_num_repeats
+            new_num_folds_reason += "Tiny data"
+            new_num_repeats_reason += "Tiny data"
         else:
             # We want these by default for all other data in our benchmark.
             assert num_folds == 8
             assert (num_repeats == 1) or (num_repeats is None)
+
+
         if new_num_folds is not None:
             logger.info(
-                f"\nUpdating num_bag_folds from {new_num_folds} to {new_num_folds}"
-                f" since number of group instances is less than num_bag_folds."
+                f"\nUpdating num_bag_folds from {num_folds} to {new_num_folds} "
+                f"because: {new_num_folds_reason}"
             )
             num_folds = new_num_folds

         if new_num_repeats is not None:
             logger.info(
-                f"\nUpdating num_bag_sets from {num_repeats} to {new_num_repeats}"
-                f" since number of group instances is less than num_bag_folds."
+                f"\nUpdating num_repeats from {num_repeats} to {new_num_repeats} "
+                f"because: {new_num_repeats_reason}"
             )
             num_repeats = new_num_repeats

@@ -314,17 +354,13 @@ def group_on_to_groups_data(*, X: pd.DataFrame, group_on: str | list[str]):
         return groups_data.copy()

     @staticmethod
-    def time_on_to_groups_data(
-        *, X: pd.DataFrame, time_on: str, num_folds: int
-    ) -> tuple[pd.Series, int]:
+    def time_on_to_groups_data(*, X: pd.DataFrame, time_on: str, num_folds: int) -> tuple[pd.Series, int]:
         """Go from time column to a group column for splits."""
         time_data = X[time_on]

         if pd.api.types.is_datetime64_any_dtype(time_data):
-            time_data = time_data.view("int64")
-        assert pd.api.types.is_numeric_dtype(time_data), (
-            "Time_on column is not datetime or numeric!"
-        )
+            time_data = time_data.astype("int64")
+        assert pd.api.types.is_numeric_dtype(time_data), "Time_on column is not datetime or numeric!"

         return split_time_index_into_intervals(
             time_data=time_data,
@@ -345,7 +381,7 @@ def _get_group_columns_to_drop(self) -> list[str]:
             cols += self.group_on if isinstance(self.group_on, list) else [self.group_on]
         return cols

-    def get_num_group_instances(self, X: pd.DataFrame):
+    def get_num_group_instances(self, X: pd.DataFrame, *, group_labels: None = None) -> int:
         """Compute the number of rows that represent how much (multi-instance) samples the data has.

         This is used to determine which splits to use.
         """
@@ -405,9 +441,7 @@ def split_time_index_into_intervals(

     n_unique = len(counts)
     if n_unique < 2:
-        raise ValueError(
-            "Need at least 2 unique time values to create at least 2 intervals."
- ) + raise ValueError("Need at least 2 unique time values to create at least 2 intervals.") actual_n_intervals = min(goal_n_intervals, n_unique) if actual_n_intervals < 2: raise ValueError("Could not create at least 2 intervals.") diff --git a/tabarena/tabarena/benchmark/preprocessing/model_agnostic_default_preprocessing.py b/tabarena/tabarena/benchmark/preprocessing/model_agnostic_default_preprocessing.py index 724c7cd59..660bbd5c2 100644 --- a/tabarena/tabarena/benchmark/preprocessing/model_agnostic_default_preprocessing.py +++ b/tabarena/tabarena/benchmark/preprocessing/model_agnostic_default_preprocessing.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import pandas as pd from autogluon.common.features.types import ( R_BOOL, @@ -31,6 +33,9 @@ ) from tabarena.benchmark.task.user_task import GroupLabelTypes +if TYPE_CHECKING: + from autogluon.common.features.feature_metadata import FeatureMetadata + # TODO: we likely need some kind of off-loading logic for text features class TabArenaModelAgnosticPreprocessing(AutoMLPipelineFeatureGenerator): @@ -39,12 +44,12 @@ class TabArenaModelAgnosticPreprocessing(AutoMLPipelineFeatureGenerator): def __init__( self, *, - enable_datetime_features: bool = False, - enable_text_ngram_features: bool = False, - enable_text_special_features: bool = True, enable_sematic_text_features: bool = True, - enable_statistical_text_features: bool = True, enable_new_datetime_features: bool = True, + enable_text_special_features: bool = False, + enable_statistical_text_features: bool = False, + enable_text_ngram_features: bool = False, + enable_datetime_features: bool = False, group_cols: str | list[str] | None = None, group_labels: GroupLabelTypes | None = None, group_time_on: str | None = None, @@ -107,6 +112,35 @@ def __init__( **kwargs, ) + def fit_transform( + self, X: pd.DataFrame, y: pd.Series | None = None, feature_metadata_in: FeatureMetadata = None, **kwargs + ) -> pd.DataFrame: + """Rename 
columns with '.' before AutoGluon stores feature metadata. + + AutoGluon's ``AbstractFeatureGenerator.fit_transform`` records ``features_in`` + from the *original* X before calling ``_fit_transform``. We must therefore + rename at the public API level so that the stored metadata matches what the + downstream generators will see. + + The ``"."`` character is reserved as the source-column separator in text + feature names produced downstream (e.g. ``TextSpecialFeatureGenerator`` + produces ``{col}.char_count``). Sanitizing raw column names here prevents + parsing ambiguity in + ``TextEmbeddingDimensionalityReductionFeatureGenerator._parse_source_column``. + """ + self._dot_rename_map_: dict[str, str] = {c: str(c).replace(".", "_") for c in X.columns if "." in str(c)} + if self._dot_rename_map_: + X = X.rename(columns=self._dot_rename_map_) + if feature_metadata_in is not None: + feature_metadata_in = feature_metadata_in.rename_features(rename_map=self._dot_rename_map_) + return super().fit_transform(X, y=y, feature_metadata_in=feature_metadata_in, **kwargs) + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """Apply the same dot-renaming as fit before passing to parent transform.""" + if self._dot_rename_map_: + X = X.rename(columns=self._dot_rename_map_) + return super().transform(X) + def _get_category_feature_generator(self): # Pass categorical columns through *without* encoding. # Cat handling is deferred to TabArenaModelSpecificPreprocessing. @@ -129,43 +163,25 @@ def _get_category_feature_generator(self): # TODO: maybe better cardinality threshold but we assume we only # run on well-curated data for now class StringFixAsTypeFeatureGenerator(AsTypeFeatureGenerator): - """Custom AsTypeFeatureGenerator to fix string dtype handling and column name sanitization. + """Custom AsTypeFeatureGenerator to fix string dtype handling. The default string detection from AutoGluon is hardcoded in a weird way. 
Thus, we overwrite it here before passing feature metadata to the rest of the pipeline. We overwrite it such that we believe the dtype of the input dataframe. - Additionally, any input column whose name contains ``"."`` is renamed so that - ``"."`` is replaced by ``"_"``. The ``"."`` character is reserved as the - source-column separator in text feature names produced downstream (e.g. - ``TextSpecialFeatureGenerator`` produces ``{col}.char_count``). Sanitizing - raw column names here prevents parsing ambiguity in - ``TextEmbeddingDimensionalityReductionFeatureGenerator._parse_source_column``. - We further adjust the original logic to better handle unseen categories or suddenly appearing - nan values at test time. - + nan values at test time: + + * **Categorical columns** — unknown category values at test time are preserved + (not silently mapped to NaN) by converting through ``object`` dtype. + * **Bool columns** — columns with exactly 2 unique values at fit time are + bool-encoded to int8 (``true_val`` → 1, else → 0). If a bool column gains + additional unique values at test time, the normal bool encoding still applies + and the unseen values are mapped to 0 (False). A warning is logged. + * **Int columns** — NaN values that appear at test time but were absent during + fit are imputed to 0. """ - def fit_transform(self, X: pd.DataFrame, y: pd.Series | None = None, **kwargs) -> pd.DataFrame: - """Rename columns with '.' before AutoGluon stores feature metadata. - - AutoGluon's ``AbstractFeatureGenerator.fit_transform`` records ``features_in`` - from the *original* X before calling ``_fit_transform``. We must therefore - rename at the public API level so that the stored metadata matches what the - parent's ``_fit_transform`` will see. - """ - self._dot_rename_map_: dict[str, str] = {c: str(c).replace(".", "_") for c in X.columns if "." 
in str(c)} - if self._dot_rename_map_: - X = X.rename(columns=self._dot_rename_map_) - return super().fit_transform(X, y=y, **kwargs) - - def transform(self, X: pd.DataFrame) -> pd.DataFrame: - """Apply the same dot-renaming as fit before passing to parent transform.""" - if self._dot_rename_map_: - X = X.rename(columns=self._dot_rename_map_) - return super().transform(X) - def _handle_nan_in_int_only_at_test_time(self, X: pd.DataFrame) -> pd.DataFrame: """Handle int features that contain null values at inference time but not at fit time. This logic is copied from the original AsTypeFeatureGenerator._transform. @@ -188,16 +204,13 @@ def _handle_nan_in_int_only_at_test_time(self, X: pd.DataFrame) -> pd.DataFrame: return X - def _handle_dtype_mismatch_at_test_time(self, X: pd.DataFrame, bool_cols_with_extra_cats: set) -> pd.DataFrame: + def _handle_dtype_mismatch_at_test_time(self, X: pd.DataFrame) -> pd.DataFrame: """Handle situation where dtypes of test data do not match those of training data. The logic is split between cat and non-cat features to avoid the issue where astype(CategoricalDtype(categories=[...])) silently maps unknown categories to NaN. By converting through object dtype first, we ensure that all values are preserved as valid categories, even if they were not seen during training. - bool_cols_with_extra_cats are excluded from non_cat_type_map because they - are still typed as int8 in _type_map_real_opt but have not been bool-encoded; - trying to astype them to int8 would silently discard the extra category values. """ # TODO: Confirm this works with sparse and other feature types! 
# FIXME: Address situation where test-time invalid type values cause crash: @@ -210,9 +223,7 @@ def _handle_dtype_mismatch_at_test_time(self, X: pd.DataFrame, bool_cols_with_ex col: dtype for col, dtype in self._type_map_real_opt.items() if isinstance(dtype, pd.CategoricalDtype) } non_cat_type_map = { - col: dtype - for col, dtype in self._type_map_real_opt.items() - if not isinstance(dtype, pd.CategoricalDtype) and col not in bool_cols_with_extra_cats + col: dtype for col, dtype in self._type_map_real_opt.items() if not isinstance(dtype, pd.CategoricalDtype) } if non_cat_type_map: try: @@ -225,38 +236,35 @@ def _handle_dtype_mismatch_at_test_time(self, X: pd.DataFrame, bool_cols_with_ex X[col] = X[col].astype(object).astype(pd.CategoricalDtype(ordered=dtype.ordered)) return X - def _handle_bool_cols_with_extra_cats_at_test_time(self, X: pd.DataFrame) -> tuple[pd.DataFrame, set]: - """Handle situation where bool columns gain extra categories at test time. + def _handle_bool_cols_with_unseen_values_at_test_time(self, X: pd.DataFrame) -> pd.DataFrame: + """Handle bool columns that gain unseen values at test time. - If a bool column gains more than the expected 2 unique non-null values at test time, - we skip bool-encoding for that column and convert it to categorical at the end of the transform method. - This is because encoding a 3rd value through the bool path (== true_val → 1, else → 0) silently maps unknown categories to 0 (false). - By skipping bool-encoding and converting to categorical, we ensure that all values are preserved as valid categories, - even if they were not seen during training. + Bool columns are always bool-encoded (``true_val`` → 1, else → 0) + regardless of whether unseen values appear. This means unseen values + are silently mapped to 0 (False), which keeps the output dtype + identical to training (int8) and avoids downstream dtype mismatches. + A warning is logged for each affected column. 
         """
-        bool_cols_with_extra_cats = {
+        bool_cols_with_unseen = {
             col for col in self._bool_features if col in X.columns and X[col].dropna().nunique() > 2
         }
-        if bool_cols_with_extra_cats:
-            saved_extra = {col: self._bool_features.pop(col) for col in bool_cols_with_extra_cats}
-            self._set_bool_features_val()
+        for col in bool_cols_with_unseen:
+            self._log(
+                level=20,
+                msg=f"WARNING: Bool column '{col}' has more than 2 unique non-null values at test time. "
+                "Unseen values will be mapped to 0 (False). "
+                "Consider passing this column with >2 values at train time to avoid bool encoding, or "
+                "force to treat this as a numerical column!",
+            )

         if self._bool_features:
             X = self._convert_to_bool(X)

-        if bool_cols_with_extra_cats:
-            self._bool_features.update(saved_extra)
-            self._set_bool_features_val()
-        return X, bool_cols_with_extra_cats
+        return X

     def _transform(self, X: pd.DataFrame) -> pd.DataFrame:
         """Override the default handling for unseen values!"""
-        # Identify bool columns that gained more than the expected 2 unique non-null values
-        # at test time. Encoding a 3rd value through the bool path (== true_val → 1, else → 0)
-        # silently maps unknown categories to 0 (false). We instead skip bool-encoding for
-        # those columns and convert them to categorical at the end of this method.
- bool_cols_with_extra_cats: set[str] = set() if self._bool_features: - X, bool_cols_with_extra_cats = self._handle_bool_cols_with_extra_cats_at_test_time(X) + X = self._handle_bool_cols_with_unseen_values_at_test_time(X) # This means we have unobserved nans/categories if self._type_map_real_opt != X.dtypes.to_dict(): @@ -264,18 +272,12 @@ def _transform(self, X: pd.DataFrame) -> pd.DataFrame: X = self._handle_nan_in_int_only_at_test_time(X) if self._type_map_real_opt: - X = self._handle_dtype_mismatch_at_test_time(X, bool_cols_with_extra_cats=bool_cols_with_extra_cats) - - # Convert bool columns that gained extra categories to categorical so that - # all values (including novel ones) are preserved rather than silently mapped to 0. - for col in bool_cols_with_extra_cats: - if col in X.columns: - X[col] = X[col].astype(object).astype(pd.CategoricalDtype(ordered=False)) + X = self._handle_dtype_mismatch_at_test_time(X) return X def _fit_transform(self, X: pd.DataFrame, **kwargs) -> tuple[pd.DataFrame, dict]: - # X arrives here with '.' already replaced by '_' (done in fit_transform above). + # X arrives here with '.' already replaced by '_' (done in TabArenaModelAgnosticPreprocessing.fit_transform). 
X, type_group_map_special = super()._fit_transform(X=X, **kwargs) found_text_cols = type_group_map_special.get("text", []) diff --git a/tabarena/tabarena/benchmark/preprocessing/model_specific_default_preprocessing.py b/tabarena/tabarena/benchmark/preprocessing/model_specific_default_preprocessing.py index 0e391753d..1c4281834 100644 --- a/tabarena/tabarena/benchmark/preprocessing/model_specific_default_preprocessing.py +++ b/tabarena/tabarena/benchmark/preprocessing/model_specific_default_preprocessing.py @@ -40,6 +40,7 @@ class TabArenaModelSpecificPreprocessing: """ hp_key_kwargs: str = "ag.model_specific_feature_generator_kwargs" + use_pca: bool = False @staticmethod def add_to_hyperparameters(hyperparameters: dict) -> dict: @@ -85,24 +86,28 @@ def get_model_specific_generator() -> list: ``S_TEXT_EMBEDDING`` / ``S_TEXT_SPECIAL`` features, grouped by source column. """ # TODO: figure out how to more easily pass IdentityFeatureGenerator / dont drop other columns. - bulk_kwargs = dict( - generators=[ - # Cat/Ordinal Encoding - [ - # The other features are consumed, and thus can be dropped. - IdentityFeatureGenerator( - infer_features_in_args=NoCatAsStringCategoryFeatureGenerator.get_infer_features_in_args_to_drop() - ), - NoCatAsStringCategoryFeatureGenerator(), - ], - # PCA + + generators = [ + [ + # The other features are consumed, and thus can be dropped. 
+ IdentityFeatureGenerator( + infer_features_in_args=NoCatAsStringCategoryFeatureGenerator.get_infer_features_in_args_to_drop() + ), + NoCatAsStringCategoryFeatureGenerator(), + ], + ] + if TabArenaModelSpecificPreprocessing.use_pca: + generators.append( [ IdentityFeatureGenerator( infer_features_in_args=TextEmbeddingDimensionalityReductionFeatureGenerator.get_infer_features_in_args_to_drop() ), TextEmbeddingDimensionalityReductionFeatureGenerator(), - ], - ], + ] + ) + + bulk_kwargs = dict( + generators=generators, verbosity=2, ) @@ -113,19 +118,8 @@ class NoCatAsStringCategoryFeatureGenerator(CategoryFeatureGenerator): """CategoryFeatureGenerator that does not treat each string column as a category. - CategoryFeatureGenerator that preserves unseen categories instead of NaN. - - At transform time, category values not seen during fit are mapped to the integer code - n (max_seen_code + 1) rather than being silently converted to NaN. This is achieved - by two cooperating changes: - - 1. ``_generate_features_category`` replaces each unseen non-NaN value with the - sentinel ``_UNSEEN_CAT`` and adds that sentinel to the category list, so the - Categorical dtype keeps the value rather than encoding it as NaN. - - 2. ``TabArenaCategoryMemoryMinimizeFeatureGenerator`` (used as the post-generator - instead of the plain ``CategoryMemoryMinimizeFeatureGenerator``) detects codes - >= n at transform time and maps them to the integer n. + CategoryFeatureGenerator that preserves unseen categories to be handled by + the downstream model instead of setting them to NaN. 
""" def __init__(self, **kwargs) -> None: diff --git a/tabarena/tabarena/benchmark/preprocessing/text_feature_generators.py b/tabarena/tabarena/benchmark/preprocessing/text_feature_generators.py index d9bb73a85..49e24b9c8 100644 --- a/tabarena/tabarena/benchmark/preprocessing/text_feature_generators.py +++ b/tabarena/tabarena/benchmark/preprocessing/text_feature_generators.py @@ -4,6 +4,8 @@ import unicodedata import warnings from collections import defaultdict +from pathlib import Path +from typing import TYPE_CHECKING, ClassVar import numpy as np import pandas as pd @@ -19,6 +21,10 @@ from sklearn.decomposition import PCA from tqdm import tqdm +if TYPE_CHECKING: + from sentence_transformers import SentenceTransformer + + # Non-printable ASCII control characters excluding whitespace (tab \x09, LF \x0a, CR \x0d). _CONTROL_CHAR_RE = re.compile(r"[\x00-\x08\x0e-\x1f\x7f]") @@ -39,26 +45,146 @@ def sanitize_text(text_data: pd.Series, fillna_str: str = "Missing Data") -> pd. ) +class TabArenaDefaultTextEncoder: + @staticmethod + def get_default_encoder(): + """Get the default sentence transformer model for encoding text features.""" + import torch + from sentence_transformers import SentenceTransformer + + return SentenceTransformer( + "Qwen/Qwen3-Embedding-8B", + truncate_dim=32, # minimal MRL dimension for Qwen3-Embedding + model_kwargs={"dtype": torch.float16, "attn_implementation": "sdpa"}, + processor_kwargs={"padding_side": "left"}, + ) + + # TODO: could optimize this much more + ideally compute on-the-fly + # Length-bucket thresholds (chars) and batch sizes, ordered from longest to shortest. + # Texts longer than the threshold get the corresponding batch size. + _LENGTH_BUCKETS: ClassVar[list[tuple[int, int]]] = [ + (20_000, 8), + (15_000, 16), + (5_000, 32), + (500, 64), + (0, 128), + ] + + @staticmethod + def encode_texts(*, texts: list[str], encoder_model: SentenceTransformer) -> np.ndarray: + """Encode texts with adaptive batch sizes based on text length. 
+ + Texts are sorted by character length and split into buckets. + Longer texts use smaller batch sizes to limit peak memory, while + shorter texts use larger batch sizes for throughput. + """ + # guess-timate + overhead for characters per token + max_chars = int(encoder_model.max_seq_length * 3) + long_texts = [t for t in texts if len(t) > max_chars] + if long_texts: + warnings.warn( + f"{len(long_texts)} of {len(texts)} text value(s) exceed " + f"~{max_chars:,} characters and may be truncated by the model's " + f"{encoder_model.max_seq_length:,}-token context window. " + f"Longest text: {max(len(t) for t in long_texts):,} characters.", + stacklevel=2, + ) + + # Sort by character length (good proxy for token length, avoids tokenization overhead). + sorted_indices = sorted(range(len(texts)), key=lambda k: len(texts[k])) + sorted_texts = [texts[k] for k in sorted_indices] + + print(f"Encoding {len(texts)} unique text values...") + print(f"\tShortest text: {len(sorted_texts[0])} chars, longest text: {len(sorted_texts[-1])} chars.") + print(f"\tAverage text length: {sum(len(t) for t in sorted_texts) / len(sorted_texts):.1f} chars.") + + # Split sorted texts into length buckets and encode each with its own batch size. + buckets = TabArenaDefaultTextEncoder._LENGTH_BUCKETS + all_embs_parts: list[np.ndarray] = [] + start = len(sorted_texts) # walk backwards (longest first) + for char_threshold, batch_size in buckets: + # Find the first text that is shorter than the threshold. 
+            end = start
+            start = end
+            while start > 0 and len(sorted_texts[start - 1]) >= char_threshold:
+                start -= 1
+            bucket_texts = sorted_texts[start:end]
+            if not bucket_texts:
+                continue
+            print(f"\tBucket >={char_threshold} chars: {len(bucket_texts)} texts, batch_size={batch_size}")
+            embs = encoder_model.encode(
+                bucket_texts,
+                prompt_name="query",
+                convert_to_numpy=True,
+                normalize_embeddings=True,
+                batch_size=batch_size,
+                show_progress_bar=True,
+            )
+            all_embs_parts.append(embs)
+
+        # Reverse parts so they follow the original sorted order (shortest first).
+        all_embs_parts.reverse()
+        all_embs_sorted = np.concatenate(all_embs_parts, axis=0)
+
+        # Unsort back to original ordering.
+        return all_embs_sorted[np.argsort(sorted_indices)]
+
+    @staticmethod
+    def get_text_to_encode(*, X: pd.DataFrame, seen_texts: set[str]) -> list[str]:
+        """Collect unique sanitized text values from *X* that are not present in *seen_texts*."""
+        unseen_keys = []
+
+        # Pass 1: discover unseen text values
+        for col in tqdm(X.columns, desc="Collecting text to encode..."):
+            s = sanitize_text(X[col])
+
+            for val in s.unique():
+                if val not in seen_texts:
+                    unseen_keys.append(val)
+
+        return unseen_keys
+
+    @staticmethod
+    def get_cache_data_for_dataset(*, X: pd.DataFrame, seen_texts: set[str]) -> dict:
+        """Get the cache data for the given dataset, which is a dict mapping unseen text values to their embeddings."""
+        text_to_encode = TabArenaDefaultTextEncoder.get_text_to_encode(
+            X=X,
+            seen_texts=seen_texts,
+        )
+        if not text_to_encode:
+            return {}
+
+        new_embeddings = TabArenaDefaultTextEncoder.encode_texts(
+            texts=text_to_encode,
+            encoder_model=TabArenaDefaultTextEncoder.get_default_encoder(),
+        )
+        return dict(zip(text_to_encode, new_embeddings))
+
+
 class SemanticTextFeatureGenerator(AbstractFeatureGenerator):
-    """Create semantic text embeddings using a pre-trained sentencetransformer model."""
+    """Create semantic text embeddings using a pre-trained 
sentencetransformer model.
 
-    _embedding_look_up: dict[str, np.ndarray]
-    """Cache for the embeddings of unique text values."""
+    Uses ``Qwen/Qwen3-Embedding-8B`` with Matryoshka Representation Learning
+    (MRL) to produce compact 32-dimensional embeddings per text value.
+    """
+
+    _embedding_look_up: dict[str, np.ndarray] = {}
+    """Class-level cache for the embeddings of unique text values, shared across all instances within a process."""
 
     _expected_columns: list[str]
     """Expected columns during transform, set during fit."""
 
     _feature_names: list[str]
     """Stable feature names for the generated embedding features."""
+    only_load_from_cache: bool = False
+    """Whether to only load embeddings from cache and crash for on-the-fly encoding of unseen text values."""
 
     def _fit_transform(self, X: pd.DataFrame, **kwargs) -> tuple[pd.DataFrame, dict]:
         """See parameters of the parent class
         AbstractFeatureGenerator for more details on the parameters.
         """
-        from sentence_transformers import SentenceTransformer
-        import torch
-
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        self._encoder_model = SentenceTransformer("intfloat/e5-small-v2", device=device)
-
+        if self.only_load_from_cache:
+            self._encoder_model = None
+        else:
+            self._encoder_model = TabArenaDefaultTextEncoder.get_default_encoder()
         X_out = self._transform(X, is_train=True)
         return X_out, {S_TEXT_EMBEDDING: list(X_out.columns)}
 
@@ -66,9 +192,6 @@ def _transform(self, X: pd.DataFrame, *, is_train: bool = False) -> pd.DataFrame
         """See parameters of the parent class
         AbstractFeatureGenerator for more details on the parameters.
""" - if is_train and not hasattr(self, "_embedding_look_up"): - self._embedding_look_up = {} - n_rows = len(X) n_cols = len(X.columns) @@ -76,44 +199,27 @@ def _transform(self, X: pd.DataFrame, *, is_train: bool = False) -> pd.DataFrame if n_rows == 0 or n_cols == 0: raise ValueError("Input DataFrame is empty!") - unseen_keys: list[tuple[str, str]] = [] - seen_unseen: set[tuple[str, str]] = set() - - # Pass 1: discover unseen (col, value) - for col in tqdm(X.columns, desc="Collecting text to encode..."): - s = sanitize_text(X[col]) - - for val in s.unique(): - key = (col, val) - if key not in self._embedding_look_up and key not in seen_unseen: - seen_unseen.add(key) - unseen_keys.append(key) + # Encode text + unseen_text = TabArenaDefaultTextEncoder.get_text_to_encode(X=X, seen_texts=set(self._embedding_look_up.keys())) + if unseen_text: - # Encode unseen - if unseen_keys: - texts_to_encode = [f"query: {col} = {val}" for col, val in unseen_keys] + if self.only_load_from_cache: + raise ValueError( + "Cache miss for text values during transform with only_load_from_cache=True. " + f"Unseen text values: {unseen_text[:10]} (showing up to 10)." 
+ ) - embeddings = self._encoder_model.encode( - texts_to_encode, - convert_to_numpy=True, - normalize_embeddings=False, - batch_size=128, - show_progress_bar=True, - precision="float32", + embeddings = TabArenaDefaultTextEncoder.encode_texts( + texts=list(unseen_text), + encoder_model=self._encoder_model, ) - - self._embedding_look_up.update(zip(unseen_keys, embeddings)) + self._embedding_look_up.update(zip(unseen_text, embeddings)) # Infer embedding dimension emb_dim = len(next(iter(self._embedding_look_up.values()))) - # --- Stable feature names: source column is the prefix --- if is_train: - self._feature_names = [ - f"{col}.semantic_embedding_{i}" - for col in X.columns - for i in range(emb_dim) - ] + self._feature_names = [f"{col}.semantic_embedding_{i}" for col in X.columns for i in range(emb_dim)] self._expected_columns = list(X.columns) elif list(X.columns) != self._expected_columns: raise ValueError( @@ -122,10 +228,9 @@ def _transform(self, X: pd.DataFrame, *, is_train: bool = False) -> pd.DataFrame f"Got: {list(X.columns)}" ) + # Pass 2: build matrix (optimized for repeated values) # Preallocate semantic_embedding = np.empty((n_rows, n_cols * emb_dim), dtype=np.float32) - - # Pass 2: build matrix (optimized for repeated values) for j, col in tqdm( enumerate(X.columns), desc="Building semantic embedding matrix...", @@ -135,9 +240,7 @@ def _transform(self, X: pd.DataFrame, *, is_train: bool = False) -> pd.DataFrame arr = s.to_numpy() uniques, inverse = np.unique(arr, return_inverse=True) - unique_embs = np.vstack( - [self._embedding_look_up[(col, val)] for val in uniques] - ) + unique_embs = np.vstack([self._embedding_look_up[val] for val in uniques]) col_matrix = unique_embs[inverse] start = j * emb_dim @@ -165,6 +268,26 @@ def get_default_infer_features_in_args() -> dict: def _more_tags(self): return {"feature_interactions": True} + @staticmethod + def save_embedding_cache(cache: dict[str, np.ndarray], path: str | Path) -> None: + keys = 
list(cache.keys()) + embs = np.vstack(list(cache.values())) + df = pd.DataFrame(embs, index=pd.Index(keys, name="text")) + df.to_parquet(path) + + @staticmethod + def load_embedding_cache(path: str | Path) -> dict[str, np.ndarray]: + df = pd.read_parquet(path) + return dict(zip(df.index, df.to_numpy())) + + @staticmethod + def get_text_cache_dir(task_id_str: str) -> Path: + import openml + + base_path = (openml.config._root_cache_directory / "tabarena_text_cache").expanduser().resolve() / "text_cache" + Path(base_path).mkdir(parents=True, exist_ok=True) + + return base_path / f"{task_id_str}_cache.parquet" class StatisticalTextFeatureGenerator(AbstractFeatureGenerator): """Generate a statistical embedding of text features using skrub. @@ -177,7 +300,7 @@ class StatisticalTextFeatureGenerator(AbstractFeatureGenerator): ``SemanticTextFeatureGenerator``. """ - MAX_N_OUTPUT_FEATURES = 384 # Same as intfloat/e5-small-v2 + MAX_N_OUTPUT_FEATURES = 32 def _fit_transform(self, X: pd.DataFrame, **kwargs) -> tuple[pd.DataFrame, dict]: from skrub import StringEncoder, TableVectorizer @@ -211,9 +334,7 @@ def _transform(self, X: pd.DataFrame, *, is_train: bool = False) -> pd.DataFrame X = self._vectorizer.fit_transform(X) # TableVectorizer produces "{col}_{i}"; remap to "{col}.{i}" so that # the source column prefix is separated by "." (the project convention). - self._col_rename_map_: dict[str, str] = { - c: re.sub(r"_(\d+)$", r".\1", c) for c in X.columns - } + self._col_rename_map_: dict[str, str] = {c: re.sub(r"_(\d+)$", r".\1", c) for c in X.columns} else: X = self._vectorizer.transform(X) @@ -288,9 +409,7 @@ def _parse_source_column(feature_name: str) -> str: """ return feature_name.split(".", 1)[0] - def _make_batch_plan( - self, feature_names: list[str] - ) -> list[tuple[str, int, list[str]]]: + def _make_batch_plan(self, feature_names: list[str]) -> list[tuple[str, int, list[str]]]: """Build a PCA batch plan grouped by source column. 
Parameters @@ -317,9 +436,7 @@ def _make_batch_plan( plan.append((src_col, 0, feats)) else: max_n_int = int(max_n) - sub_batches = [ - feats[i : i + max_n_int] for i in range(0, len(feats), max_n_int) - ] + sub_batches = [feats[i : i + max_n_int] for i in range(0, len(feats), max_n_int)] for sub_idx, sub_feats in enumerate(sub_batches): plan.append((src_col, sub_idx, sub_feats)) @@ -352,14 +469,10 @@ def _transform(self, X: pd.DataFrame) -> pd.DataFrame: X_out = self._transform_inference(X) missing_output = [c for c in self.feature_names_out_ if c not in X_out.columns] if missing_output: - raise ValueError( - f"Transformed output is missing expected columns: {missing_output[:10]}" - ) + raise ValueError(f"Transformed output is missing expected columns: {missing_output[:10]}") return X_out[self.feature_names_out_] - def _fit_preprocess_and_transform( - self, X: pd.DataFrame, y: pd.Series - ) -> pd.DataFrame: + def _fit_preprocess_and_transform(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame: X = X.copy() self.pre_pca_feature_names_ = list(X) @@ -379,9 +492,7 @@ def _fit_preprocess_and_transform( transformed_batches: list[pd.DataFrame] = [] - for src_col, sub_batch_idx, batch_cols in tqdm( - batch_plan, desc="Fitting PCA batches..." 
- ): + for src_col, sub_batch_idx, batch_cols in tqdm(batch_plan, desc="Fitting PCA batches..."): X_batch = X[batch_cols] n_samples, n_features = X_batch.shape @@ -405,9 +516,7 @@ def _fit_preprocess_and_transform( ) X_pca = X_pca[:, :keep_count] - output_cols = [ - f"{src_col}.dr{sub_batch_idx}_{i}" for i in range(keep_count) - ] + output_cols = [f"{src_col}.dr{sub_batch_idx}_{i}" for i in range(keep_count)] X_pca_df = pd.DataFrame(X_pca, index=X.index, columns=output_cols) @@ -422,8 +531,7 @@ def _fit_preprocess_and_transform( X = pd.concat(transformed_batches, axis=1) self._log( 20, - f"Total PCA features generated: {X.shape[1]}" - f" from {len(self.pre_pca_feature_names_)} original features.", + f"Total PCA features generated: {X.shape[1]} from {len(self.pre_pca_feature_names_)} original features.", ) return X @@ -482,14 +590,10 @@ def _standard_scale_transform( @staticmethod def _encode_target_for_correlation(y: pd.Series) -> np.ndarray: if pd.api.types.is_numeric_dtype(y): - y_num = pd.to_numeric(y, errors="coerce").to_numpy( - dtype=np.float64, copy=False - ) + y_num = pd.to_numeric(y, errors="coerce").to_numpy(dtype=np.float64, copy=False) else: # Deterministic encoding for non-numeric labels. - y_num = pd.Series(pd.factorize(y)[0], index=y.index).to_numpy( - dtype=np.float64, copy=False - ) + y_num = pd.Series(pd.factorize(y)[0], index=y.index).to_numpy(dtype=np.float64, copy=False) if np.isnan(y_num).any(): # Fill NaNs with mean to keep correlation computation vectorized/stable. 
diff --git a/tabarena/tabarena/benchmark/task/openml/task_wrapper.py b/tabarena/tabarena/benchmark/task/openml/task_wrapper.py index e79fc2003..5aa30e953 100644 --- a/tabarena/tabarena/benchmark/task/openml/task_wrapper.py +++ b/tabarena/tabarena/benchmark/task/openml/task_wrapper.py @@ -19,10 +19,15 @@ class OpenMLTaskWrapper: - def __init__(self, task: OpenMLSupervisedTask, *, use_task_eval_metric: bool = False): + def __init__(self, task: OpenMLSupervisedTask, *, use_task_eval_metric: bool = False, lazy_load_data: bool = False): assert isinstance(task, OpenMLSupervisedTask) self.task: OpenMLSupervisedTask = task + self.lazy_load_data = lazy_load_data self.X, self.y = get_task_data(task=self.task) + self._n_rows, self._n_cols = self.X.shape + if self.lazy_load_data: + del self.X, self.y + self.problem_type = get_ag_problem_type(self.task) self.label = self.task.target_name @@ -70,7 +75,11 @@ def get_split_dimensions(self) -> tuple[int, int, int]: return n_repeats, n_folds, n_samples def combine_X_y(self) -> pd.DataFrame: - return pd.concat([self.X, self.y.to_frame(name=self.label)], axis=1) + if self.lazy_load_data: + X, y = get_task_data(task=self.task) + else: + X, y = self.X, self.y + return pd.concat([X, y.to_frame(name=self.label)], axis=1) def save_data(self, path: str, file_type='.csv', train_indices=None, test_indices=None): data = self.combine_X_y() @@ -86,8 +95,8 @@ def save_metadata(self, path: str): metadata = dict( label=self.label, problem_type=self.problem_type, - num_rows=len(self.X), - num_cols=len(self.X.columns), + num_rows=self._n_rows, + num_cols=self._n_cols, task_id=self.task.task_id, dataset_id=self.task.dataset_id, openml_url=self.task.openml_url, @@ -133,10 +142,20 @@ def get_train_test_split( ) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]: if train_indices is None or test_indices is None: train_indices, test_indices = self.get_split_indices(fold=fold, repeat=repeat, sample=sample) - X_train = self.X.loc[train_indices] - 
y_train = self.y[train_indices] - X_test = self.X.loc[test_indices] - y_test = self.y[test_indices] + + if self.lazy_load_data: + X, y = get_task_data(task=self.task) + X_train = X.loc[train_indices].copy() + y_train = y[train_indices].copy() + X_test = X.loc[test_indices].copy() + y_test = y[test_indices].copy() + del X, y + else: + X, y = self.X, self.y + X_train = X.loc[train_indices] + y_train = y[train_indices] + X_test = X.loc[test_indices] + y_test = y[test_indices] if train_size is not None: X_train, y_train = self.subsample(X=X_train, y=y_train, size=train_size, random_state=random_state) diff --git a/tabarena/tabarena/benchmark/task/user_task.py b/tabarena/tabarena/benchmark/task/user_task.py index d1f5ef47d..d848b3b2a 100644 --- a/tabarena/tabarena/benchmark/task/user_task.py +++ b/tabarena/tabarena/benchmark/task/user_task.py @@ -5,7 +5,8 @@ from collections import OrderedDict from collections.abc import Iterable from copy import deepcopy -from dataclasses import asdict, dataclass, fields, replace +from dataclasses import MISSING, asdict, dataclass, fields, replace +from enum import StrEnum from pathlib import Path from typing import Annotated, Literal @@ -26,17 +27,18 @@ OpenMLSupervisedTask, TaskType, ) -from enum import StrEnum SplitIndex = Annotated[str, "format: r{int}f{int}"] SplitTimeHorizonTypes = str | int | float SplitTimeHorizonUnitTypes = Literal["steps", "days", "weeks", "months", "years"] | str + class GroupLabelTypes(StrEnum): PER_SAMPLE = "per_sample" PER_GROUP = "per_group" + @dataclass class TabArenaTaskMetadata: """Metadata about the task to run. @@ -108,6 +110,16 @@ class TabArenaTaskMetadata: identifier of a local task (see `UserTask.task_id_str`). 
""" + # -- Feature dtype flags (added later; default to None for backward compat) -- + has_datetime: bool | None = None + """Whether the dataset contains datetime feature columns.""" + has_text: bool | None = None + """Whether the dataset contains text (string/object) feature columns.""" + has_categorical: bool | None = None + """Whether the dataset contains categorical feature columns.""" + has_numeric: bool | None = None + """Whether the dataset contains numeric feature columns.""" + @property def n_splits(self): """Get the number of splits in the task.""" @@ -130,6 +142,30 @@ def split_index(self) -> SplitIndex: ) return self.split_indices[0] + def has_supported_dtypes(self, *, required_dtypes: list[str] | None, forbidden_dtypes: list[str] | None) -> bool: + """Check if the dataset contains only allowed dtypes based on the feature dtype flags.""" + if required_dtypes is not None: + if "datetime" in required_dtypes and not self.has_datetime: + return False + if "text" in required_dtypes and not self.has_text: + return False + if "categorical" in required_dtypes and not self.has_categorical: + return False + if "numeric" in required_dtypes and not self.has_numeric: + return False + + if forbidden_dtypes is not None: + if self.has_datetime and "datetime" in forbidden_dtypes: + return False + if self.has_text and "text" in forbidden_dtypes: + return False + if self.has_categorical and "categorical" in forbidden_dtypes: + return False + if self.has_numeric and "numeric" in forbidden_dtypes: + return False + + return True + def to_dict(self, *, exclude_splits_metadata: bool = False) -> dict: """Convert the task metadata to a dictionary for better visualization.""" res = asdict(self) @@ -155,29 +191,29 @@ def from_row(row: pd.Series) -> TabArenaTaskMetadata: """Reconstruct TabArenaTaskMetadata from a single dataframe row.""" row_dict = row.to_dict() - # Identify TabArenaTaskMetadata fields (excluding splits_metadata) - task_field_names = { - f.name for f in 
fields(TabArenaTaskMetadata) if f.name != "splits_metadata" + # Identify TabArenaTaskMetadata fields (excluding splits_metadata). + # Fields with defaults are optional for backward compatibility with + # older serialized metadata that may not contain newer columns. + all_task_fields = {f.name for f in fields(TabArenaTaskMetadata) if f.name != "splits_metadata"} + required_task_fields = { + f.name + for f in fields(TabArenaTaskMetadata) + if f.name != "splits_metadata" and f.default is MISSING and f.default_factory is MISSING } - if not all(name in row_dict for name in task_field_names): + if not all(name in row_dict for name in required_task_fields): raise ValueError( "Metadata row is missing required TabArenaTaskMetadata fields: " - f"{task_field_names - row_dict.keys()}" + f"{required_task_fields - row_dict.keys()}" ) - task_kwargs = { - key: row_dict[key] for key in task_field_names if key in row_dict - } + task_kwargs = {key: row_dict[key] for key in all_task_fields if key in row_dict} # Identify SplitMetadata fields split_field_names = {f.name for f in fields(SplitMetadata)} if not all(name in row_dict for name in split_field_names): raise ValueError( - "Metadata row is missing required SplitMetadata fields: " - f"{split_field_names - row_dict.keys()}" + f"Metadata row is missing required SplitMetadata fields: {split_field_names - row_dict.keys()}" ) - split_kwargs = { - key: row_dict[key] for key in split_field_names if key in row_dict - } + split_kwargs = {key: row_dict[key] for key in split_field_names if key in row_dict} # Reconstruct SplitMetadata split_metadata = SplitMetadata(**split_kwargs) @@ -310,7 +346,9 @@ def _get_dataset_stats( # Resolve instance groups if self.group_on is not None: num_instance_groups = self.get_num_instance_groups( - X=oml_dataset, group_on=self.group_on, group_labels=self.group_labels, + X=oml_dataset, + group_on=self.group_on, + group_labels=self.group_labels, ) return ( @@ -351,9 +389,7 @@ def compute_metadata( 
splits_metadata = {} for repeat_i, splits in self.split.split.items(): for fold_i, samples_for_split in splits.items(): - assert len(samples_for_split) == 1, ( - "Only one sample per split is supported so far!." - ) + assert len(samples_for_split) == 1, "Only one sample per split is supported so far!." train_idx, test_idx = samples_for_split[0] ( @@ -390,12 +426,8 @@ def compute_metadata( if task_problem_type is None: task_problem_type = split_problem_type else: - assert task_problem_type == split_problem_type, ( - "All splits must have the same problem type." - ) - s_index = SplitMetadata.get_split_index( - repeat_i=repeat_i, fold_i=fold_i - ) + assert task_problem_type == split_problem_type, "All splits must have the same problem type." + s_index = SplitMetadata.get_split_index(repeat_i=repeat_i, fold_i=fold_i) splits_metadata[s_index] = SplitMetadata( repeat=repeat_i, fold=fold_i, @@ -418,6 +450,20 @@ def compute_metadata( max_n_classes = max(num_classes_list) class_consistency_over_splits = min_n_classes == max_n_classes + # Detect feature dtype flags (exclude target column) + feature_df = oml_dataset.drop(columns=[target_name]) + + # FIXME: make this less strict? + if len(feature_df.select_dtypes(include=["object"]).columns) > 0: + raise ValueError( + "Object dtype columns are not supported. Please convert them to string dtype or categorical dtype!" 
+ ) + + has_datetime = len(feature_df.select_dtypes(include=["datetime64"]).columns) > 0 + has_text = len(feature_df.select_dtypes(include=["string"]).columns) > 0 + has_categorical = len(feature_df.select_dtypes(include=["category"]).columns) > 0 + has_numeric = len(feature_df.select_dtypes(include=["number"]).columns) > 0 + self._task_metadata = TabArenaTaskMetadata( dataset_name=dataset_name, eval_metric=eval_metric, @@ -441,14 +487,16 @@ def compute_metadata( num_instance_groups=full_num_instance_groups, split_time_horizon=self.split_time_horizon, split_time_horizon_unit=self.split_time_horizon_unit, + has_datetime=has_datetime, + has_text=has_text, + has_categorical=has_categorical, + has_numeric=has_numeric, ) return self._task_metadata -class TabArenaOpenMLClassificationTask( - TabArenaTaskMetadataMixin, OpenMLClassificationTask -): +class TabArenaOpenMLClassificationTask(TabArenaTaskMetadataMixin, OpenMLClassificationTask): """A local OpenMLClassificationTask with additional metadata for TabArena.""" @@ -457,9 +505,7 @@ class TabArenaOpenMLRegressionTask(TabArenaTaskMetadataMixin, OpenMLRegressionTa # For typing -TabArenaOpenMLSupervisedTask = ( - TabArenaOpenMLClassificationTask | TabArenaOpenMLRegressionTask -) +TabArenaOpenMLSupervisedTask = TabArenaOpenMLClassificationTask | TabArenaOpenMLRegressionTask # Patch Functions for OpenML Dataset @@ -487,9 +533,7 @@ def __init__(self, *, task_name: str, task_cache_path: Path | None = None) -> No If None, the default OpenML cache directory is used. 
""" self.task_name = task_name - self._task_name_hash = hashlib.sha256( - self.task_name.encode("utf-8") - ).hexdigest() + self._task_name_hash = hashlib.sha256(self.task_name.encode("utf-8")).hexdigest() self._task_cache_path = task_cache_path @property @@ -497,11 +541,7 @@ def task_cache_path(self) -> Path: """Path to use for caching the local OpenML tasks.""" if self._task_cache_path is not None: return self._task_cache_path - return ( - (openml.config._root_cache_directory / "tabarena_tasks") - .expanduser() - .resolve() - ) + return (openml.config._root_cache_directory / "tabarena_tasks").expanduser().resolve() @staticmethod def from_task_id_str(task_id_str: str) -> UserTask: @@ -536,12 +576,7 @@ def _local_dataset_id(self) -> str: @property def _local_cache_path(self) -> Path: - return ( - Path(openml.config._root_cache_directory) - / "local" - / "datasets" - / self._local_dataset_id - ) + return Path(openml.config._root_cache_directory) / "local" / "datasets" / self._local_dataset_id def get_dataset_name(self, dataset_name: str | None = None) -> str: """Get the dataset name to use for the local OpenML dataset.""" @@ -636,9 +671,7 @@ def create_local_openml_task( self._validate_splits(splits=splits, n_samples=len(dataset)) task_type = ( - TaskType.SUPERVISED_CLASSIFICATION - if problem_type == "classification" - else TaskType.SUPERVISED_REGRESSION + TaskType.SUPERVISED_CLASSIFICATION if problem_type == "classification" else TaskType.SUPERVISED_REGRESSION ) extra_kwargs = {} if task_type == TaskType.SUPERVISED_CLASSIFICATION: @@ -650,23 +683,29 @@ def create_local_openml_task( raise NotImplementedError(f"Task type {task_type:d} not supported.") dataset_name = self.get_dataset_name(dataset_name=dataset_name) - print( - f"Creating local OpenML task {self.task_id} with dataset '{dataset_name}'..." 
- ) + print(f"Creating local OpenML task {self.task_id} with dataset '{dataset_name}'...") local_dataset = openml_create_datasets_without_arff_dump( name=dataset_name, data=dataset, default_target_attribute=target_feature, ) # Cache data to disk - parquet_file = self._local_cache_path / "data.pq" - parquet_file.parent.mkdir(parents=True, exist_ok=True) - dataset.to_parquet(parquet_file) + # This ensures to keep the dtypes of the original dataframe (and not lose it via parquet or similar) + # Moreover, this skips that OpenML itself has do pickle dump the dataset again. + pickle_file = self._local_cache_path / "data.pkl.py3" + pickle_file.parent.mkdir(parents=True, exist_ok=True) + with pickle_file.open("wb") as fh: + pickle.dump( + (dataset, [dataset[c].dtype.name == "category" for c in dataset.columns], list(dataset.columns)), + fh, + pickle.HIGHEST_PROTOCOL, + ) del dataset # Free memory # We only need local_dataset.get_data() from the OpenMLDataset, thus, we make # sure with the code below that get_data() returns the data. 
- local_dataset.parquet_file = parquet_file + local_dataset.data_pickle_file = pickle_file + local_dataset.cache_format = "pickle" local_dataset.data_file = "ignored" # not used for local datasets # Create the task @@ -710,9 +749,7 @@ def create_local_openml_task( return task @staticmethod - def _validate_splits( - *, splits: dict[int, dict[int, tuple[list, list]]], n_samples: int - ) -> None: + def _validate_splits(*, splits: dict[int, dict[int, tuple[list, list]]], n_samples: int) -> None: """Validate the splits passed by the user.""" if not isinstance(splits, dict): raise ValueError("Splits must be a dictionary.") @@ -723,28 +760,16 @@ def _validate_splits( raise ValueError(f"Splits for repeat {repeat_id} must be a dictionary.") test_indices_per_repeat = set() for split_id, (train_indices, test_indices) in split_dict.items(): - if not isinstance(train_indices, list) or not isinstance( - test_indices, list - ): + if not isinstance(train_indices, list) or not isinstance(test_indices, list): raise ValueError(f"Indices for split {split_id} must be lists.") - if not all( - isinstance(idx, int) for idx in train_indices + test_indices - ): - raise ValueError( - f"All indices in split {split_id} must be integers." - ) + if not all(isinstance(idx, int) for idx in train_indices + test_indices): + raise ValueError(f"All indices in split {split_id} must be integers.") if len(train_indices) == 0 or len(test_indices) == 0: - raise ValueError( - f"Train and test indices in split {split_id} must not be empty." - ) + raise ValueError(f"Train and test indices in split {split_id} must not be empty.") if set(train_indices) & set(test_indices): - raise ValueError( - f"Train and test indices in split {split_id} must not overlap." - ) + raise ValueError(f"Train and test indices in split {split_id} must not overlap.") if any(np.array(train_indices + test_indices) < 0): - raise ValueError( - f"Indices in split {split_id} must be non-negative." 
- ) + raise ValueError(f"Indices in split {split_id} must be non-negative.") if any(np.array(train_indices + test_indices) >= n_samples): raise ValueError( f"Indices in split {split_id} must not exceed the dataset size (0 to {n_samples - 1})." @@ -777,9 +802,7 @@ def save_local_openml_task(self, task: OpenMLSupervisedTask) -> None: def load_local_openml_task(self) -> TabArenaOpenMLSupervisedTask: """Load a local OpenML task from disk.""" if not self.openml_task_path.exists(): - raise FileNotFoundError( - f"Cached task file {self.openml_task_path} does not exist!" - ) + raise FileNotFoundError(f"Cached task file {self.openml_task_path} does not exist!") with self.openml_task_path.open("rb") as f: task: OpenMLSupervisedTask = pickle.load(f) @@ -830,14 +853,22 @@ def openml_create_datasets_without_arff_dump( unsupported_cols = data.select_dtypes(include=["datetime64", "timedelta64"]).columns # select_dtypes doesn't support "period" or "interval" as strings, so detect manually unsupported_cols = unsupported_cols.append( - pd.Index( - col for col in data.columns - if isinstance(data[col].dtype, (pd.PeriodDtype, pd.IntervalDtype)) - ) + pd.Index(col for col in data.columns if isinstance(data[col].dtype, (pd.PeriodDtype, pd.IntervalDtype))) ) - if len(unsupported_cols) > 0: + # Cast categories of categorical columns to string so that + # attributes_arff_from_df can handle them (e.g. integer categories). 
+ cat_cols_to_fix = [ + col + for col in data.select_dtypes(include=["category"]).columns + if not pd.api.types.is_string_dtype(data[col].cat.categories) + ] + + if len(unsupported_cols) > 0 or len(cat_cols_to_fix) > 0: data = data.copy() + if len(unsupported_cols) > 0: data[unsupported_cols] = data[unsupported_cols].astype(str) + for col in cat_cols_to_fix: + data[col] = data[col].cat.rename_categories(str) # infer the type of data for each column of the DataFrame attributes_ = attributes_arff_from_df(data) @@ -846,9 +877,7 @@ def openml_create_datasets_without_arff_dump( _validated_data_attributes(ignore_attributes, attributes_, "ignore_attribute") default_target_attributes = _expand_parameter(default_target_attribute) - _validated_data_attributes( - default_target_attributes, attributes_, "default_target_attribute" - ) + _validated_data_attributes(default_target_attributes, attributes_, "default_target_attribute") return OpenMLDataset( name=name, diff --git a/tabarena/tabarena/nips2025_utils/artifacts/_tabarena_method_metadata.py b/tabarena/tabarena/nips2025_utils/artifacts/_tabarena_method_metadata.py index b5feac3c3..851ae205a 100644 --- a/tabarena/tabarena/nips2025_utils/artifacts/_tabarena_method_metadata.py +++ b/tabarena/tabarena/nips2025_utils/artifacts/_tabarena_method_metadata.py @@ -48,6 +48,13 @@ tabstar_metadata, ) +from tabarena.nips2025_utils.artifacts._tabarena_method_metadata_2026_01_23_tabprep import ( + tabprep_gbm_metadata, + tabprep_lr_metadata, + tabprep_realtabpfnv250_metadata, + tabprep_tabm_metadata, +) + methods_2025_09_03: list[MethodMetadata] = [ ag_140_metadata, @@ -90,6 +97,13 @@ # prep_gbm_v6_metadata, ] +methods_tabprep = [ + tabprep_gbm_metadata, + tabprep_lr_metadata, + tabprep_realtabpfnv250_metadata, # only first 3 splits + tabprep_tabm_metadata, # only first 3 splits +] + replaced_methods = [ "ExplainableBM", "RealMLP_GPU", diff --git a/tabarena/tabarena/nips2025_utils/artifacts/_tabarena_method_metadata_2026_01_23_tabprep.py 
b/tabarena/tabarena/nips2025_utils/artifacts/_tabarena_method_metadata_2026_01_23_tabprep.py new file mode 100644 index 000000000..00d192af2 --- /dev/null +++ b/tabarena/tabarena/nips2025_utils/artifacts/_tabarena_method_metadata_2026_01_23_tabprep.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +from tabarena.nips2025_utils.artifacts.method_metadata import MethodMetadata + + +tabprep_gbm_metadata = MethodMetadata( + method="PrepLightGBM", + artifact_name="tabarena-2026-01-23", + display_name="PrepLightGBM", + method_type="config", + compute="cpu", + date="2026-01-23", + ag_key="PREP_GBM", + model_key="PREP_GBM", + # config_default="PrepLightGBM_c1_BAG_L1", # FIXME + config_default="prep_LightGBM_icml_v3_c1_BAG_L1", + name_suffix=None, + has_raw=True, + has_processed=True, + has_results=True, + upload_as_public=True, + can_hpo=True, + is_bag=True, + s3_bucket="tabarena", + s3_prefix="cache", + verified=True, +) + +tabprep_lr_metadata = MethodMetadata( + method="PrepLinearModel", + artifact_name="tabarena-2026-01-23", + display_name="PrepLinear", + method_type="config", + compute="cpu", + date="2026-01-23", + ag_key="PREP_LR", + model_key="PREP_LR", + # config_default="PrepLinearModel_c1_BAG_L1", # FIXME + config_default="prep_LinearModel_icml_v3_c1_BAG_L1", + name_suffix=None, + has_raw=True, + has_processed=True, + has_results=True, + upload_as_public=True, + can_hpo=True, + is_bag=True, + s3_bucket="tabarena", + s3_prefix="cache", + verified=True, +) + + +tabprep_tabm_metadata = MethodMetadata( + method="PrepTabM", + artifact_name="tabarena-2026-01-23", + display_name="PrepTabM", + method_type="config", + compute="gpu", + date="2026-01-23", + ag_key="PREP_TABM", + model_key="PREP_TABM", + config_default="prep_TabM_c1_BAG_L1", # FIXME + name_suffix=None, + has_raw=True, + has_processed=True, + has_results=True, + upload_as_public=True, + can_hpo=True, + is_bag=True, + s3_bucket="tabarena", + s3_prefix="cache", + verified=True, +) + + 
+tabprep_realtabpfnv250_metadata = MethodMetadata( + method="PrepRealTabPFN-v2.5", + artifact_name="tabarena-2026-01-23", + display_name="PrepRealTabPFN-2.5", + method_type="config", + compute="gpu", + date="2026-01-23", + ag_key="PREP_REALTABPFN-V2.5", + model_key="PREP_REALTABPFN-V2.5", + config_default="prep_RealTabPFN-v2.5_c1_BAG_L1", # FIXME + name_suffix=None, + has_raw=True, + has_processed=True, + has_results=True, + upload_as_public=True, + can_hpo=True, + is_bag=False, + s3_bucket="tabarena", + s3_prefix="cache", + verified=True, +) diff --git a/tabarena/tabarena/nips2025_utils/compare.py b/tabarena/tabarena/nips2025_utils/compare.py index f53ab2b30..3576e2d52 100644 --- a/tabarena/tabarena/nips2025_utils/compare.py +++ b/tabarena/tabarena/nips2025_utils/compare.py @@ -16,6 +16,7 @@ def compare_on_tabarena( *, only_valid_tasks: bool | str | list[str] = False, subset: str | list[str] | None = None, + datasets: list[str] | None = None, folds: list[int] | None = None, tabarena_context: TabArenaContext | None = None, tabarena_context_kwargs: dict | None = None, @@ -62,12 +63,12 @@ def compare_on_tabarena( df_filter=new_results, ) - if subset is not None or folds is not None: + if subset is not None or folds is not None or datasets is not None: if subset is None: subset = [] if isinstance(subset, str): subset = [subset] - df_results = subset_tasks(df_results=df_results, subset=subset, folds=folds) + df_results = subset_tasks(df_results=df_results, subset=subset, folds=folds, datasets=datasets) return compare( df_results=df_results, @@ -203,7 +204,12 @@ def prepare_data( return df_results -def subset_tasks(df_results: pd.DataFrame, subset: list[str], folds: list[int] = None) -> pd.DataFrame: +def subset_tasks( + df_results: pd.DataFrame, + subset: list[str], + folds: list[int] = None, + datasets: list[str] = None, +) -> pd.DataFrame: from tabarena.nips2025_utils.fetch_metadata import load_task_metadata df_results = df_results.copy(deep=True) @@ -270,6 +276,8 @@ 
def subset_tasks(df_results: pd.DataFrame, subset: list[str], folds: list[int] = else: raise ValueError(f"Invalid subset {subset} name!") + if datasets is not None: + df_results = df_results[df_results["dataset"].isin(datasets)] if folds is not None: df_results = df_results[df_results["fold"].isin(folds)] df_results = df_results.reset_index(drop=True) diff --git a/tabarena/tabarena/nips2025_utils/end_to_end.py b/tabarena/tabarena/nips2025_utils/end_to_end.py index 49efaa9ce..d4c2f56d3 100644 --- a/tabarena/tabarena/nips2025_utils/end_to_end.py +++ b/tabarena/tabarena/nips2025_utils/end_to_end.py @@ -14,7 +14,6 @@ EndToEndResultsSingle, EndToEndSingle, ) -from tabarena.nips2025_utils.fetch_metadata import load_task_metadata from tabarena.nips2025_utils.method_processor import ( generate_task_metadata, load_all_artifacts, @@ -136,7 +135,7 @@ def from_path_raw( unique_types = list(unique_types_dict.keys()) if task_metadata is None: - task_metadata = generate_task_metadata(tids=list(unique_tids)) + task_metadata = EndToEndSingle.fetch_task_metadata(tids=list(unique_tids), verbose=verbose) log( f"Constructing EndToEnd from raw results... Found {len(unique_types)} unique methods: {unique_types}" @@ -185,6 +184,7 @@ def from_path_raw_to_results( model_key: str | None = None, artifact_name: str | None = None, num_cpus: int | None = None, + verbose: bool = True, ) -> EndToEndResults: """ Create and cache end-to-end results for all methods in the given directory. 
@@ -245,10 +245,8 @@ def from_path_raw_to_results( all_file_paths_method[did_sid].append(file_path) if task_metadata is None: - print("Get task metadata...") - task_metadata = load_task_metadata() - # Below is too slow to use by default, TODO: get logic for any task that is fast - # task_metadata = generate_task_metadata(tids=list({r.split("/")[0] for r in all_file_paths_method})) + tids = list({r.split("/")[0] for r in all_file_paths_method}) + task_metadata = EndToEndSingle.fetch_task_metadata(tids=tids, verbose=verbose) results: list[EndToEndResults] = ray_map_list( list_to_map=list(all_file_paths_method.values()), diff --git a/tabarena/tabarena/nips2025_utils/end_to_end_single.py b/tabarena/tabarena/nips2025_utils/end_to_end_single.py index e6f11b5af..114b30c3a 100644 --- a/tabarena/tabarena/nips2025_utils/end_to_end_single.py +++ b/tabarena/tabarena/nips2025_utils/end_to_end_single.py @@ -251,9 +251,8 @@ def from_raw( method_metadata.cache_raw(results_lst=results_lst) if task_metadata is None: - log(f"\tFetching task_metadata from OpenML...") tids = list({r.task_metadata["tid"] for r in results_lst}) - task_metadata = generate_task_metadata(tids=tids) + task_metadata = cls.fetch_task_metadata(tids=tids, verbose=verbose) log(f"\tConverting raw results into an EvaluationRepository...") # processed @@ -315,6 +314,7 @@ def from_path_raw( model_key: str | None = None, method: str | None = None, artifact_name: str | None = None, + name_prefix_raw: str | None = None, backend: Literal["ray", "native"] = "ray", verbose: bool = True, ) -> Self: @@ -343,7 +343,7 @@ def from_path_raw( """ engine = "ray" if backend == "ray" else "sequential" - results_lst: list[BaselineResult] = load_raw(path_raw=path_raw, engine=engine) + results_lst: list[BaselineResult] = load_raw(path_raw=path_raw, engine=engine, name_pattern=name_prefix_raw) return cls.from_raw( results_lst=results_lst, method_metadata=method_metadata, @@ -409,6 +409,19 @@ def to_results(self) -> 
EndToEndResultsSingle: hpo_results=self.hpo_results, ) + @staticmethod + def fetch_task_metadata(tids: list[int], verbose: bool = True): + log = print if verbose else (lambda *a, **k: None) + task_metadata = load_task_metadata() + tids_cached = set(task_metadata["tid"].unique()) + + tids_missing = [tid for tid in tids if tid not in tids_cached] + if tids_missing: + log(f"Note: Missing {len(tids_missing)} tasks in the cached task_metadata...") + log(f"\tFetching task_metadata from OpenML... (this may take ~1 minute)") + task_metadata = generate_task_metadata(tids=tids) + return task_metadata + @staticmethod def from_path_raw_to_results( path_raw: str | Path | list[str | Path], @@ -423,6 +436,7 @@ def from_path_raw_to_results( artifact_name: str | None = None, num_cpus: int | None = None, name_prefix_raw: str | None = None, + verbose: bool = True, ) -> EndToEndResultsSingle: """ Create and cache end-to-end results for the method in the given directory. @@ -486,10 +500,8 @@ def from_path_raw_to_results( all_file_paths_method[did_sid].append(file_path) if task_metadata is None: - print("Get task metadata...") - task_metadata = load_task_metadata() - # Below is too slow to use by default, TODO: get logic for any task that is fast - # task_metadata = generate_task_metadata(tids=list({r.split("/")[0] for r in all_file_paths_method})) + tids = list({int(r.split("/")[0]) for r in all_file_paths_method}) + task_metadata = EndToEndSingle.fetch_task_metadata(tids=tids, verbose=verbose) import ray if not ray.is_initialized(): diff --git a/tabarena/tabarena/nips2025_utils/method_processor.py b/tabarena/tabarena/nips2025_utils/method_processor.py index b60d05ec5..477a68e69 100644 --- a/tabarena/tabarena/nips2025_utils/method_processor.py +++ b/tabarena/tabarena/nips2025_utils/method_processor.py @@ -102,6 +102,7 @@ def get_info_from_result(result: BaselineResult) -> dict: def load_raw( path_raw: str | Path | list[str | Path] = None, + name_pattern: str | None = None, engine: str 
= "ray", as_holdout: bool = False, ) -> list[BaselineResult]: @@ -120,7 +121,7 @@ def load_raw( """ suffix = "results.pkl" - file_paths_method = fetch_all_pickles(dir_path=path_raw, suffix=suffix) + file_paths_method = fetch_all_pickles(dir_path=path_raw, suffix=suffix, name_pattern=name_pattern) if len(file_paths_method) == 0: # Look at every file to provide debugging info all_files = [p for p in Path(path_raw).rglob("*") if p.is_file()] diff --git a/tabflow_slurm/benchmarking_setup/data_foundry_integration/data_foundry_registry.py b/tabflow_slurm/benchmarking_setup/data_foundry_integration/data_foundry_registry.py deleted file mode 100644 index 59afa735d..000000000 --- a/tabflow_slurm/benchmarking_setup/data_foundry_integration/data_foundry_registry.py +++ /dev/null @@ -1,6 +0,0 @@ -# TODO -"""Categorizes data foundry artifacts that we aim to use.""" -from __future__ import annotations - - - diff --git a/tabflow_slurm/benchmarking_setup/data_foundry_integration/data_foundry_task_creator.py b/tabflow_slurm/benchmarking_setup/data_foundry_integration/data_foundry_task_creator.py index 7eb681c8e..2fc6bb4ae 100644 --- a/tabflow_slurm/benchmarking_setup/data_foundry_integration/data_foundry_task_creator.py +++ b/tabflow_slurm/benchmarking_setup/data_foundry_integration/data_foundry_task_creator.py @@ -5,7 +5,6 @@ import pandas as pd from data_foundry.curation_container import CuratedContainer -from data_foundry.schema import ProblemTypeClassification from loguru import logger from tabarena.benchmark.task import UserTask from tqdm import tqdm @@ -127,10 +126,41 @@ def convert_data_foundry_task_to_user_task( task_container = CuratedContainer.load(path_to_local_task) # Resolve task type + y: pd.Series = task_container.dataset[task_container.task_metadata.target_column_name] if task_container.task_metadata.problem_type == "regression": problem_type = "regression" - elif task_container.task_metadata.problem_type in ProblemTypeClassification: + # Assert y is pd.numeric + if 
not pd.api.types.is_numeric_dtype(y): + raise ValueError( + f"Target column {task_container.task_metadata.target_column_name} is not numeric for " + f"regression problem. ({task_container.dataset_metadata.unique_name})" + ) + elif task_container.task_metadata.problem_type == "binary_classification": + problem_type = "classification" + # Assert y is pd.categorical with 2 classes + if not isinstance(y.dtype, pd.CategoricalDtype): + raise ValueError( + f"Target column {task_container.task_metadata.target_column_name} is not categorical " + f"for classification problem. ({task_container.dataset_metadata.unique_name})" + ) + if y.nunique() != 2: + raise ValueError( + f"Target column {task_container.task_metadata.target_column_name} has {y.nunique()} classes, " + f"but expected 2 for binary classification problem. ({task_container.dataset_metadata.unique_name})" + ) + elif task_container.task_metadata.problem_type == "multiclass_classification": problem_type = "classification" + if not isinstance(y.dtype, pd.CategoricalDtype): + raise ValueError( + f"Target column {task_container.task_metadata.target_column_name} is not categorical for " + f"classification problem. ({task_container.dataset_metadata.unique_name})" + ) + if y.nunique() < 3: + raise ValueError( + f"Target column {task_container.task_metadata.target_column_name} has {y.nunique()} classes, " + f"but expected at least 3 for multiclass classification " + f"problem. ({task_container.dataset_metadata.unique_name})" + ) else: raise ValueError(f"Unknown problem type {task_container.task_metadata.problem_type}") @@ -141,7 +171,7 @@ def convert_data_foundry_task_to_user_task( fallback_metric = allowed_eval_metrics[0] if eval_metric not in allowed_eval_metrics: logger.info( - f"Objective metric {eval_metric} not in allowed for problem type {problem_type}. " + f"\nObjective metric {eval_metric} not allowed for problem type {problem_type}. " f"Falling back to {fallback_metric}." 
) eval_metric = fallback_metric @@ -237,7 +267,7 @@ def get_metadata_for_benchmark_suite(benchmark_suite_name: str, data_foundry_cac path_to_metadata = data_foundry_cache / f"{benchmark_suite_name}_tasks_metadata.csv" if not path_to_metadata.exists(): raise FileNotFoundError( - f"Metadata file {path_to_metadata} does not exist. " "Please run download_data_foundry_datasets first." + f"Metadata file {path_to_metadata} does not exist. Please run download_data_foundry_datasets first." ) return path_to_metadata diff --git a/tabflow_slurm/benchmarking_setup/text_caching/__init__.py b/tabflow_slurm/benchmarking_setup/text_caching/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tabflow_slurm/benchmarking_setup/text_caching/run_text_cache.py b/tabflow_slurm/benchmarking_setup/text_caching/run_text_cache.py new file mode 100644 index 000000000..d2088c84f --- /dev/null +++ b/tabflow_slurm/benchmarking_setup/text_caching/run_text_cache.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from pathlib import Path + + +def pre_generate_text_cache(task_id_str: str, *, ignore_cache: bool = False) -> Path: + """Generate the cache as it would be generated on-the-fly during preprocessing, + and save it to a parquet file for later loading. 
+ """ + from tabarena.benchmark.preprocessing.model_agnostic_default_preprocessing import TabArenaModelAgnosticPreprocessing + from tabarena.benchmark.preprocessing.text_feature_generators import SemanticTextFeatureGenerator + from tabarena.benchmark.task.openml import OpenMLTaskWrapper + from tabarena.benchmark.task.user_task import UserTask + + task_id_or_object = UserTask.from_task_id_str(task_id_str) + cache_path = SemanticTextFeatureGenerator.get_text_cache_dir(task_id_str=str(task_id_or_object.task_id)) + if (not ignore_cache) and cache_path.exists(): + print(f"Cache already exists for {task_id_str} at {cache_path}, skipping generation.") + return cache_path + + task = OpenMLTaskWrapper( + task=task_id_or_object.load_local_openml_task(), + ) + print(f"Loaded {task_id_str}, with {len(task.X)} rows and {len(task.X.columns)} columns.") + preprocessing = TabArenaModelAgnosticPreprocessing( + enable_sematic_text_features=True, + enable_raw_text_features=False, + enable_text_special_features=False, + enable_statistical_text_features=False, + enable_text_ngram_features=False, + enable_datetime_features=False, + verbosity=4, + ) + preprocessing.fit_transform(X=task.X) + + cache_path = SemanticTextFeatureGenerator.get_text_cache_dir(task_id_str=str(task.task_id)) + SemanticTextFeatureGenerator.save_embedding_cache( + cache=SemanticTextFeatureGenerator._embedding_look_up, path=cache_path + ) + SemanticTextFeatureGenerator._embedding_look_up.clear() + print(f"Cache generated and saved to: {cache_path}") + return cache_path + + +if __name__ == "__main__": + import argparse + import logging + + logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(name)s %(levelname)s: %(message)s") + + # TODO: add support for setting the OpenML cache dir here as well. 
+ parser = argparse.ArgumentParser() + # Require tasks settings + parser.add_argument( + "--task_id_str", + type=str, + required=True, + help="User Task ID for a dataset with text.", + ) + args = parser.parse_args() + + pre_generate_text_cache(args.task_id_str) diff --git a/tabflow_slurm/run_tabarena_experiment.py b/tabflow_slurm/run_tabarena_experiment.py index f93a6aaec..994b4d943 100644 --- a/tabflow_slurm/run_tabarena_experiment.py +++ b/tabflow_slurm/run_tabarena_experiment.py @@ -48,6 +48,9 @@ def setup_slurm_job( import tempfile import ray + import os + + os.environ["RAY_DISABLE_RETRIES"] = "1" ray_dir = tempfile.mkdtemp() + "/ray" @@ -65,18 +68,22 @@ def setup_slurm_job( # Likely slower but runs at least. _plasma_directory = ray_dir - ray.init( - address="local", - _memory=ray_mem_in_b, - object_store_memory=int(ray_mem_in_b * 0.3), - _temp_dir=ray_dir, - include_dashboard=False, - logging_level=logging.INFO, - log_to_driver=True, - num_gpus=num_gpus, - num_cpus=num_cpus, - _plasma_directory=_plasma_directory, - ) + import warnings + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + ray.init( + address="local", + _memory=ray_mem_in_b, + object_store_memory=int(ray_mem_in_b * 0.3), + _temp_dir=ray_dir, + include_dashboard=False, + logging_level=logging.INFO, + log_to_driver=True, + num_gpus=num_gpus, + num_cpus=num_cpus, + _plasma_directory=_plasma_directory, + ) return ray_dir @@ -389,9 +396,17 @@ def _parse_int_or_none(s): parser.add_argument( "--num_gpus", type=int, - help="Number of GPUs to use for the experiment.", + help="Number of GPUs to use for the experiment (SLURM node allocation and Ray).", default=0, ) + parser.add_argument( + "--num_gpus_model", + type=_parse_int_or_none, + help="Number of GPUs passed to AutoGluon for model fitting. " + "If None, defaults to --num_gpus. Set to 0 to reserve the GPU " + "for preprocessing only (e.g. 
text embedding) while fitting models on CPU.", + default=None, + ) parser.add_argument( "--memory_limit", type=_parse_int_or_none, @@ -430,6 +445,9 @@ def _parse_int_or_none(s): f"Memory limit not provided, using detected memory size: {memory_limit} GB" ) + num_gpus_model = args.num_gpus_model if args.num_gpus_model is not None else args.num_gpus + print(f"GPUs for node/Ray: {args.num_gpus}, GPUs for model fitting: {num_gpus_model}") + ray_temp_dir = setup_slurm_job( openml_cache_dir=args.openml_cache_dir, setup_ray_for_slurm_shared_resources_environment=args.setup_ray_for_slurm_shared_resources_environment, @@ -447,7 +465,7 @@ def _parse_int_or_none(s): output_dir=args.output_dir, ignore_cache=args.ignore_cache, num_cpus=num_cpus, - num_gpus=args.num_gpus, + num_gpus=num_gpus_model, memory_limit=memory_limit, sequential_local_fold_fitting=args.sequential_local_fold_fitting, dynamic_tabarena_validation_protocol=args.dynamic_tabarena_validation_protocol, diff --git a/tabflow_slurm/setup_slurm_base_v2.py b/tabflow_slurm/setup_slurm_base_v2.py index 77d04cc89..4f44406ff 100644 --- a/tabflow_slurm/setup_slurm_base_v2.py +++ b/tabflow_slurm/setup_slurm_base_v2.py @@ -2,6 +2,7 @@ import json import re +import warnings from copy import deepcopy from dataclasses import dataclass, field from pathlib import Path @@ -70,10 +71,7 @@ def run_script_path(self) -> str: """Python script to run the benchmark. This should point to the script that runs the benchmark for TabArena. """ - return self.base_path + ( - f"code/{self.tabarena_repo_name}/tabarena" - f"/tabflow_slurm/run_tabarena_experiment.py" - ) + return self.base_path + (f"code/{self.tabarena_repo_name}/tabarena/tabflow_slurm/run_tabarena_experiment.py") @property def configs_base_path(self) -> str: @@ -82,9 +80,7 @@ def configs_base_path(self) -> str: File path is f"{self.base_path}{self.configs_path_from_base_path} {self._safe_benchmark_name}.yaml". 
""" - return ( - f"code/{self.tabarena_repo_name}/tabarena/tabflow_slurm/benchmark_configs_" - ) + return f"code/{self.tabarena_repo_name}/tabarena/tabflow_slurm/benchmark_configs_" def get_slurm_job_json_path(self, safe_benchmark_name: str) -> str: """JSON file with the job data to run used by SLURM. @@ -92,10 +88,7 @@ def get_slurm_job_json_path(self, safe_benchmark_name: str) -> str: """ # TODO: change UX for config and slurm paths. path_to_config_file = str(Path(self.configs_base_path).parent) + "/" - return ( - f"{self.base_path}{path_to_config_file}" - f"slurm_run_data_{safe_benchmark_name}.json" - ) + return f"{self.base_path}{path_to_config_file}slurm_run_data_{safe_benchmark_name}.json" def get_configs_path(self, safe_benchmark_name: str) -> str: """YAML file with the configs to run.""" @@ -111,10 +104,7 @@ def get_slurm_log_output_path(self, benchmark_name: str) -> str: def get_slurm_script_path(self, script_name: str) -> str: """Path to the SLURM script to run.""" - return ( - self.base_path - + f"code/{self.tabarena_repo_name}/tabarena/tabflow_slurm/{script_name}" - ) + return self.base_path + f"code/{self.tabarena_repo_name}/tabarena/tabflow_slurm/{script_name}" @dataclass @@ -187,6 +177,23 @@ class BenchmarkSetup2026: split_indices_to_run: list[str] | Literal["lite"] | None = None """Split indices to run in the benchmark. Adjust as needed to run only specific splits. If None, we run all splits. If "lite", we run only the first split.""" + required_dtypes_to_run: list[str] | None = None + """Adjust as needed to run only datasets with at least one column of data types. + Options: "numeric", "categorical", "text", "datetime". + If None, we do not require any data types. + """ + forbidden_dtypes_to_run: list[str] | None = None + """Adjust as needed to run only datasets without any columns of data types. + Options: "numeric", "categorical", "text", "datetime". + If None, we do not forbid any data types. 
+ """ + n_train_samples_to_run: tuple[int | None, int | None] | None = None + """Tuple of lower and upper limit for the number of training samples of datasets run in the benchmark. + Adjust as needed to run only datasets with a certain number of training samples. + If None, we run all datasets. + Lower limit is inclusive, upper limit is exclusive. For example, (0, 1000) runs only datasets with less + than 1000 training samples. If a tuple value is None, there is no limit in that direction. + """ path_setup: PathSetup = field(default_factory=PathSetup) """Contains all path related to the benchmark.""" @@ -209,7 +216,12 @@ class BenchmarkSetup2026: """Number of CPUs to use for the job. If None, use all available CPUs.""" num_gpus: int = 0 - """Number of GPUs to use for the jobs.""" + """Number of GPUs to use for the jobs (SLURM allocation and Ray).""" + num_gpus_model: int | None = None + """Number of GPUs passed to a model for fitting. + If None (default), uses the same value as ``num_gpus``. + Set to 0 to reserve the GPU for preprocessing (e.g. sentence-transformer + encoding) while fitting models on CPU only.""" memory_limit: int | None = 32 """Memory/RAM limit for the jobs in GB. If None, use all available memory.""" @@ -248,9 +260,7 @@ class BenchmarkSetup2026: This can be disabled by setting this to False. Warning: the model then needs to be able to handle this! """ - preprocessing_pipelines: list[str] = field( - default_factory=lambda: ["tabarena_default"] - ) + preprocessing_pipelines: list[str] = field(default_factory=lambda: ["tabarena_default"]) """EXPERIMENTAL! Preprocessing pipelines to add to the configurations we want to run. @@ -297,6 +307,11 @@ class BenchmarkSetup2026: } } """ + max_predict_batch_size: int | None = 50_000 + """Maximal batch size for the predict function of the models. + This is used at validation and test predict time. Thus, it trades off speed for memory usage. + If None, no limit is applied. 
+ """ # Misc Settings # ------------- @@ -416,9 +431,7 @@ def _get_slurm_base_command( # noqa: PLR0913 partition = "--partition=" + partition slurm_logs = f"--output={slurm_log_output}/%A/slurm-%A_%a.out" - time_in_h = ( - time_limit_per_config // 3600 * configs_per_job + time_limit_overhead - ) + time_in_h = time_limit_per_config // 3600 * configs_per_job + time_limit_overhead time_in_h = f"--time={time_in_h}:00:00" # Handle GPU (same for exclusive and non-exclusive) @@ -453,10 +466,7 @@ def _get_slurm_base_command( # noqa: PLR0913 @property def slurm_base_command(self): """SLURM command to run the benchmark.""" - p_bm = self._parallel_safe_benchmark_name - slurm_script_path = self.path_setup.get_slurm_script_path( - self.slurm_setup.script_name - ) + slurm_script_path = self.path_setup.get_slurm_script_path(self.slurm_setup.script_name) return self._get_slurm_base_command( num_cpus=self.num_cpus, @@ -465,7 +475,7 @@ def slurm_base_command(self): time_limit_per_config=self.time_limit_per_config, configs_per_job=self._max_configs_per_job, time_limit_overhead=self.slurm_setup.time_limit_overhead, - slurm_log_output=self.path_setup.get_slurm_log_output_path(p_bm), + slurm_log_output=self.path_setup.get_slurm_log_output_path(self.benchmark_name), slurm_script_path=slurm_script_path, slurm_extra_gres=self.slurm_setup.extra_gres, slurm_exclusive_node=self.slurm_setup.exclusive_node, @@ -504,17 +514,15 @@ def _load_task_metadata(self) -> list[TabArenaTaskMetadata]: for repeat_i in range(n_repeats): for fold_i in range(n_folds): - split_index = SplitMetadata.get_split_index(repeat_i=repeat_i, fold_i=fold_i) splits_metadata = { - split_index: - SplitMetadata( + split_index: SplitMetadata( repeat=repeat_i, fold=fold_i, - num_instances_train=num_instances * 2/3, - num_instances_test=num_instances * 1/3, - num_instance_groups_train=num_instances * 2/3, - num_instance_groups_test=num_instances * 1/3, + num_instances_train=num_instances * 2 / 3, + 
num_instances_test=num_instances * 1 / 3, + num_instance_groups_train=num_instances * 2 / 3, + num_instance_groups_test=num_instances * 1 / 3, num_classes_train=num_classes, num_classes_test=num_classes, num_features_train=num_features, @@ -553,53 +561,65 @@ def _load_task_metadata(self) -> list[TabArenaTaskMetadata]: task_metadata = pd.read_csv(task_metadata, index_col=False) if isinstance(task_metadata, pd.DataFrame): # Parse task_metadat - task_metadata = [ - TabArenaTaskMetadata.from_row(row) - for _, row in task_metadata.iterrows() - ] + task_metadata = [TabArenaTaskMetadata.from_row(row) for _, row in task_metadata.iterrows()] assert all(isinstance(x, TabArenaTaskMetadata) for x in task_metadata) n_rolled_up_tasks = len(task_metadata) # Unify format to be unrolled - task_metadata = [ - single_ttm for ttm in task_metadata for single_ttm in ttm.unroll_splits() - ] + task_metadata = [single_ttm for ttm in task_metadata for single_ttm in ttm.unroll_splits()] n_unrolled_tasks = len(task_metadata) # -- Perform general filters/slices - task_metadata = [ - ttm - for ttm in task_metadata - if ttm.problem_type in self.problem_types_to_run - ] + task_metadata = [ttm for ttm in task_metadata if ttm.problem_type in self.problem_types_to_run] n_problem_types_filtered_tasks = len(task_metadata) if self.split_indices_to_run is not None: if self.split_indices_to_run == "lite": - split_indices_to_run = [ - SplitMetadata.get_split_index(repeat_i=0, fold_i=0) - ] + split_indices_to_run = [SplitMetadata.get_split_index(repeat_i=0, fold_i=0)] else: split_indices_to_run = self.split_indices_to_run # Assert split indices are valid split_index_pattern = re.compile(r"^r\d+f\d+$") for split_index in split_indices_to_run: - assert ( - split_index_pattern.match(split_index) - ), f"Invalid SplitIndex format: {split_index!r}, expected 'r{{int}}f{{int}}'" + assert split_index_pattern.match(split_index), ( + f"Invalid SplitIndex format: {split_index!r}, expected 'r{{int}}f{{int}}'" + ) + + 
task_metadata = [ttm for ttm in task_metadata if ttm.split_index in split_indices_to_run] + n_splits_filtered_tasks = len(task_metadata) + # Filter based on dtypes if specified + if (self.forbidden_dtypes_to_run is not None) or (self.required_dtypes_to_run is not None): task_metadata = [ - ttm for ttm in task_metadata if ttm.split_index in split_indices_to_run + ttm + for ttm in task_metadata + if ttm.has_supported_dtypes( + required_dtypes=self.required_dtypes_to_run, + forbidden_dtypes=self.forbidden_dtypes_to_run, + ) ] - n_splits_filtered_tasks = len(task_metadata) + n_dtypes_filtered_tasks = len(task_metadata) + + # Filter based on training samples if specified + if self.n_train_samples_to_run is not None: + lb, ub = self.n_train_samples_to_run + lb = lb if lb is not None else 0 + ub = ub if ub is not None else float("inf") + task_metadata = [ + ttm + for ttm in task_metadata + if ( + (ttm.splits_metadata[ttm.split_index].num_instances_train < ub) + and (ttm.splits_metadata[ttm.split_index].num_instances_train >= lb) + ) + ] + n_sizes_filtered_tasks = len(task_metadata) # -- Sanity checks for ttm in task_metadata: if ttm.task_id_str is None: - raise ValueError( - f"Task metadata for task {ttm.tabarena_task_name} does not have a task_id_str!" - ) + raise ValueError(f"Task metadata for task {ttm.tabarena_task_name} does not have a task_id_str!") print( f"Found {len(task_metadata)} tasks from metadata." @@ -607,6 +627,8 @@ def _load_task_metadata(self) -> list[TabArenaTaskMetadata]: f"\n\t(1) {n_rolled_up_tasks} datasets -> {n_unrolled_tasks} Tasks." f"\n\t(2) Filter to problem types: {n_problem_types_filtered_tasks}" f"\n\t(3) Filter to splits: {n_splits_filtered_tasks}." + f"\n\t(4) Filter to dtypes: {n_dtypes_filtered_tasks}." + f"\n\t(5) Filter to dataset size: {n_sizes_filtered_tasks}." 
) return task_metadata @@ -616,12 +638,8 @@ def get_jobs_to_run(self): # noqa: C901 """ if self.path_setup.openml_cache_path != "auto": Path(self.path_setup.openml_cache_path).mkdir(parents=True, exist_ok=True) - Path(self.path_setup.get_output_path(self.benchmark_name)).mkdir( - parents=True, exist_ok=True - ) - Path(self.path_setup.get_slurm_log_output_path(self.benchmark_name)).mkdir( - parents=True, exist_ok=True - ) + Path(self.path_setup.get_output_path(self.benchmark_name)).mkdir(parents=True, exist_ok=True) + Path(self.path_setup.get_slurm_log_output_path(self.benchmark_name)).mkdir(parents=True, exist_ok=True) task_metadata_list = self._load_task_metadata() configs = self.generate_configs_yaml() @@ -629,9 +647,7 @@ def get_jobs_to_run(self): # noqa: C901 def yield_all_jobs(): for ta_task_metadata in task_metadata_list: task_id = ta_task_metadata.task_id_str - split_md = ta_task_metadata.splits_metadata[ - ta_task_metadata.split_index - ] + split_md = ta_task_metadata.splits_metadata[ta_task_metadata.split_index] for config_index, config in list(enumerate(configs)): yield { @@ -650,7 +666,9 @@ def yield_all_jobs(): # Check cache and filter invalid jobs in parallel using Ray if ray.is_initialized: ray.shutdown() - ray.init(num_cpus=self.num_ray_cpus) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + ray.init(num_cpus=self.num_ray_cpus) output = ray_map_list( list_to_map=list(to_batch_list(jobs_to_check, 10_000)), func=should_run_job_batch, @@ -665,9 +683,7 @@ def yield_all_jobs(): track_progress=True, tqdm_kwargs={"desc": "Checking Cache and Filter Invalid Jobs"}, ) - output = [ - item for sublist in output for item in sublist - ] # Flatten the batched list + output = [item for sublist in output for item in sublist] # Flatten the batched list to_run_job_map = {} for run_job, job_data in zip(output, jobs_to_check, strict=True): if run_job: @@ -686,9 +702,7 @@ def yield_all_jobs(): max_config_batch = 1 for job_key, 
config_indices in to_run_job_map.items(): to_run_jobs += len(config_indices) - for config_batch in to_batch_list( - config_indices, self.slurm_setup.configs_per_job - ): + for config_batch in to_batch_list(config_indices, self.slurm_setup.configs_per_job): max_config_batch = max(max_config_batch, len(config_batch)) jobs.append( { @@ -703,9 +717,7 @@ def yield_all_jobs(): print(f"Jobs with batching: {len(jobs)}") return jobs - def _generate_autogluon_config( - self, *, model_name: str, agexp_kwargs: dict, pipeline_method_kwargs: dict - ) -> list: + def _generate_autogluon_config(self, *, model_name: str, agexp_kwargs: dict, pipeline_method_kwargs: dict) -> list: """Parse the AutoGluon config from the models.""" from tabarena.benchmark.experiment.experiment_constructor import ( AGExperiment, @@ -741,8 +753,7 @@ def _generate_model_configs( n_configs = self.n_random_configs elif not isinstance(n_configs, int): raise ValueError( - f"Invalid number of configurations for model {model_name}: {n_configs}. " - "Must be an integer or 'all'." + f"Invalid number of configurations for model {model_name}: {n_configs}. Must be an integer or 'all'." 
) config_generator = get_configs_generator_from_name(model_name) # TODO: add model agnostic time limit here @@ -768,22 +779,22 @@ def generate_configs_yaml(self) -> list[dict]: "init_kwargs": {"verbosity": self.verbosity}, "shuffle_features": self.shuffle_features, "fit_kwargs": dict(), + "model_hyperparameters": dict(), } if self.model_artifacts_base_path is not None: - method_kwargs["init_kwargs"]["default_base_path"] = ( - self.model_artifacts_base_path - ) + method_kwargs["init_kwargs"]["default_base_path"] = self.model_artifacts_base_path if not self.model_agnostic_preprocessing: method_kwargs["fit_kwargs"]["feature_generator"] = None if self.adapt_num_folds_to_n_classes: method_kwargs["fit_kwargs"]["adapt_num_bag_folds_to_n_classes"] = True + if self.max_predict_batch_size is not None: + method_kwargs["model_hyperparameters"]["ag.max_batch_size"] = self.max_predict_batch_size print( "Generating experiments for models...", f"\n\t`all` := number of configs: {self.n_random_configs}", f"\n\t{len(self.models)} models: {self.models}", - f"\n\t{len(self.preprocessing_pipelines)} preprocessing pipelines: " - f"{self.preprocessing_pipelines}", + f"\n\t{len(self.preprocessing_pipelines)} preprocessing pipelines: {self.preprocessing_pipelines}", f"\n\tMethod kwargs: {method_kwargs}", ) for preprocessing_name in self.preprocessing_pipelines: @@ -819,9 +830,7 @@ def generate_configs_yaml(self) -> list[dict]: ) # Verify no duplicate names - experiments_all = [ - exp for exp_family_lst in experiments_lst for exp in exp_family_lst - ] + experiments_all = [exp for exp_family_lst in experiments_lst for exp in exp_family_lst] experiment_names = set() for experiment in experiments_all: if experiment.name not in experiment_names: @@ -832,9 +841,7 @@ def generate_configs_yaml(self) -> list[dict]: f"All experiment names must be unique!", ) - configs_path = self.path_setup.get_configs_path( - self._parallel_safe_benchmark_name - ) + configs_path = 
self.path_setup.get_configs_path(self._parallel_safe_benchmark_name) YamlExperimentSerializer.to_yaml( experiments=experiments_all, path=configs_path, @@ -857,12 +864,11 @@ def get_jobs_dict(self): "python": self.path_setup.python_path, "run_script": self.path_setup.run_script_path, "openml_cache_dir": self.path_setup.openml_cache_path, - "configs_yaml_file": self.path_setup.get_configs_path( - self._parallel_safe_benchmark_name - ), + "configs_yaml_file": self.path_setup.get_configs_path(self._parallel_safe_benchmark_name), "output_dir": self.path_setup.get_output_path(self.benchmark_name), "num_cpus": self.num_cpus, "num_gpus": self.num_gpus, + "num_gpus_model": self.num_gpus_model, "memory_limit": memory_limit, "setup_ray_for_slurm_shared_resources_environment": self.slurm_setup.setup_ray_for_slurm_shared_resources_environment, "ignore_cache": self.ignore_cache, @@ -871,7 +877,7 @@ def get_jobs_dict(self): } return {"defaults": default_args, "jobs": jobs} - def setup_jobs(self, array_job_limit: int = 100) -> str | list[str]: + def setup_jobs(self, array_job_limit: int = 100) -> list[str]: """Setup the jobs to run by generating the SLURM job JSON file(s). If the number of jobs exceeds `slurm_setup.max_array_size`, the jobs @@ -881,17 +887,13 @@ def setup_jobs(self, array_job_limit: int = 100) -> str | list[str]: strings if multiple batches are needed. 
""" jobs_dict = self.get_jobs_dict() - base_json_path = self.path_setup.get_slurm_job_json_path( - self._parallel_safe_benchmark_name - ) + base_json_path = self.path_setup.get_slurm_job_json_path(self._parallel_safe_benchmark_name) all_jobs = jobs_dict["jobs"] n_jobs = len(all_jobs) if n_jobs == 0: print("No jobs to run.") Path(base_json_path).unlink(missing_ok=True) - Path( - self.path_setup.get_configs_path(self._parallel_safe_benchmark_name) - ).unlink(missing_ok=True) + Path(self.path_setup.get_configs_path(self._parallel_safe_benchmark_name)).unlink(missing_ok=True) return "N/A" max_array_size = self.slurm_setup.max_array_size @@ -904,27 +906,19 @@ def setup_jobs(self, array_job_limit: int = 100) -> str | list[str]: if len(job_batches) == 1: json_path = base_json_path else: - json_path = base_json_path.replace( - ".json", f"_batch{batch_idx}.json" - ) + json_path = base_json_path.replace(".json", f"_batch{batch_idx}.json") batch_dict = {"defaults": jobs_dict["defaults"], "jobs": batch_jobs} with open(json_path, "w") as f: json.dump(batch_dict, f) batch_size = len(batch_jobs) - run_command = ( - f"sbatch --array=0-{batch_size - 1}%{array_job_limit}" - f" {self.slurm_base_command} {json_path}" - ) + run_command = f"sbatch --array=0-{batch_size - 1}%{array_job_limit} {self.slurm_base_command} {json_path}" run_commands.append(run_command) batch_info = "" if len(job_batches) > 1: - batch_info = ( - f"\nSplit into {len(job_batches)} array job batches" - f" (max {max_array_size} per batch)." - ) + batch_info = f"\nSplit into {len(job_batches)} array job batches (max {max_array_size} per batch)." 
print( f"##### Setup Jobs for {self._parallel_safe_benchmark_name}" f"{batch_info}" @@ -932,8 +926,6 @@ def setup_jobs(self, array_job_limit: int = 100) -> str | list[str]: f"\n" + "\n".join(run_commands) + "\n" ) - if len(run_commands) == 1: - return run_commands[0] return run_commands @property @@ -1023,20 +1015,12 @@ def are_model_constraints_valid( if (max_n_features is not None) and (n_features > max_n_features): return False - max_n_samples_train_per_fold = model_constraints.get( - "max_n_samples_train_per_fold", None - ) - if (max_n_samples_train_per_fold is not None) and ( - n_samples_train_per_fold > max_n_samples_train_per_fold - ): + max_n_samples_train_per_fold = model_constraints.get("max_n_samples_train_per_fold", None) + if (max_n_samples_train_per_fold is not None) and (n_samples_train_per_fold > max_n_samples_train_per_fold): return False - min_n_samples_train_per_fold = model_constraints.get( - "min_n_samples_train_per_fold", None - ) - if (min_n_samples_train_per_fold is not None) and ( - n_samples_train_per_fold < min_n_samples_train_per_fold - ): + min_n_samples_train_per_fold = model_constraints.get("min_n_samples_train_per_fold", None) + if (min_n_samples_train_per_fold is not None) and (n_samples_train_per_fold < min_n_samples_train_per_fold): return False max_n_classes = model_constraints.get("max_n_classes", None) diff --git a/tabflow_slurm/submit_template.sh b/tabflow_slurm/submit_template.sh index b0fb3d738..16572a6d2 100644 --- a/tabflow_slurm/submit_template.sh +++ b/tabflow_slurm/submit_template.sh @@ -43,6 +43,7 @@ CONFIGS_YAML_FILE=$(jq -r '.defaults.configs_yaml_file' "$JSON_FILE") OUTPUT_DIR=$(jq -r '.defaults.output_dir' "$JSON_FILE") NUM_CPUS=$(jq -r '.defaults.num_cpus' "$JSON_FILE") NUM_GPUS=$(jq -r '.defaults.num_gpus' "$JSON_FILE") +NUM_GPUS_MODEL=$(jq -r '.defaults.num_gpus_model' "$JSON_FILE") MEMORY_LIMIT=$(jq -r '.defaults.memory_limit' "$JSON_FILE") SETUP_RAY=$(jq -r 
'.defaults.setup_ray_for_slurm_shared_resources_environment' "$JSON_FILE") IGNORE_CACHE=$(jq -r '.defaults.ignore_cache' "$JSON_FILE") @@ -58,6 +59,7 @@ echo "Configs YAML File: $CONFIGS_YAML_FILE" echo "Output Directory: $OUTPUT_DIR" echo "Number of CPUs: $NUM_CPUS" echo "Number of GPUs: $NUM_GPUS" +echo "Number of GPUs for model fitting: $NUM_GPUS_MODEL" echo "Memory Limit: $MEMORY_LIMIT" echo "Setup Ray for SLURM Shared Resources Environment: $SETUP_RAY" echo "Ignore Cache: $IGNORE_CACHE" @@ -94,6 +96,7 @@ for CI in "${CONFIG_ARRAY[@]}"; do --output_dir $OUTPUT_DIR \ --num_cpus $NUM_CPUS \ --num_gpus $NUM_GPUS \ + --num_gpus_model $NUM_GPUS_MODEL \ --memory_limit $MEMORY_LIMIT \ --setup_ray_for_slurm_shared_resources_environment $SETUP_RAY \ --ignore_cache $IGNORE_CACHE \ diff --git a/tst/benchmark/experiment/test_validation_utils.py b/tst/benchmark/experiment/test_validation_utils.py index ca997d519..ccb96fabe 100644 --- a/tst/benchmark/experiment/test_validation_utils.py +++ b/tst/benchmark/experiment/test_validation_utils.py @@ -270,6 +270,29 @@ def test_get_num_group_instances_no_group(): assert v.get_num_group_instances(X) == 7 +@pytest.mark.skipif(not _DATA_FOUNDRY_AVAILABLE, reason="data_foundry not installed") +def test_resolve_validation_splits_group_on_with_num_repeats_none(): + """When group_on is set and num_repeats is None, num_repeats should default to 1. + + Regression test: previously num_repeats=None was passed through to + _resolve_group_splits which expects an integer, causing a crash. 
+ """ + n = 600 # > 500 so tiny-data path does not override num_repeats + groups = [f"g{i % 10}" for i in range(n)] + X = pd.DataFrame({"feature": np.arange(n, dtype=float), "grp": groups}) + y = pd.Series(np.zeros(n)) + v = _Validation( + use_task_specific_validation=True, + group_on="grp", + group_labels=GroupLabelTypes.PER_SAMPLE, + ) + custom_splits, folds, repeats = v.resolve_validation_splits( + X=X, y=y, num_folds=8, num_repeats=None, + ) + assert custom_splits is not None + assert repeats == 1 + + # =========================================================================== # Additional split_time_index_into_intervals tests # =========================================================================== diff --git a/tst/benchmark/preprocessing/test_preprocessing.py b/tst/benchmark/preprocessing/test_preprocessing.py index 8678789ee..60edf692c 100644 --- a/tst/benchmark/preprocessing/test_preprocessing.py +++ b/tst/benchmark/preprocessing/test_preprocessing.py @@ -19,6 +19,7 @@ from tabarena.benchmark.preprocessing.text_feature_generators import ( SemanticTextFeatureGenerator, StatisticalTextFeatureGenerator, + TabArenaDefaultTextEncoder, TextEmbeddingDimensionalityReductionFeatureGenerator, sanitize_text, ) @@ -295,6 +296,41 @@ def test_fit_transform_with_integer_columns(self): X_out = gen.fit_transform(X) assert len(X_out) == 3 + def test_fit_transform_renames_dot_columns(self): + gen = _make_no_text_gen() + X = pd.DataFrame({"a.b": [1.0, 2.0, 3.0], "c": [4.0, 5.0, 6.0]}) + X_out = gen.fit_transform(X.copy()) + assert "a_b" in X_out.columns + assert "a.b" not in X_out.columns + + def test_fit_transform_leaves_clean_columns_unchanged(self): + gen = _make_no_text_gen() + X = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b_c": [4.0, 5.0, 6.0]}) + X_out = gen.fit_transform(X.copy()) + assert "a" in X_out.columns + assert "b_c" in X_out.columns + + def test_fit_transform_multiple_dots_all_replaced(self): + gen = _make_no_text_gen() + X = pd.DataFrame({"a.b.c": [1.0, 2.0, 
3.0]}) + X_out = gen.fit_transform(X.copy()) + assert "a_b_c" in X_out.columns + + def test_transform_also_renames_dot_columns(self): + gen = _make_no_text_gen() + X_train = pd.DataFrame({"a.b": [1.0, 2.0, 3.0], "c": [4.0, 5.0, 6.0]}) + gen.fit_transform(X_train.copy()) + X_test = pd.DataFrame({"a.b": [7.0, 8.0], "c": [9.0, 10.0]}) + X_out = gen.transform(X_test) + assert "a_b" in X_out.columns + assert "a.b" not in X_out.columns + + def test_no_dot_columns_produces_empty_rename_map(self): + gen = _make_no_text_gen() + X = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}) + gen.fit_transform(X.copy()) + assert gen._dot_rename_map_ == {} + # =========================================================================== # NoCatAsStringCategoryFeatureGenerator @@ -444,41 +480,6 @@ def test_is_astype_feature_generator_subclass(self): assert issubclass(StringFixAsTypeFeatureGenerator, AsTypeFeatureGenerator) - def test_fit_transform_renames_dot_columns(self): - gen = StringFixAsTypeFeatureGenerator() - X = pd.DataFrame({"a.b": [1.0, 2.0, 3.0], "c": [4.0, 5.0, 6.0]}) - X_out = gen.fit_transform(X.copy()) - assert "a_b" in X_out.columns - assert "a.b" not in X_out.columns - - def test_fit_transform_leaves_clean_columns_unchanged(self): - gen = StringFixAsTypeFeatureGenerator() - X = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b_c": [4.0, 5.0, 6.0]}) - X_out = gen.fit_transform(X.copy()) - assert "a" in X_out.columns - assert "b_c" in X_out.columns - - def test_fit_transform_multiple_dots_all_replaced(self): - gen = StringFixAsTypeFeatureGenerator() - X = pd.DataFrame({"a.b.c": [1.0, 2.0, 3.0]}) - X_out = gen.fit_transform(X.copy()) - assert "a_b_c" in X_out.columns - - def test_transform_also_renames_dot_columns(self): - gen = StringFixAsTypeFeatureGenerator() - X_train = pd.DataFrame({"a.b": [1.0, 2.0, 3.0], "c": [4.0, 5.0, 6.0]}) - gen.fit_transform(X_train.copy()) - X_test = pd.DataFrame({"a.b": [7.0, 8.0], "c": [9.0, 10.0]}) - X_out = gen.transform(X_test) - assert 
"a_b" in X_out.columns - assert "a.b" not in X_out.columns - - def test_no_dot_columns_produces_empty_rename_map(self): - gen = StringFixAsTypeFeatureGenerator() - X = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}) - gen.fit_transform(X.copy()) - assert gen._dot_rename_map_ == {} - # =========================================================================== # StringFixAsTypeFeatureGenerator – categorical dtype special cases @@ -643,41 +644,106 @@ def test_float_column_unchanged_by_categorical_fix(self): # ------------------------------------------------------------------ # ------------------------------------------------------------------ - # Binary column gaining a third category at test time + # Bool columns with unseen values at test time + # + # Bool encoding always applies: true_val → 1, everything else → 0. + # Unseen values are mapped to 0 (False) and a warning is logged. + # The column stays in _bool_features and keeps its int8 dtype. # ------------------------------------------------------------------ - def test_binary_column_gaining_third_category_not_silently_mapped(self): - """A binary column (bool-encoded as int8 at fit time) that gains a third value - at test time must not silently map that value to 0 (false). - - With only 2 unique values at fit time the column is stored in _bool_features and - encoded as int8 via _convert_to_bool (== true_val → 1, else → 0). A 3rd value - that appears at test time would silently become 0 without our fix; instead we - convert the whole column to categorical so all values are preserved. 
- """ + def test_bool_col_unseen_value_mapped_to_false(self): + """An unseen value in a bool column must be mapped to 0 (False).""" X_train = pd.DataFrame({"col": pd.Categorical(["yes", "no", "yes", "no"])}) gen = StringFixAsTypeFeatureGenerator() gen.fit_transform(X_train.copy()) - assert "col" in gen._bool_features, "Expected binary column to be bool-encoded at fit time" + assert "col" in gen._bool_features X_test = pd.DataFrame({"col": pd.Categorical(["yes", "no", "maybe"])}) X_out = gen.transform(X_test.copy()) - assert X_out["col"].isna().sum() == 0, "'maybe' was converted to NaN" - values = set(X_out["col"].astype(object).tolist()) - assert "maybe" in values, "'maybe' was silently discarded / mapped to 0 or 1" + assert X_out["col"].dtype == np.int8 + assert X_out["col"].iloc[0] == 1 # 'yes' (true_val) → 1 + assert X_out["col"].iloc[1] == 0 # 'no' (false_val) → 0 + assert X_out["col"].iloc[2] == 0 # 'maybe' (unseen) → 0 + + def test_bool_int_col_unseen_value_mapped_to_false(self): + """An unseen integer in a bool int column (0/1) must be mapped to 0.""" + X_train = pd.DataFrame({"b": [0, 1, 0, 1]}) + gen = self._fit_gen(X_train) + X_out = gen.transform(pd.DataFrame({"b": [0, 1, 2]})) + assert X_out["b"].dtype == np.int8 + assert list(X_out["b"]) == [0, 1, 0] # 2 is unseen → 0 + + def test_bool_int_col_multiple_unseen_values_all_map_to_false(self): + """All unseen integer values must be mapped to 0.""" + X_train = pd.DataFrame({"b": [0, 1, 0, 1]}) + gen = self._fit_gen(X_train) + X_out = gen.transform(pd.DataFrame({"b": [0, 1, 5, 7, 9]})) + assert list(X_out["b"]) == [0, 1, 0, 0, 0] + + def test_bool_string_col_unseen_value_mapped_to_false(self): + """An unseen string in a bool string column must be mapped to 0.""" + X_train = pd.DataFrame({"b": pd.Categorical(["yes", "no", "yes", "no"])}) + gen = self._fit_gen(X_train) + X_out = gen.transform(pd.DataFrame({"b": pd.Categorical(["yes", "no", "maybe"])})) + assert X_out["b"].dtype == np.int8 + assert 
X_out["b"].iloc[0] == 1 # 'yes' → 1 + assert X_out["b"].iloc[1] == 0 # 'no' → 0 + assert X_out["b"].iloc[2] == 0 # 'maybe' → 0 + + def test_bool_col_stays_in_bool_features_after_unseen(self): + """A bool column with unseen values must remain in _bool_features.""" + X_train = pd.DataFrame({"b": [0, 1, 0, 1]}) + gen = self._fit_gen(X_train) + assert "b" in gen._bool_features + + gen.transform(pd.DataFrame({"b": [0, 1, 2]})) + assert "b" in gen._bool_features, "Column must remain a bool feature" + + def test_bool_col_second_transform_still_bool_encodes(self): + """A second transform call must still apply bool encoding.""" + X_train = pd.DataFrame({"b": [0, 1, 0, 1]}) + gen = self._fit_gen(X_train) + + gen.transform(pd.DataFrame({"b": [0, 1, 2]})) + + X_out = gen.transform(pd.DataFrame({"b": [0, 1, 3]})) + assert X_out["b"].dtype == np.int8 + assert list(X_out["b"]) == [0, 1, 0] # 3 is unseen → 0 + + def test_bool_col_unseen_other_bool_col_unaffected(self): + """A sibling bool column without unseen values must still be bool-encoded normally.""" + X_train = pd.DataFrame({"a": [0, 1, 0, 1], "b": [0, 1, 0, 1]}) + gen = self._fit_gen(X_train) - def test_binary_column_without_extra_categories_still_bool_encoded(self): + X_test = pd.DataFrame({"a": [0, 1, 2], "b": [0, 1, 0]}) + X_out = gen.transform(X_test.copy()) + # 'a' gained unseen → still int8, unseen mapped to 0 + assert X_out["a"].dtype == np.int8 + assert list(X_out["a"]) == [0, 1, 0] + # 'b' no unseen → normal bool-encoded 0/1 + assert X_out["b"].dtype == np.int8 + assert set(X_out["b"].tolist()).issubset({0, 1}) + + def test_bool_col_without_unseen_values_still_bool_encoded(self): """When no new categories appear the bool-encoding path must still run normally.""" X_train = pd.DataFrame({"col": pd.Categorical(["yes", "no", "yes", "no"])}) gen = StringFixAsTypeFeatureGenerator() gen.fit_transform(X_train.copy()) - # Only known values at test time → normal bool encoding expected X_test = pd.DataFrame({"col": 
pd.Categorical(["yes", "no", "yes"])}) X_out = gen.transform(X_test.copy()) assert X_out["col"].isna().sum() == 0 - # Values should be 0/1 (int8 bool encoding), not strings assert set(X_out["col"].tolist()).issubset({0, 1}) + def test_bool_col_with_nan_at_test_time(self): + """NaN in a bool column with unseen values is imputed to 0, consistent + with how all int columns handle test-time NaN. + """ + X_train = pd.DataFrame({"b": [0, 1, 0, 1]}) + gen = self._fit_gen(X_train) + X_out = gen.transform(pd.DataFrame({"b": [0, 1, 2, None]})) + assert X_out["b"].iloc[3] == 0 + assert X_out["b"].dtype == np.int8 + def test_int_column_with_nan_at_test_time_imputed_to_zero(self): """Int features that were never NaN at train time must be imputed to 0 at test time.""" X_train = pd.DataFrame({"val": [1, 2, 3, 4, 5]}) @@ -805,7 +871,7 @@ def test_transform_output_columns_match_fit(self): assert list(X_out_train.columns) == list(X_out_test.columns) def test_max_n_output_features_constant(self): - assert StatisticalTextFeatureGenerator.MAX_N_OUTPUT_FEATURES == 384 + assert StatisticalTextFeatureGenerator.MAX_N_OUTPUT_FEATURES == 32 def test_output_columns_prefixed_with_source_column(self): gen = StatisticalTextFeatureGenerator() @@ -902,6 +968,183 @@ def test_transform_empty_df_raises_value_error(self): gen._transform(X) +class TestSemanticTextFeatureGeneratorCacheRoundTrip: + """End-to-end: fit_transform → save cache to parquet → clear → load cache → transform → compare.""" + + EMB_DIM = 32 + + @pytest.fixture(autouse=True) + def _clean_embedding_cache(self): + """Isolate the class-level cache for each test.""" + saved = dict(SemanticTextFeatureGenerator._embedding_look_up) + SemanticTextFeatureGenerator._embedding_look_up.clear() + yield + SemanticTextFeatureGenerator._embedding_look_up.clear() + SemanticTextFeatureGenerator._embedding_look_up.update(saved) + + @staticmethod + def _deterministic_embeddings(texts: list[str]) -> np.ndarray: + """Hash-based deterministic 32-dim 
embeddings.""" + import hashlib + + embs = [] + for t in texts: + seed = int(hashlib.md5(t.encode()).hexdigest(), 16) % (2**31) + rng = np.random.RandomState(seed) + emb = rng.randn(32).astype(np.float32) + emb /= np.linalg.norm(emb) + embs.append(emb) + return np.vstack(embs) + + def test_save_load_roundtrip_produces_identical_output(self, tmp_path, monkeypatch): + """Full pipeline: fit_transform, save cache, clear, load cache, transform, compare.""" + monkeypatch.setattr(TabArenaDefaultTextEncoder, "get_default_encoder", lambda: None) + monkeypatch.setattr( + TabArenaDefaultTextEncoder, + "encode_texts", + lambda *, texts, encoder_model: self._deterministic_embeddings(texts), + ) + + X = _make_text_df(n_rows=20) + gen = SemanticTextFeatureGenerator() + + # 1. fit_transform populates _embedding_look_up and produces output + X_out_fit, _type_map = gen._fit_transform(X) + assert not X_out_fit.empty + cache = dict(SemanticTextFeatureGenerator._embedding_look_up) + assert len(cache) > 0 + + # 2. Save cache to parquet + cache_path = tmp_path / "text_cache.parquet" + SemanticTextFeatureGenerator.save_embedding_cache(cache=cache, path=cache_path) + assert cache_path.exists() + + # 3. Clear class-level cache + SemanticTextFeatureGenerator._embedding_look_up.clear() + assert len(SemanticTextFeatureGenerator._embedding_look_up) == 0 + + # 4. Load cache from disk + loaded = SemanticTextFeatureGenerator.load_embedding_cache(cache_path) + SemanticTextFeatureGenerator._embedding_look_up.update(loaded) + assert set(loaded.keys()) == set(cache.keys()) + + # 5. Transform with loaded cache (no encoding happens) + X_out_cached = gen._transform(X) + + # 6. 
Output must be identical + pd.testing.assert_frame_equal(X_out_fit, X_out_cached) + + def test_loaded_embeddings_match_original_values(self, tmp_path, monkeypatch): + """Verify that individual embedding vectors survive the parquet round-trip.""" + monkeypatch.setattr(TabArenaDefaultTextEncoder, "get_default_encoder", lambda: None) + monkeypatch.setattr( + TabArenaDefaultTextEncoder, + "encode_texts", + lambda *, texts, encoder_model: self._deterministic_embeddings(texts), + ) + + X = _make_text_df(n_rows=20) + gen = SemanticTextFeatureGenerator() + gen._fit_transform(X) + + original_cache = {k: v.copy() for k, v in SemanticTextFeatureGenerator._embedding_look_up.items()} + + cache_path = tmp_path / "emb_cache.parquet" + SemanticTextFeatureGenerator.save_embedding_cache(cache=original_cache, path=cache_path) + loaded_cache = SemanticTextFeatureGenerator.load_embedding_cache(cache_path) + + for key in original_cache: + np.testing.assert_array_almost_equal(loaded_cache[key], original_cache[key], decimal=5) + + def test_cache_roundtrip_with_unseen_data_at_transform(self, tmp_path, monkeypatch): + """Load cache from one fit, then transform data that includes new unseen text values.""" + monkeypatch.setattr(TabArenaDefaultTextEncoder, "get_default_encoder", lambda: None) + monkeypatch.setattr( + TabArenaDefaultTextEncoder, + "encode_texts", + lambda *, texts, encoder_model: self._deterministic_embeddings(texts), + ) + + X_train = _make_text_df(n_rows=20) + gen = SemanticTextFeatureGenerator() + gen._fit_transform(X_train) + + cache_path = tmp_path / "partial_cache.parquet" + SemanticTextFeatureGenerator.save_embedding_cache( + cache=SemanticTextFeatureGenerator._embedding_look_up, path=cache_path + ) + + # Clear and reload + SemanticTextFeatureGenerator._embedding_look_up.clear() + loaded = SemanticTextFeatureGenerator.load_embedding_cache(cache_path) + SemanticTextFeatureGenerator._embedding_look_up.update(loaded) + + # Transform with data that has a mix of seen and 
unseen text + X_new = pd.DataFrame({"text": ["hello world", "brand new text", "foo bar baz", "never seen before"]}) + X_out = gen._transform(X_new) + + assert X_out.shape == (4, self.EMB_DIM) + assert not X_out.isnull().any().any() + + def test_full_pipeline_cache_roundtrip_with_e5_model(self, tmp_path, monkeypatch): + """End-to-end via TabArenaModelAgnosticPreprocessing with intfloat/e5-small-v2.""" + from sentence_transformers import SentenceTransformer + + # Monkey patch Small model for tests + fast_model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L3-v2", truncate_dim=4) + monkeypatch.setattr(TabArenaDefaultTextEncoder, "get_default_encoder", lambda: fast_model) + + X = pd.DataFrame( + { + "description": [ + f"This is a detailed text description for sample number {i} with unique content" for i in range(50) + ] + } + ) + + preprocessing = TabArenaModelAgnosticPreprocessing( + enable_sematic_text_features=True, + enable_new_datetime_features=False, + enable_text_special_features=False, + enable_statistical_text_features=False, + enable_text_ngram_features=False, + enable_datetime_features=False, + verbosity=0, + ) + + # 1. fit_transform through the full pipeline + X_out_fit = preprocessing.fit_transform(X=X) + assert not X_out_fit.empty + + cache = dict(SemanticTextFeatureGenerator._embedding_look_up) + assert len(cache) > 0 + + # 2. Save cache to parquet + cache_path = tmp_path / "pipeline_cache.parquet" + SemanticTextFeatureGenerator.save_embedding_cache(cache=cache, path=cache_path) + + # 3. Clear and reload cache from disk + SemanticTextFeatureGenerator._embedding_look_up.clear() + loaded = SemanticTextFeatureGenerator.load_embedding_cache(cache_path) + SemanticTextFeatureGenerator._embedding_look_up.update(loaded) + assert set(loaded.keys()) == set(cache.keys()) + + # 4. 
Transform same data with the loaded cache + preprocessing = TabArenaModelAgnosticPreprocessing( + enable_sematic_text_features=True, + enable_new_datetime_features=False, + enable_text_special_features=False, + enable_statistical_text_features=False, + enable_text_ngram_features=False, + enable_datetime_features=False, + verbosity=0, + ) + X_out_cached = preprocessing.fit_transform(X) + + # 5. Output must be identical + pd.testing.assert_frame_equal(X_out_fit, X_out_cached) + + # =========================================================================== # TextEmbeddingDimensionalityReductionFeatureGenerator # =========================================================================== @@ -1374,7 +1617,7 @@ def test_all_possible_aggs_generated_when_budget_large(self): def test_n_top_features_limits_selection(self): X, y = _make_grouped_df() gen = GroupAggregationFeatureGenerator(group_col="gid", n_top_features=3) - X_out, meta = gen._fit_transform(X.copy(), y) + _X_out, meta = gen._fit_transform(X.copy(), y) assert len(meta[GROUP_INDEX_FEATURES]) == 3 def test_highest_variance_feature_selected(self): diff --git a/tst/benchmark/task/test_user_task.py b/tst/benchmark/task/test_user_task.py index be2e02ec3..a3c57428b 100644 --- a/tst/benchmark/task/test_user_task.py +++ b/tst/benchmark/task/test_user_task.py @@ -2,6 +2,7 @@ import functools import operator +import pickle from pathlib import Path import numpy as np @@ -27,9 +28,7 @@ def _isolate_openml_cache(tmp_path_factory): Path(openml.config._root_cache_directory).mkdir(parents=True, exist_ok=True) -def _make_dataset( - problem_type: str, *, n: int = 10 -) -> tuple[pd.DataFrame, str, list[str] | None, list[bool]]: +def _make_dataset(problem_type: str, *, n: int = 10) -> tuple[pd.DataFrame, str, list[str] | None, list[bool]]: dataset = pd.DataFrame( { "num": np.arange(n, dtype="int64"), @@ -59,9 +58,7 @@ def test_user_task_as_openml_task(problem_type, expected_cls, tmp_path): """Test that UserTask can be converted to 
an OpenML task for local use. This does not test the splits, which are tested in another test. """ - df_original, target_feature, _class_labels, cat_indicator = _make_dataset( - problem_type, n=10 - ) + df_original, target_feature, _class_labels, cat_indicator = _make_dataset(problem_type, n=10) splits = {0: {0: (list(range(8)), [8, 9])}} ut = UserTask( @@ -76,9 +73,7 @@ def test_user_task_as_openml_task(problem_type, expected_cls, tmp_path): ) # Check Task Metadata - assert isinstance(oml_task, expected_cls), ( - f"Expected {expected_cls}, got {type(oml_task)}" - ) + assert isinstance(oml_task, expected_cls), f"Expected {expected_cls}, got {type(oml_task)}" if problem_type == "classification": assert oml_task.task_type_id == TaskType.SUPERVISED_CLASSIFICATION assert oml_task.class_labels == ["neg", "pos"] @@ -94,14 +89,13 @@ def test_user_task_as_openml_task(problem_type, expected_cls, tmp_path): assert isinstance(oml_dataset, openml.datasets.OpenMLDataset) assert oml_dataset.name == ut.get_dataset_name() assert oml_dataset.default_target_attribute == target_feature - assert oml_dataset.parquet_file == (ut._local_cache_path / "data.pq") - assert (ut._local_cache_path / "data.pq").exists() + assert oml_dataset.data_pickle_file == (ut._local_cache_path / "data.pkl.py3") + assert oml_dataset.cache_format == "pickle" + assert (ut._local_cache_path / "data.pkl.py3").exists() assert oml_dataset.data_file == "ignored" # Check Dataset State - X, y, categorical_indicator, attribute_names = oml_dataset.get_data( - target=oml_task.target_name - ) + X, y, categorical_indicator, attribute_names = oml_dataset.get_data(target=oml_task.target_name) assert categorical_indicator == cat_indicator expected_a_names = list(df_original.columns) expected_a_names.remove(target_feature) @@ -118,13 +112,7 @@ def test_user_task_as_openml_task(problem_type, expected_cls, tmp_path): expected_split = OpenMLSplit( name="User-Splits", description="User-defined splits for a custom task.", - 
split={ - r: { - f: {0: (np.array(tr), np.array(te))} - for f, (tr, te) in splits[r].items() - } - for r in splits - }, + split={r: {f: {0: (np.array(tr), np.array(te))} for f, (tr, te) in splits[r].items()} for r in splits}, ) assert oml_task.split == expected_split @@ -352,9 +340,7 @@ def test_save_load_round_trip_classification(tmp_path): df, target, _, _ = _make_dataset("classification", n=10) splits = {0: {0: (list(range(8)), [8, 9])}} ut = UserTask(task_name="save-load-clf", task_cache_path=tmp_path) - task = ut.create_local_openml_task( - dataset=df, target_feature=target, problem_type="classification", splits=splits - ) + task = ut.create_local_openml_task(dataset=df, target_feature=target, problem_type="classification", splits=splits) ut.save_local_openml_task(task) assert ut.openml_task_path.exists() @@ -370,9 +356,7 @@ def test_save_load_round_trip_regression(tmp_path): df, target, _, _ = _make_dataset("regression", n=10) splits = {0: {0: (list(range(8)), [8, 9])}} ut = UserTask(task_name="save-load-reg", task_cache_path=tmp_path) - task = ut.create_local_openml_task( - dataset=df, target_feature=target, problem_type="regression", splits=splits - ) + task = ut.create_local_openml_task(dataset=df, target_feature=target, problem_type="regression", splits=splits) ut.save_local_openml_task(task) loaded = ut.load_local_openml_task() assert loaded.task_id == ut.task_id @@ -403,9 +387,7 @@ def test_create_local_openml_task_multi_repeat_multi_fold(tmp_path): }, } ut = UserTask(task_name="multi-fold", task_cache_path=tmp_path) - task = ut.create_local_openml_task( - dataset=df, target_feature=target, problem_type="classification", splits=splits - ) + task = ut.create_local_openml_task(dataset=df, target_feature=target, problem_type="classification", splits=splits) # Two repeats, two folds each assert set(task.split.split.keys()) == {0, 1} assert set(task.split.split[0].keys()) == {0, 1} @@ -511,9 +493,7 @@ def test_task_metadata_from_row_missing_field_raises(): 
meta = _make_task_metadata() df = meta.to_dataframe() row = df.iloc[0].drop("dataset_name") - with pytest.raises( - ValueError, match="missing required TabArenaTaskMetadata fields" - ): + with pytest.raises(ValueError, match="missing required TabArenaTaskMetadata fields"): TabArenaTaskMetadata.from_row(row) @@ -531,6 +511,70 @@ def test_task_metadata_unroll_splits(): assert unrolled[0].dataset_name == meta.dataset_name +def test_task_metadata_from_row_backward_compat_missing_optional_fields(): + """Old CSVs without the new dtype flag columns should load without error.""" + meta = _make_task_metadata() + df = meta.to_dataframe() + row = df.iloc[0] + # Simulate an old CSV that doesn't have the new columns + row = row.drop(["has_datetime", "has_text", "has_categorical", "has_numeric"]) + reconstructed = TabArenaTaskMetadata.from_row(row) + assert reconstructed.dataset_name == meta.dataset_name + # New fields default to None when absent + assert reconstructed.has_datetime is None + assert reconstructed.has_text is None + assert reconstructed.has_categorical is None + assert reconstructed.has_numeric is None + + +def test_task_metadata_dtype_flags_default_to_none(): + """New dtype flag fields default to None when not passed explicitly.""" + meta = _make_task_metadata() + assert meta.has_datetime is None + assert meta.has_text is None + assert meta.has_categorical is None + assert meta.has_numeric is None + + +def test_task_metadata_dtype_flags_round_trip(): + """Dtype flags survive a to_dataframe / from_row round trip.""" + s = _make_split_metadata() + meta = TabArenaTaskMetadata( + dataset_name="test", + problem_type="binary", + is_classification=True, + target_name="target", + eval_metric="roc_auc", + splits_metadata={s.split_index: s}, + split_time_horizon=None, + split_time_horizon_unit=None, + stratify_on=None, + time_on=None, + group_on=None, + group_time_on=None, + group_labels=None, + multiclass_min_n_classes_over_splits=2, + 
multiclass_max_n_classes_over_splits=2, + class_consistency_over_splits=True, + num_instances=10, + num_features=2, + num_classes=2, + num_instance_groups=10, + tabarena_task_name="test", + task_id_str=None, + has_datetime=False, + has_text=True, + has_categorical=True, + has_numeric=False, + ) + df = meta.to_dataframe() + reconstructed = TabArenaTaskMetadata.from_row(df.iloc[0]) + assert reconstructed.has_datetime is False + assert reconstructed.has_text is True + assert reconstructed.has_categorical is True + assert reconstructed.has_numeric is False + + # --------------------------------------------------------------------------- # from_sklearn_splits_to_user_task_splits # --------------------------------------------------------------------------- @@ -569,9 +613,7 @@ def test_from_sklearn_splits_multiple_repeats(): def test_get_num_instance_groups_no_group(): X = pd.DataFrame({"a": [1, 2, 3]}) - n = TabArenaTaskMetadataMixin.get_num_instance_groups( - X=X, group_on=None, group_labels=None - ) + n = TabArenaTaskMetadataMixin.get_num_instance_groups(X=X, group_on=None, group_labels=None) assert n == 3 @@ -586,9 +628,7 @@ def test_get_num_instance_groups_per_sample_label(): def test_get_num_instance_groups_per_group_label(): X = pd.DataFrame({"a": [1, 2, 3, 4], "group": ["x", "x", "y", "y"]}) - n = TabArenaTaskMetadataMixin.get_num_instance_groups( - X=X, group_on="group", group_labels=GroupLabelTypes.PER_GROUP - ) + n = TabArenaTaskMetadataMixin.get_num_instance_groups(X=X, group_on="group", group_labels=GroupLabelTypes.PER_GROUP) assert n == 2 @@ -609,9 +649,7 @@ def test_get_num_instance_groups_multi_column_group(): def _make_multiclass_dataset(n_per_class: int = 4) -> tuple[pd.DataFrame, str]: """Create a 3-class classification dataset with n_per_class samples per class.""" n = n_per_class * 3 - labels = ( - (["cls0"] * n_per_class) + (["cls1"] * n_per_class) + (["cls2"] * n_per_class) - ) + labels = (["cls0"] * n_per_class) + (["cls1"] * n_per_class) + 
(["cls2"] * n_per_class) df = pd.DataFrame( { "num": np.arange(n, dtype="int64"), @@ -625,9 +663,7 @@ def _make_multiclass_dataset(n_per_class: int = 4) -> tuple[pd.DataFrame, str]: def _make_4class_dataset(n_per_class: int = 3) -> tuple[pd.DataFrame, str]: """Create a 4-class dataset where fold splits can yield different class counts.""" n = n_per_class * 4 - labels = functools.reduce( - operator.iadd, ([f"cls{c}"] * n_per_class for c in range(4)), [] - ) + labels = functools.reduce(operator.iadd, ([f"cls{c}"] * n_per_class for c in range(4)), []) df = pd.DataFrame( { "num": np.arange(n, dtype="int64"), @@ -666,9 +702,7 @@ def _task_from_user_task( def test_get_dataset_stats_regression_basic(tmp_path): """Regression: num_classes=-1, num_features excludes target, num_instance_groups==len.""" df, target, _, _ = _make_dataset("regression", n=10) - task, _ = _task_from_user_task( - df, target, "regression", {0: {0: (list(range(8)), [8, 9])}}, tmp_path, "ds-reg" - ) + task, _ = _task_from_user_task(df, target, "regression", {0: {0: (list(range(8)), [8, 9])}}, tmp_path, "ds-reg") n_inst, n_feat, n_cls, n_groups = task._get_dataset_stats( oml_dataset=df, is_classification=False, target_name=target ) @@ -689,9 +723,7 @@ def test_get_dataset_stats_classification_class_count(tmp_path): tmp_path, "ds-clf", ) - _, _, n_cls, _ = task._get_dataset_stats( - oml_dataset=df, is_classification=True, target_name=target - ) + _, _, n_cls, _ = task._get_dataset_stats(oml_dataset=df, is_classification=True, target_name=target) assert n_cls == 2 @@ -715,9 +747,7 @@ def test_get_dataset_stats_num_features_excludes_target(tmp_path): tmp_path, "ds-5col", ) - _, n_feat, _, _ = task._get_dataset_stats( - oml_dataset=df, is_classification=True, target_name="target" - ) + _, n_feat, _, _ = task._get_dataset_stats(oml_dataset=df, is_classification=True, target_name="target") assert n_feat == 4 @@ -735,13 +765,10 @@ def test_get_dataset_stats_slice_reports_subset_class_count(tmp_path): tmp_path, 
"ds-slice", ) - _, _, n_cls, _ = task._get_dataset_stats( - oml_dataset=subset_one_class, is_classification=True, target_name=target - ) + _, _, n_cls, _ = task._get_dataset_stats(oml_dataset=subset_one_class, is_classification=True, target_name=target) assert n_cls == 1 - # --------------------------------------------------------------------------- # compute_metadata — regression # --------------------------------------------------------------------------- @@ -749,12 +776,8 @@ def test_get_dataset_stats_slice_reports_subset_class_count(tmp_path): def test_compute_metadata_regression(tmp_path): df, target, _, _ = _make_dataset("regression", n=10) - task, ut = _task_from_user_task( - df, target, "regression", {0: {0: (list(range(8)), [8, 9])}}, tmp_path, "cm-reg" - ) - meta = task.compute_metadata( - tabarena_task_name="my-task", task_id_str=ut.task_id_str - ) + task, ut = _task_from_user_task(df, target, "regression", {0: {0: (list(range(8)), [8, 9])}}, tmp_path, "cm-reg") + meta = task.compute_metadata(tabarena_task_name="my-task", task_id_str=ut.task_id_str) assert meta.problem_type == "regression" assert meta.is_classification is False @@ -832,6 +855,50 @@ def test_compute_metadata_binary_dataset_level_stats(tmp_path): assert meta.num_instance_groups == 10 # no group_on +def test_compute_metadata_dtype_flags(tmp_path): + """_make_dataset produces int64 'num' and category 'cat' feature columns.""" + df, target, _, _ = _make_dataset("classification", n=10) + task, _ = _task_from_user_task( + df, + target, + "classification", + {0: {0: (list(range(8)), [8, 9])}}, + tmp_path, + "cm-dtype-flags", + ) + meta = task.compute_metadata() + + assert meta.has_numeric is True + assert meta.has_categorical is True + assert meta.has_datetime is False + assert meta.has_text is False + + +def test_compute_metadata_dtype_flags_with_text_and_datetime(tmp_path): + df = pd.DataFrame( + { + "num": np.arange(10, dtype="float64"), + "txt": pd.array(["hello"] * 10, dtype="string"), + 
"dt": pd.date_range("2020-01-01", periods=10, freq="D"), + "target": [0, 1] * 5, + } + ) + task, _ = _task_from_user_task( + df, + "target", + "classification", + {0: {0: (list(range(8)), [8, 9])}}, + tmp_path, + "cm-dtype-all", + ) + meta = task.compute_metadata() + + assert meta.has_numeric is True + assert meta.has_text is True + assert meta.has_datetime is True + assert meta.has_categorical is False + + # --------------------------------------------------------------------------- # compute_metadata — multiclass classification # --------------------------------------------------------------------------- @@ -876,22 +943,13 @@ def test_compute_metadata_class_consistency_false(tmp_path): """ df, target = _make_4class_dataset(n_per_class=8) # 32 samples # Repeat 1 train/test: alternate halves of each 8-sample class block. - r1_train = ( - list(range(4)) + list(range(8, 12)) + list(range(16, 20)) + list(range(24, 28)) - ) - r1_test = ( - list(range(4, 8)) - + list(range(12, 16)) - + list(range(20, 24)) - + list(range(28, 32)) - ) + r1_train = list(range(4)) + list(range(8, 12)) + list(range(16, 20)) + list(range(24, 28)) + r1_test = list(range(4, 8)) + list(range(12, 16)) + list(range(20, 24)) + list(range(28, 32)) splits = { 0: {0: (list(range(24)), list(range(24, 32)))}, 1: {0: (r1_train, r1_test)}, } - task, _ = _task_from_user_task( - df, target, "classification", splits, tmp_path, "cm-inconsistent" - ) + task, _ = _task_from_user_task(df, target, "classification", splits, tmp_path, "cm-inconsistent") meta = task.compute_metadata() assert meta.problem_type == "multiclass" @@ -913,9 +971,7 @@ def test_compute_metadata_multi_fold_split_indices(tmp_path): 1: (list(range(5, 20)), list(range(5))), } } - task, _ = _task_from_user_task( - df, target, "classification", splits, tmp_path, "cm-mf" - ) + task, _ = _task_from_user_task(df, target, "classification", splits, tmp_path, "cm-mf") meta = task.compute_metadata() assert meta.n_splits == 2 @@ -930,9 +986,7 @@ def 
test_compute_metadata_multi_fold_per_split_counts(tmp_path): 1: (list(range(5, 20)), list(range(5))), } } - task, _ = _task_from_user_task( - df, target, "classification", splits, tmp_path, "cm-mf-cnt" - ) + task, _ = _task_from_user_task(df, target, "classification", splits, tmp_path, "cm-mf-cnt") meta = task.compute_metadata() s0 = meta.splits_metadata["r0f0"] @@ -949,9 +1003,7 @@ def test_compute_metadata_multi_repeat_split_indices(tmp_path): 0: {0: (list(range(15)), list(range(15, 20)))}, 1: {0: (list(range(5, 20)), list(range(5)))}, } - task, _ = _task_from_user_task( - df, target, "classification", splits, tmp_path, "cm-mr" - ) + task, _ = _task_from_user_task(df, target, "classification", splits, tmp_path, "cm-mr") meta = task.compute_metadata() assert meta.n_splits == 2 @@ -966,9 +1018,7 @@ def test_compute_metadata_multi_repeat_split_indices(tmp_path): def test_compute_metadata_optional_fields_default_none(tmp_path): df, target, _, _ = _make_dataset("regression", n=10) - task, _ = _task_from_user_task( - df, target, "regression", {0: {0: (list(range(8)), [8, 9])}}, tmp_path, "cm-opt" - ) + task, _ = _task_from_user_task(df, target, "regression", {0: {0: (list(range(8)), [8, 9])}}, tmp_path, "cm-opt") meta = task.compute_metadata() assert meta.tabarena_task_name is None @@ -1046,17 +1096,14 @@ def test_compute_metadata_split_time_horizon_passthrough(tmp_path): pd.arrays.IntervalArray.from_breaks(range(11)), pd.IntervalDtype(subtype="int64", closed="right"), ), - # Note: complex128 is also unsupported by liac-arff, but pyarrow (parquet) - # cannot serialize it either, so it fails at a later stage and is excluded here. + # Note: complex128 is also unsupported by liac-arff and excluded here. 
], ids=["datetime64", "timedelta64", "period", "interval"], ) -def test_create_local_openml_task_unsupported_arff_dtype_does_not_raise( - col_name, col_values, dtype, tmp_path -): +def test_create_local_openml_task_unsupported_arff_dtype_does_not_raise(col_name, col_values, dtype, tmp_path): """Columns with dtypes unsupported by liac-arff (datetime64, timedelta64, complex) must not prevent task creation — they are cast to string only for ARFF attribute - inference and do not affect the data persisted to parquet. + inference and do not affect the data persisted to pickle. """ n = 10 df = pd.DataFrame( @@ -1071,11 +1118,53 @@ def test_create_local_openml_task_unsupported_arff_dtype_does_not_raise( splits = {0: {0: (list(range(8)), [8, 9])}} ut = UserTask(task_name=f"unsupported-dtype-{col_name}", task_cache_path=tmp_path) # Must not raise - ut.create_local_openml_task( - dataset=df, target_feature="target", problem_type="regression", splits=splits - ) + ut.create_local_openml_task(dataset=df, target_feature="target", problem_type="regression", splits=splits) - # The parquet file must store the original dtype — the workaround must not + # The pickle file must store the original dtype — the workaround must not # modify the persisted data. 
- stored = pd.read_parquet(ut._local_cache_path / "data.pq") + with (ut._local_cache_path / "data.pkl.py3").open("rb") as fh: + stored, _, _ = pickle.load(fh) assert stored[col_name].dtype == df[col_name].dtype + + +@pytest.mark.parametrize( + ("cat_values", "cat_dtype", "test_id"), + [ + ( + pd.Categorical([0, 1, 2, 1, 0, 2, 1, 0, 2, 1]), + "int64", + "int_categories", + ), + ( + pd.Categorical([1.5, 2.5, 1.5, 2.5, 1.5, 2.5, 1.5, 2.5, 1.5, 2.5]), + "float64", + "float_categories", + ), + ], + ids=lambda x: x if isinstance(x, str) and "_categories" in x else "", +) +def test_create_local_openml_task_non_string_categorical_does_not_raise(cat_values, cat_dtype, test_id, tmp_path): + """Categorical columns whose categories have a non-string dtype (e.g. int, float) + must not break ARFF attribute inference — categories are cast to string only for + metadata and do not affect the data persisted to pickle. + """ + n = 10 + df = pd.DataFrame( + { + "num": np.arange(n, dtype="int64"), + "cat_col": cat_values, + "target": np.linspace(0.0, 1.0, num=n), + } + ) + assert df["cat_col"].dtype.name == "category" + assert df["cat_col"].cat.categories.dtype == cat_dtype + + splits = {0: {0: (list(range(8)), [8, 9])}} + ut = UserTask(task_name=f"non-str-cat-{test_id}", task_cache_path=tmp_path) + # Must not raise + ut.create_local_openml_task(dataset=df, target_feature="target", problem_type="regression", splits=splits) + + # The pickle file must store the original data values unchanged. + with (ut._local_cache_path / "data.pkl.py3").open("rb") as fh: + stored, _, _ = pickle.load(fh) + pd.testing.assert_series_equal(stored["cat_col"].astype(str), df["cat_col"].astype(str), check_names=True)