49 commits
b78539e
Change bool handling in new preprocessing (#287)
LennartPurucker Apr 9, 2026
d204116
fix: correct mem estimate using X's features after transformation if …
LennartPurucker Apr 9, 2026
364fbef
Add "datasets" filtering option
Innixma Apr 9, 2026
d9f221d
fix: num_repeats None bug for grouped data
LennartPurucker Apr 9, 2026
9fd7344
fix: sanity check problem type when loading data
LennartPurucker Apr 9, 2026
f594fc8
fix: also remap feature_metadata_in
LennartPurucker Apr 9, 2026
0185042
fix: user task arff checker edge case, more clear error
LennartPurucker Apr 9, 2026
1617453
fix: minor changes and corrections for the fs_benchmark_runner
schaeferbasti Apr 10, 2026
9cd8bae
WIP fix: make data_foundry usable for now
schaeferbasti Apr 10, 2026
9712ff2
WIP fix: save FeatureSelectionResult to csv
schaeferbasti Apr 10, 2026
013c06f
fix: use usual constants for data_foundry_cache
schaeferbasti Apr 10, 2026
ccf751a
maint: save results in results folder
schaeferbasti Apr 10, 2026
7a8eb18
fix: renaming now on global level
LennartPurucker Apr 10, 2026
d0c688a
add: basic dtype flags for task metadata
LennartPurucker Apr 10, 2026
bf6eb04
maint: use args instead of download_data_foundry_dataset
schaeferbasti Apr 10, 2026
9f22c32
maint: use args instead of download_data_foundry_dataset
schaeferbasti Apr 10, 2026
dc5ad57
maint: change path for cluster (need to check)
schaeferbasti Apr 10, 2026
b53dd54
add: support for more dataset filters
LennartPurucker Apr 10, 2026
0185dde
add: class-level cache and dynamic batch size
LennartPurucker Apr 10, 2026
5c6e304
add: allow to only pass GPU for preprocessing
LennartPurucker Apr 10, 2026
48c2cd2
unify logging path for parallel benchmarks
LennartPurucker Apr 10, 2026
5ef21ed
add: script for generating script for extra benchmark (needs to be te…
schaeferbasti Apr 10, 2026
5754508
fix: allow missing classes in test split for grouped data
LennartPurucker Apr 10, 2026
b61453d
Add TabPrep configs
Innixma Apr 10, 2026
ebf772c
Add fast task_metadata loading for `EndToEnd.from_raw()`
Innixma Apr 10, 2026
4263865
Switch to cache-based text benchmarking (#289)
LennartPurucker Apr 10, 2026
a783c90
maint: better logging for output of text cache
LennartPurucker Apr 10, 2026
852457b
fix: ensure user tasks do not lose dtypes and go to pickle right away…
LennartPurucker Apr 10, 2026
3b4c19c
add: much more clever batching buckets
LennartPurucker Apr 10, 2026
53436a0
add: finalize text cache
LennartPurucker Apr 11, 2026
9df4307
maint: fix test for new default output size for stat text
LennartPurucker Apr 11, 2026
9ffa667
fix: adjust folds to number of groups
LennartPurucker Apr 11, 2026
3efafd1
fix: do not load model if not needed
LennartPurucker Apr 11, 2026
f15e156
fix: correct reasoning
LennartPurucker Apr 11, 2026
8f0b0d6
fix: dtype wrong for splits
LennartPurucker Apr 11, 2026
55539cd
add: filter future warnings from ray
LennartPurucker Apr 11, 2026
e731335
add: memory est overhead for large data
LennartPurucker Apr 11, 2026
77c9639
fix: better memory estimate for TabM and remove view usage
LennartPurucker Apr 11, 2026
5a8b483
fix: realmlp cat codes
LennartPurucker Apr 11, 2026
1c3d572
fix: OOD cat codes support and nan handling
LennartPurucker Apr 11, 2026
9d1d6d1
add: support for lazy loading of X,y in TabArena and avoid x3 copies
LennartPurucker Apr 11, 2026
be2f763
add: support disable retries
LennartPurucker Apr 11, 2026
63f9b16
maint: make MLPs torch abstract models and adapt batch size for large…
LennartPurucker Apr 11, 2026
4208c3c
maint: make TFMs torch abstract models
LennartPurucker Apr 11, 2026
0f36c2f
add: support for setting max predict batch size
LennartPurucker Apr 12, 2026
6ba9390
fix: add cache function and ignore cache argument
schaeferbasti Apr 13, 2026
77c1c7f
fix: simplify script & remove ExtraBenchmarkSetup2026
schaeferbasti Apr 13, 2026
5dc640b
maint: add num_classes and num_samples to FeatureSelectionResult for …
schaeferbasti Apr 13, 2026
4540d60
Merge branch 'autogluon:main' into fe_benchmark_main_val_pipeline
schaeferbasti Apr 13, 2026
@@ -1,27 +1,39 @@
"""Shared infrastructure and entry point for feature selection benchmark evaluation.

Usage:
python fs_benchmark_runner.py --mode validity --seed 42
python fs_benchmark_runner.py --mode stability --seed 42
python feature_selection_benchmark_runner.py \
--mode validity \
--method_name FSBench__RandomFeatureSelector__5__0__lgbm__3600 \
--data_foundry_task_id "UserTask|1386903908|anneal/019d3f7b-494a-71fa-8eb2-25d01dfb7792|/work/dlclarge1/purucker-fs_benchmark/.openml/tabarena_tasks" \
--repeat 0 \
--noise 1.0 \
--noise_type gaussian

python feature_selection_benchmark_runner.py \
--mode stability \
--method_name FSBench__RandomFeatureSelector__5__0__lgbm__3600 \
--data_foundry_task_id "UserTask|1386903908|anneal/019d3f7b-494a-71fa-8eb2-25d01dfb7792|/work/dlclarge1/purucker-fs_benchmark/.openml/tabarena_tasks" \
--repeat 0 \
--noise 1.0 \
--noise_type gaussian
"""

from __future__ import annotations

import argparse
import time
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any
from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd
from tabarena.benchmark.feature_selection_methods.feature_selection_benchmark_utils import (
selector_and_config_from_string,
)
from tabarena.benchmark.task.openml import OpenMLTaskWrapper
from tabflow_slurm.run_tabarena_experiment import _parse_task_id

if TYPE_CHECKING:
import pandas as pd


@dataclass
class FeatureSelectionResult:
@@ -36,6 +48,9 @@ class FeatureSelectionResult:
repeat: Repeat number for the FS metric.

selected_features: Selected features, given as indices into original_features (see the list[int] annotation below).
num_classes: Number of classes in the target variable (for classification tasks).
num_samples: Number of samples in the dataset.

elapsed_time_fs: Runtime measurement (seconds).

mode: Evaluation mode ("validity" or "stability").
@@ -50,6 +65,9 @@
repeat: int

selected_features: list[int]
num_classes: int
num_samples: int

elapsed_time_fs: float

mode: str
@@ -62,11 +80,11 @@ def _augment_dataset(
original_features = list(X.columns)

if mode == "validity":
from validity_fs_metric import get_dataset_for_validity
from validity_fs_metric import get_dataset_for_validity # noqa: PLC0415

X = get_dataset_for_validity(X=X, rng=rng, **kwargs)
elif mode == "stability":
from stability_fs_metric import get_dataset_for_stability
from stability_fs_metric import get_dataset_for_stability # noqa: PLC0415

X, y = get_dataset_for_stability(X=X, y=y, rng=rng, **kwargs)
else:
@@ -79,7 +97,7 @@
return X, y, original_features


def run_benchmark(
def run_benchmark( # noqa: D417
*,
data_foundry_task_id: str,
mode: str,
@@ -109,7 +127,7 @@ def run_benchmark(
feature_selector, config = selector_and_config_from_string(preprocessing_name=method_name)

# Augment dataset with new feature based on mode.
X, y, original_features = _augment_dataset(mode=mode, X=X, rng=rng, **kwargs)
X, y, original_features = _augment_dataset(mode=mode, X=X, y=y, rng=rng, **kwargs)

# Run Feature Selection
start_time = time.monotonic()
@@ -135,6 +153,13 @@
mode_kwargs=kwargs,
)


def get_cache_path(args: argparse.Namespace) -> Path:
"""Generate the cache path based on the parsed CLI arguments."""
cache_dir = Path(__file__).parent / "results"
cache_dir.mkdir(parents=True, exist_ok=True)  # Ensure the results directory exists
# The dataset name is the third "|"-separated field of the task id (e.g., "anneal");
# field [3] is the local task directory, whose leading "/" would yield an empty name.
dataset_name = args.data_foundry_task_id.split("|")[2].split("/")[0]
return cache_dir / f"{args.mode}_{args.method_name}_{dataset_name}_{args.repeat}.csv"
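# For illustration (assuming validity mode and the parse_args() defaults below),
# this yields a path like:
#   results/validity_FSBench__RandomFeatureSelector__5__0__lgbm__3600_anneal_0.csv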


def parse_args() -> argparse.Namespace:
"""Parse CLI arguments for the FS benchmark runner."""
@@ -149,14 +174,15 @@
parser.add_argument(
"--method_name",
type=str,
default="FSBench__AccuracyFeatureSelector__5__0__lgbm__3600",
default="FSBench__RandomFeatureSelector__5__0__lgbm__3600",
help="Feature Selection Method name [default: FSBench__AccuracyFeatureSelector__5__0__lgbm__3600]",
)
parser.add_argument(
"--data_foundry_task_id",
type=str,
default="anneal/019d3f7b-494a-71fa-8eb2-25d01dfb7792",
help="TabArena/OpenML task identifier [default: anneal/019d3f7b-494a-71fa-8eb2-25d01dfb7792]",
default="UserTask|1386903908|anneal/019d3f7b-494a-71fa-8eb2-25d01dfb7792|/Users/schaefer.bastian/.openml/tabarena_tasks",
help="TabArena/OpenML task metadata identifier [default: UserTask|1386903908|anneal/019d3f7b-494a-71fa-8eb2-25d"
"01dfb7792|/Users/schaefer.bastian/.openml/tabarena_tasks]",
)
parser.add_argument("--repeat", type=int, default=0, help="Repeat [default: 0]")

Expand All @@ -165,27 +191,41 @@ def parse_args() -> argparse.Namespace:
"--noise",
type=float,
default=1.0,
nargs="+",
help="Noise features relative to original count (validity mode only) [default: 1.0]",
)
parser.add_argument(
"--nose_type",
"--noise_type",
type=str,
choices=["gaussian", "uniform"],
default="gaussian",
help="Type of noise features to add (validity mode only) [default: random]",
)
parser.add_argument(
"--ignore_cache",
# argparse's type=bool treats any non-empty string as True, so use a flag instead.
action="store_true",
help="Ignore any existing cached result and rerun the benchmark (useful for debugging) [default: False]",
)
return parser.parse_args()


if __name__ == "__main__":
args = parse_args()

run_benchmark(
data_foundry_task_id=args.data_foundry_task_id,
mode=args.mode,
method_name=args.method_name,
repeat=args.repeat,
noise=args.noise,
nose_type=args.nose_type,
)
cache_path = get_cache_path(args)

if cache_path.exists() and not args.ignore_cache:
print(f"Cache exists at {cache_path}. Skipping operation.")
else:
result = run_benchmark(
data_foundry_task_id=args.data_foundry_task_id,
mode=args.mode,
method_name=args.method_name,
repeat=args.repeat,
noise=args.noise,
noise_type=args.noise_type,
)

print(result)
result_df = pd.DataFrame([result.__dict__])
result_df.to_csv(cache_path, index=False)
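
A minimal aggregation sketch (not part of this diff) for the per-run caches; it assumes only the layout established above, i.e. a "results" folder of one-row CSVs written via get_cache_path():

    from pathlib import Path

    import pandas as pd

    results_dir = Path("results")
    # Each cached run is a one-row CSV; stack them into a single results table.
    all_results = pd.concat(
        (pd.read_csv(p) for p in results_dir.glob("*.csv")), ignore_index=True
    )
    print(all_results[["mode", "repeat", "elapsed_time_fs"]].head())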
@@ -0,0 +1,81 @@
from pathlib import Path

import pandas as pd
import submitit
from experimental.feature_selection_benchmark.tabarena_setup.fr_cluster_setup import (
ALL_TASK_METADATA,
FS_TIME_LIMIT,
FSBenchmarkConfig,
)


def run_extra_pipeline(mode, method_name, task_id, noise, noise_type):
"""Function to run the extra pipeline."""
print(
f"Running extra pipeline with mode={mode}, method_name={method_name}, task_id={task_id}, noise={noise}, noise_type={noise_type}")
# Add your pipeline logic here


if __name__ == "__main__":
method_names = FSBenchmarkConfig().get_default_preprocessing_configs(
fs_methods=[
"AccuracyFeatureSelector",
"RandomFeatureSelector",
"ANOVAFeatureSelector",
"CFSFeatureSelector",
"Chi2FeatureSelector",
"DISRFeatureSelector",
"GainRatioFeatureSelector",
"GiniFeatureSelector",
"ImpurityFeatureSelector",
"InformationGainFeatureSelector",
"INTERACTFeatureSelector",
"MarkovBlanketFeatureSelector",
"MIFeatureSelector",
"mRMRFeatureSelector",
"PearsonCorrelationFeatureSelector",
"ReliefFFeatureSelector",
"RFImportanceFeatureSelector",
"SequentialBackwardEliminationFeatureSelector",
"SequentialForwardSelectionFeatureSelector",
"SymmetricalUncertaintyFeatureSelector",
"LassoFeatureSelector", # just for regression but with label encoder for classification?
"LaplacianScoreFeatureSelector", # OOM, Segmentation fault issues
"ConsistencyFeatureSelector",
# selected_indices = np.where(S)[0].tolist(), UnboundLocalError: cannot access local variable 'S' where it is not associated with a value
"JMIFeatureSelector",
# time limit computed incorrectly, and error at remaining.remove(best_idx), ValueError: list.remove(x): x not in list
"OneRFeatureSelector",
# major OOM errors (tries to allocate one major array), wrong time limit computation, max(accuracies, key=accuracies.get) -> max() iterable argument is empty
"ElasticNetFeatureSelector", # Only for classification
"CMIMFeatureSelector", # problems with time limit and fallback of features
# "tTestFeatureSelector", # Does not work for regression
"CARTFeatureSelector", # Only implemented for classification, OOM problems as well
]
)
task_ids = pd.read_csv(ALL_TASK_METADATA)["task_id_str"].tolist()
# Define the parameter grid
modes = ["validity", "stability"]
noises = [0.5, 0.75, 1.0]
noise_types = ["gaussian"]

# Create a SLURM executor
executor = submitit.AutoExecutor(folder=Path("slurm_logs"))
executor.update_parameters(
timeout_min=FS_TIME_LIMIT, # Job timeout in minutes
slurm_partition="default", # SLURM partition
cpus_per_task=8, # Number of CPUs per task
mem_gb=32, # Memory in GB
)

# Submit jobs
with executor.batch():
for mode in modes:
for method_name in method_names:
for task_id in task_ids:
if mode == "validity":
for noise in noises:
for noise_type in noise_types:
executor.submit(run_extra_pipeline, mode, method_name, task_id, noise, noise_type)
else:
# Stability mode does not sweep noise settings; submit once with the runner's defaults
# (otherwise `noise`/`noise_type` would be stale loop variables from the validity branch).
executor.submit(run_extra_pipeline, mode, method_name, task_id, 1.0, "gaussian")
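
Note that executor.batch() groups every submission inside the context manager into a single SLURM job array. One possible body for the run_extra_pipeline stub above (an untested sketch; the runner filename and working directory are assumptions) shells out to the runner CLI documented in the first file of this diff:

    import subprocess
    import sys

    def run_extra_pipeline(mode, method_name, task_id, noise, noise_type):
        """Invoke the FS benchmark runner CLI for one configuration."""
        subprocess.run(
            [
                sys.executable,
                "feature_selection_benchmark_runner.py",  # assumed path to the runner
                "--mode", mode,
                "--method_name", method_name,
                "--data_foundry_task_id", task_id,
                "--noise", str(noise),
                "--noise_type", noise_type,
            ],
            check=True,  # fail the SLURM job if the runner exits non-zero
        )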
49 changes: 43 additions & 6 deletions tabarena/tabarena/benchmark/experiment/experiment_runner.py
@@ -82,11 +82,19 @@ def __init__(
self.eval_metric: Scorer = get_metric(metric=self.eval_metric_name, problem_type=self.task.problem_type)
self.model: AbstractExecModel | None = None
self.task_split_idx = self.task.get_split_idx(fold=self.fold, repeat=self.repeat, sample=self.sample)
self.X, self.y, self.X_test, self.y_test = self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample)

if self.task.lazy_load_data:
assert input_format == "openml", "Lazy load data only works with input_format='openml'"
self.X, self.y, self.X_test, self.y_test = None, None, None, None
_, y, _, _ = self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample)
else:
self.X, self.y, self.X_test, self.y_test = self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample)
y = self.y

if input_format == "csv":
self.X = self.task.to_csv_format(X=self.X)
self.X_test = self.task.to_csv_format(X=self.X_test)
self.label_cleaner = LabelCleaner.construct(problem_type=self.task.problem_type, y=self.y)
self.label_cleaner = LabelCleaner.construct(problem_type=self.task.problem_type, y=y)
if cacher is None:
cacher = CacheFunctionDummy()
self.cacher = cacher
@@ -105,8 +113,19 @@ def split_seed(self):
"""We use the split index as a source for a seed that creates different randomness per split."""
return self.task_split_idx

def _lazy_load_for_run_model_fit(self):
X, y, X_test, _ = self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample)
return X, y, X_test


def run_model_fit(self) -> dict:
return self.model.fit_custom(X=self.X, y=self.y, X_test=self.X_test, split_seed=self.split_seed)
if self.task.lazy_load_data:
lazy_load_function = self._lazy_load_for_run_model_fit
X, y, X_test = None, None, None
else:
lazy_load_function = None
X, y, X_test = self.X, self.y, self.X_test
return self.model.fit_custom(X=X, y=y, X_test=X_test, split_seed=self.split_seed, lazy_load_function=lazy_load_function)

def run(self) -> dict:
out = self._run()
@@ -157,11 +176,19 @@ def _run(self) -> dict:
self.handle_failure(exc=exc)
raise
out = self.post_fit(out=out)

if self.task.lazy_load_data:
_, _, _, y_test = self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample)
else:
y_test = self.y_test
out["metric_error"] = self.evaluate(
y_true=self.y_test,
y_true=y_test,
y_pred=out["predictions"],
y_pred_proba=out["probabilities"],
)
if self.task.lazy_load_data:
del y_test

out = self.post_evaluate(out=out)
out["experiment_metadata"] = self._experiment_metadata(time_start=time_start, time_start_str=time_start_str)
out = self.convert_to_output(out=out)
@@ -269,7 +296,12 @@ def post_evaluate(self, out: dict) -> dict:
simulation_artifact["pred_proba_dict_test"] = self.label_cleaner.transform_proba(out["probabilities"], as_pandas=True)
if self.task.problem_type == "binary":
simulation_artifact["pred_proba_dict_test"] = simulation_artifact["pred_proba_dict_test"].iloc[:, 1]
simulation_artifact["y_test"] = self.label_cleaner.transform(self.y_test)

if self.task.lazy_load_data:
_, _, _, y_test = self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample)
else:
y_test = self.y_test
simulation_artifact["y_test"] = self.label_cleaner.transform(y_test)

if self.optimize_simulation_artifacts_memory:
# optimize memory
@@ -296,7 +328,12 @@ def post_evaluate(self, out: dict) -> dict:
simulation_artifact["metric"] = self.eval_metric_name

if self.compute_bag_info and (self.model.can_get_per_child_oof and self.model.can_get_per_child_val_idx):
simulation_artifact["bag_info"] = self.model.bag_artifact(X_test=self.X_test)
if self.task.lazy_load_data:
_, _, X_test, _ = self.task.get_train_test_split(fold=self.fold, repeat=self.repeat, sample=self.sample)
else:
X_test = self.X_test

simulation_artifact["bag_info"] = self.model.bag_artifact(X_test=X_test)


simulation_artifact["pred_proba_dict_val"] = {self.method: simulation_artifact["pred_proba_dict_val"]}
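
For context, a minimal sketch (an assumption about the model side, not code from this diff) of how an AbstractExecModel subclass might honor the lazy_load_function contract passed from run_model_fit() above, so the splits are only materialized inside the fit call:

    class LazyAwareModel:
        def fit_custom(self, X, y, X_test, split_seed, lazy_load_function=None, **kwargs):
            if lazy_load_function is not None:
                # Defer loading: the callable returns (X, y, X_test) on demand,
                # so the ExperimentRunner never holds extra copies of the splits.
                X, y, X_test = lazy_load_function()
            # ... fit on X/y and predict on X_test as usual ...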