Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions ManPage.md
Original file line number Diff line number Diff line change
Expand Up @@ -507,6 +507,23 @@ Required positionals: `<model>` then `<command>` and, for `datagen`/`run`/`confi
- **`--accelerator-type <type>`, `-at <type>`**
Accelerator the workload should emulate (e.g. `h100`, `b200`, `mi355`). Determines per-accelerator access patterns and data rates. Required for `datasize`, `run`, `configview`.

**Supported (model, accelerator) combinations**

| Model | a100 | h100 | b200 | mi355 |
|-----------|:------:|:------:|:----------:|:----------:|
| unet3d | whatif | — | **v3.0** | — |
| retinanet | — | — | **v3.0** | **v3.0** |
| cosmoflow | whatif | whatif | — | — |
| resnet50 | whatif | whatif | — | — |
| dlrm | — | — | whatif | whatif |
| flux | — | — | whatif | whatif |

- **v3.0** — submittable in CLOSED or OPEN.
- **whatif** — available only via `mlpstorage whatif …` for planning. Not submittable.
- **—** — no workload definition file. `mlpstorage` will fail with a "combination not supported" error pointing at the missing YAML.

Any (model, accelerator) combination not marked **v3.0** is available under `whatif` for planning purposes if a workload definition file is provided.

- **`--num-accelerators <N>`, `-na <N>`**
Number of simulated accelerators for `run`/`configview`. Ranks are distributed round-robin across `--hosts`.

Expand Down
55 changes: 54 additions & 1 deletion mlpstorage_py/benchmarks/dlio.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
LLM_ALLOWED_VALUES, LLM_SUBSET_PROCS, EXIT_CODE, MODELS, HYDRA_OUTPUT_SUBDIR,
LLM_SIZE_BY_RANK)
from mlpstorage_py.dependency_check import validate_benchmark_dependencies
from mlpstorage_py.errors import ConfigurationError, ErrorCode
from mlpstorage_py.rules import calculate_training_data_size, HostInfo, HostMemoryInfo, HostCPUInfo, ClusterInformation
from mlpstorage_py.utils import (read_config_from_file, create_nested_dict, update_nested_dict, generate_mpi_prefix_cmd)
from mlpstorage_py.storage_config import resolve_object_storage_config
Expand Down Expand Up @@ -324,6 +325,54 @@ def _strip_uri_scheme(value):
normalized = (parsed.netloc + parsed.path).rstrip('/')
return normalized or parsed.netloc

def _raise_unsupported_workload(self, workload_abs):
    """Report a missing DLIO workload YAML as an unsupported configuration.

    Intended to be called once the workload definition derived from the
    CLI arguments has been found to be absent. For training benchmarks the
    error names the ``--model`` / ``--accelerator-type`` pair and the
    suggestion lists the v3.0 submittable combinations. For any other
    benchmark type (e.g. checkpointing, where the YAML is named after the
    model alone) the error names only the model.

    Args:
        workload_abs: Path of the workload YAML that was looked up. It is
            echoed verbatim in the suggestion text.

    Raises:
        ConfigurationError: Always, with code
            ``ErrorCode.CONFIG_FILE_NOT_FOUND``.
    """
    # Either attribute may be absent on the args namespace; fall back to
    # None so the message can still be rendered.
    model = getattr(self.args, 'model', None)
    accel = getattr(self.args, 'accelerator_type', None)
    is_training = self.BENCHMARK_TYPE == BENCHMARK_TYPES.training

    if is_training:
        details = {
            "message": (
                f"The combination --model={model} --accelerator-type={accel} "
                "is not supported."
            ),
            "suggestion": "".join((
                f"Missing workload definition: {workload_abs}\n",
                " v3.0 submittable combinations (CLOSED or OPEN):\n",
                " --model unet3d --accelerator-type b200\n",
                " --model retinanet --accelerator-type b200\n",
                " --model retinanet --accelerator-type mi355\n",
                " Other (model, accelerator) pairs work under `whatif` if a ",
                "workload definition file exists for them; this combination ",
                "has none.",
            )),
            "parameter": "model+accelerator-type",
            "actual": f"{model} + {accel}",
        }
    else:
        details = {
            "message": f"The model --model={model} is not supported.",
            "suggestion": "".join((
                f"Missing workload definition: {workload_abs}\n",
                " Pass a --model value that has a matching ",
                "configs/dlio/workload/<model>.yaml file.",
            )),
            "parameter": "model",
            "actual": str(model),
        }

    raise ConfigurationError(**details, code=ErrorCode.CONFIG_FILE_NOT_FOUND)

def process_dlio_params(self, config_file):
params_dict = dict() if not self.args.params else {k: v for k, v in (item.split("=") for item in self.args.params)}

Expand All @@ -337,7 +386,11 @@ def process_dlio_params(self, config_file):
)
params_dict['storage.storage_root'] = normalized

yaml_params = read_config_from_file(os.path.join(self.DLIO_CONFIG_PATH, "workload", config_file))
workload_rel = os.path.join(self.DLIO_CONFIG_PATH, "workload", config_file)
workload_abs = os.path.join(CONFIGS_ROOT_DIR, workload_rel)
if not os.path.isfile(workload_abs):
self._raise_unsupported_workload(workload_abs)
yaml_params = read_config_from_file(workload_rel)
combined_params = update_nested_dict(yaml_params, create_nested_dict(params_dict))

self.logger.debug(f'yaml params: \n{pprint.pformat(yaml_params)}')
Expand Down
59 changes: 59 additions & 0 deletions mlpstorage_py/tests/test_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,5 +594,64 @@ def test_datasize_does_not_require_pymilvus(self, mock_logger):
# _validate_vdb_dependencies should NOT have been called
mock_dep.assert_not_called()

# =============================================================================
# Tests for the "unsupported (model, accelerator) combination" error path
# (DLIO workload YAML lookup)
# =============================================================================

class TestUnsupportedWorkloadCombination:
    """Tests for the missing-workload-YAML guard in ``process_dlio_params``.

    When the resolved configs/dlio/workload/<model>_<accel>.yaml does not
    exist, process_dlio_params must raise
    ConfigurationError(CONFIG_FILE_NOT_FOUND) with a specific "combination
    not supported" message rather than a generic FileNotFoundError.
    """

    # (model, accelerator) pairs the error's suggestion text advertises as
    # v3.0 submittable. Each one must appear in the rendered advisory.
    V3_SUBMITTABLE = (
        ('unet3d', 'b200'),
        ('retinanet', 'b200'),
        ('retinanet', 'mi355'),
    )

    def _make_stub_training_benchmark(self, model, accelerator_type, mock_logger):
        """Return a TrainingBenchmark with only ``args`` and ``logger`` set.

        The instance is allocated with ``__new__`` so that ``__init__`` is
        never run. Calling ``__new__`` directly does not invoke
        ``__init__``, so no patching of ``__init__`` is required.
        """
        from mlpstorage_py.benchmarks.dlio import TrainingBenchmark
        args = SimpleNamespace(
            model=model,
            accelerator_type=accelerator_type,
            command='datasize',
            params=None,
        )
        bench = TrainingBenchmark.__new__(TrainingBenchmark)
        bench.args = args
        bench.logger = mock_logger
        return bench

    def test_training_missing_yaml_raises_specific_error(self, mock_logger):
        """A missing workload YAML yields the dedicated 'not supported' error."""
        from mlpstorage_py.errors import ConfigurationError, ErrorCode
        bench = self._make_stub_training_benchmark('unet3d', 'mi355', mock_logger)
        with pytest.raises(ConfigurationError) as exc_info:
            bench.process_dlio_params('unet3d_mi355.yaml')
        err = exc_info.value
        assert err.code == ErrorCode.CONFIG_FILE_NOT_FOUND
        assert 'unet3d' in str(err)
        assert 'mi355' in str(err)
        assert 'not supported' in str(err)
        # v3.0 advisory must list the three submittable combinations.
        # Collapse whitespace first so the check does not depend on how the
        # advisory lines are indented or aligned.
        rendered = str(err) + "\n" + err.suggestion
        flattened = " ".join(rendered.split())
        for v3_model, v3_accel in self.V3_SUBMITTABLE:
            expected = f"--model {v3_model} --accelerator-type {v3_accel}"
            assert expected in flattened, (
                f"v3.0 advisory is missing the combination: {expected}"
            )

    def test_training_existing_yaml_does_not_raise_unsupported(self, mock_logger):
        """unet3d_b200.yaml DOES exist — process_dlio_params must get past the
        existence check (it may raise later for unrelated reasons since we
        don't fully wire the benchmark, but NOT with our specific error)."""
        from mlpstorage_py.errors import ConfigurationError, ErrorCode
        bench = self._make_stub_training_benchmark('unet3d', 'b200', mock_logger)
        try:
            bench.process_dlio_params('unet3d_b200.yaml')
        except ConfigurationError as e:
            # An AssertionError raised here is NOT swallowed by the sibling
            # ``except Exception`` clause below, so a firing guard still
            # fails the test.
            assert e.code != ErrorCode.CONFIG_FILE_NOT_FOUND, (
                f"unet3d_b200.yaml exists; the unsupported-combination guard "
                f"must not fire. Got: {e}"
            )
        except Exception:
            # Unrelated failures (e.g., logger/attribute access downstream)
            # are fine — the guard already passed. Deliberately best-effort:
            # the stub benchmark is not fully wired.
            pass


if __name__ == "__main__":
    # pytest.main() returns the run's exit status instead of exiting, so
    # propagate it; otherwise running this file directly would always exit
    # with status 0, even when tests fail.
    raise SystemExit(pytest.main([__file__, "-v"]))
22 changes: 20 additions & 2 deletions training/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -368,8 +368,26 @@ View Only:
Note: The `reportgen` script must be run in the launcher client host.

## Training Models
Currently, the storage benchmark suite supports benchmarking of 3 deep learning workloads
- Image segmentation using a Unet3D model

### Supported (model, accelerator) combinations

| Model | a100 | h100 | b200 | mi355 |
|-----------|:------:|:------:|:----------:|:----------:|
| unet3d | whatif | — | **v3.0** | — |
| retinanet | — | — | **v3.0** | **v3.0** |
| cosmoflow | whatif | whatif | — | — |
| resnet50 | whatif | whatif | — | — |
| dlrm | — | — | whatif | whatif |
| flux | — | — | whatif | whatif |

- **v3.0** — submittable in CLOSED or OPEN.
- **whatif** — available only via `mlpstorage whatif …` for planning. Not submittable.
- **—** — no workload definition file. `mlpstorage` will fail with a "combination not supported" error pointing at the missing YAML.

Any (model, accelerator) combination not marked **v3.0** is available under `whatif` for planning purposes if a workload definition file is provided.

The closed-submittable training models are Unet3D and RetinaNet:
- Image segmentation using a Unet3D model
- Object detection using a RetinaNet model

### unet3d
Expand Down
Loading