diff --git a/ManPage.md b/ManPage.md index 78ddd278..631de986 100644 --- a/ManPage.md +++ b/ManPage.md @@ -507,6 +507,23 @@ Required positionals: `` then `` and, for `datagen`/`run`/`confi - **`--accelerator-type `, `-at `** Accelerator the workload should emulate (e.g. `h100`, `b200`, `mi355`). Determines per-accelerator access patterns and data rates. Required for `datasize`, `run`, `configview`. + **Supported (model, accelerator) combinations** + + | Model | a100 | h100 | b200 | mi355 | + |-----------|:------:|:------:|:----------:|:----------:| + | unet3d | whatif | — | **v3.0** | — | + | retinanet | — | — | **v3.0** | **v3.0** | + | cosmoflow | whatif | whatif | — | — | + | resnet50 | whatif | whatif | — | — | + | dlrm | — | — | whatif | whatif | + | flux | — | — | whatif | whatif | + + - **v3.0** — submittable in CLOSED or OPEN. + - **whatif** — available only via `mlpstorage whatif …` for planning. Not submittable. + - **—** — no workload definition file. `mlpstorage` will fail with an error stating that the combination "is not supported" and pointing at the missing YAML. + + Any (model, accelerator) combination not marked **v3.0** is available under `whatif` for planning purposes only if a workload definition file exists for it. + - **`--num-accelerators `, `-na `** Number of simulated accelerators for `run`/`configview`. Ranks are distributed round-robin across `--hosts`. 
def _raise_unsupported_workload(self, workload_abs):
    """Raise ``ConfigurationError`` for a workload YAML that does not exist.

    Builds a user-facing "not supported" error from the CLI arguments found
    on ``self.args``. For training benchmarks the message names the
    (model, accelerator-type) pair and the suggestion lists the v3.0
    submittable combinations. For every other benchmark type only the model
    is named and the suggestion points at the expected ``<model>.yaml``
    file name.

    Args:
        workload_abs: Path of the workload definition file that was looked
            up and not found. It is echoed verbatim in the suggestion text.

    Raises:
        ConfigurationError: Always, with
            ``code=ErrorCode.CONFIG_FILE_NOT_FOUND``.
    """
    # getattr with a None default: a missing attribute on args must not
    # replace the intended ConfigurationError with an AttributeError.
    model = getattr(self.args, 'model', None)
    accel = getattr(self.args, 'accelerator_type', None)

    # Both branches start the suggestion with the path that was checked.
    missing_file = f"Missing workload definition: {workload_abs}\n"

    if self.BENCHMARK_TYPE == BENCHMARK_TYPES.training:
        message = (
            f"The combination --model={model} --accelerator-type={accel} "
            f"is not supported."
        )
        suggestion = missing_file + (
            " v3.0 submittable combinations (CLOSED or OPEN):\n"
            " --model unet3d --accelerator-type b200\n"
            " --model retinanet --accelerator-type b200\n"
            " --model retinanet --accelerator-type mi355\n"
            " Other (model, accelerator) pairs work under `whatif` if a "
            "workload definition file exists for them; this combination "
            "has none."
        )
        parameter = "model+accelerator-type"
        actual = f"{model} + {accel}"
    else:
        message = f"The model --model={model} is not supported."
        # Name the expected file pattern explicitly; the previous text
        # ("configs/dlio/workload/.yaml") gave the user nothing to match.
        suggestion = missing_file + (
            " Pass a --model value that has a matching "
            "configs/dlio/workload/<model>.yaml file."
        )
        parameter = "model"
        actual = str(model)

    raise ConfigurationError(
        message=message,
        parameter=parameter,
        actual=actual,
        suggestion=suggestion,
        code=ErrorCode.CONFIG_FILE_NOT_FOUND,
    )
# =============================================================================
# Tests for the "unsupported (model, accelerator) combination" error path
# (DLIO workload YAML lookup)
# =============================================================================

class TestUnsupportedWorkloadCombination:
    """Cover the missing-workload-YAML guard in ``process_dlio_params``.

    When the resolved ``configs/dlio/workload/<model>_<accelerator>.yaml``
    does not exist, ``process_dlio_params`` must raise
    ``ConfigurationError(CONFIG_FILE_NOT_FOUND)`` with a specific
    "not supported" message rather than a generic ``FileNotFoundError``.
    """

    # (model, accelerator) pairs the error's suggestion must advertise as
    # v3.0 submittable.
    V3_SUBMITTABLE = (
        ('unet3d', 'b200'),
        ('retinanet', 'b200'),
        ('retinanet', 'mi355'),
    )

    def _make_stub_training_benchmark(self, model, accelerator_type, mock_logger):
        """Return a ``TrainingBenchmark`` with only ``args``/``logger`` set.

        ``__new__`` allocates the instance without running ``__init__``, so
        no patching of ``__init__`` is required to avoid its side effects.
        """
        from mlpstorage_py.benchmarks.dlio import TrainingBenchmark
        bench = TrainingBenchmark.__new__(TrainingBenchmark)
        bench.args = SimpleNamespace(
            model=model,
            accelerator_type=accelerator_type,
            command='datasize',
            params=None,
        )
        bench.logger = mock_logger
        return bench

    def test_training_missing_yaml_raises_specific_error(self, mock_logger):
        """A missing YAML yields CONFIG_FILE_NOT_FOUND naming the pair."""
        from mlpstorage_py.errors import ConfigurationError, ErrorCode
        bench = self._make_stub_training_benchmark('unet3d', 'mi355', mock_logger)
        with pytest.raises(ConfigurationError) as exc_info:
            bench.process_dlio_params('unet3d_mi355.yaml')
        err = exc_info.value
        assert err.code == ErrorCode.CONFIG_FILE_NOT_FOUND

        message = str(err)
        assert 'unet3d' in message
        assert 'mi355' in message
        assert 'not supported' in message

        # v3.0 advisory must list the three submittable combinations.
        # Whitespace is normalized so indentation inside the suggestion text
        # does not affect the check.
        rendered = " ".join((message + "\n" + err.suggestion).split())
        for model, accel in self.V3_SUBMITTABLE:
            assert f"--model {model} --accelerator-type {accel}" in rendered

    def test_training_existing_yaml_does_not_raise_unsupported(self, mock_logger):
        """unet3d_b200.yaml DOES exist, so the guard must run and not fire.

        ``process_dlio_params`` may still raise later for unrelated reasons
        because the benchmark is only partially wired; those failures are
        tolerated. What is NOT tolerated is (a) the unsupported-combination
        error, or (b) never reaching the existence check at all, which would
        make this test pass vacuously.
        """
        import os
        from mlpstorage_py.errors import ConfigurationError, ErrorCode
        yaml_name = 'unet3d_b200.yaml'
        bench = self._make_stub_training_benchmark('unet3d', 'b200', mock_logger)

        # Spy on the existence check while keeping its real behavior.
        with patch('os.path.isfile', wraps=os.path.isfile) as isfile_spy:
            try:
                bench.process_dlio_params(yaml_name)
            except ConfigurationError as e:
                assert e.code != ErrorCode.CONFIG_FILE_NOT_FOUND, (
                    f"unet3d_b200.yaml exists; the unsupported-combination guard "
                    f"must not fire. Got: {e}"
                )
            except Exception:
                # Unrelated failures (e.g., logger/attribute access downstream)
                # are fine as long as the guard was reached; that is asserted
                # below.
                pass

        checked_paths = [str(args[0]) for args, _kwargs in isfile_spy.call_args_list if args]
        assert any(path.endswith(yaml_name) for path in checked_paths), (
            f"process_dlio_params never checked for {yaml_name}; it failed "
            f"before reaching the unsupported-combination guard."
        )


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
`mlpstorage` will fail with an error stating that the combination "is not supported" and pointing at the missing YAML. + +Any (model, accelerator) combination not marked **v3.0** is available under `whatif` for planning purposes only if a workload definition file exists for it. + +The v3.0 submittable training models (CLOSED or OPEN) are Unet3D and RetinaNet: +- Image segmentation using a Unet3D model - Image recognition using a RetinaNet model ### unet3d