mlcommons · FileSystemGuy · Jun 27, 2026
@@ -507,6 +507,23 @@ Required positionals: `<model>` then `<command>` and, for `datagen`/`run`/`confi
 - **`--accelerator-type <type>`, `-at <type>`**
   Accelerator the workload should emulate (e.g. `h100`, `b200`, `mi355`). Determines per-accelerator access patterns and data rates. Required for `datasize`, `run`, `configview`.
 
+  **Supported (model, accelerator) combinations**
+
+  | Model     | a100   | h100   | b200       | mi355      |
+  |-----------|:------:|:------:|:----------:|:----------:|
+  | unet3d    | whatif | —      | **v3.0**   | —          |
+  | retinanet | —      | —      | **v3.0**   | **v3.0**   |
+  | cosmoflow | whatif | whatif | —          | —          |
+  | resnet50  | whatif | whatif | —          | —          |
+  | dlrm      | —      | —      | whatif     | whatif     |
+  | flux      | —      | —      | whatif     | whatif     |
+
+  - **v3.0** — submittable in CLOSED or OPEN.
+  - **whatif** — available only via `mlpstorage whatif …` for planning. Not submittable.
+  - **—** — no workload definition file exists. `mlpstorage` fails with a "The combination … is not supported" error that names the missing workload YAML.
+
+  Any (model, accelerator) combination not marked **v3.0** can be used under `whatif` for planning purposes, but only if a workload definition file (`<model>_<accelerator>.yaml`) exists for it; combinations marked **—** have none.
+
 - **`--num-accelerators <N>`, `-na <N>`**
   Number of simulated accelerators for `run`/`configview`. Ranks are distributed round-robin across `--hosts`.
 

@@ -10,6 +10,7 @@
                                LLM_ALLOWED_VALUES, LLM_SUBSET_PROCS, EXIT_CODE, MODELS, HYDRA_OUTPUT_SUBDIR,
                                LLM_SIZE_BY_RANK)
 from mlpstorage_py.dependency_check import validate_benchmark_dependencies
+from mlpstorage_py.errors import ConfigurationError, ErrorCode
 from mlpstorage_py.rules import calculate_training_data_size, HostInfo, HostMemoryInfo, HostCPUInfo, ClusterInformation
 from mlpstorage_py.utils import (read_config_from_file, create_nested_dict, update_nested_dict, generate_mpi_prefix_cmd)
 from mlpstorage_py.storage_config import resolve_object_storage_config
@@ -324,6 +325,54 @@ def _strip_uri_scheme(value):
         normalized = (parsed.netloc + parsed.path).rstrip('/')
         return normalized or parsed.netloc
 
+    def _raise_unsupported_workload(self, workload_abs):
+        """Raise ConfigurationError when the resolved workload YAML does not exist.
+
+        The DLIO workload YAML name is derived from CLI args
+        (``<model>_<accelerator>.yaml`` for training, ``<model>.yaml`` for
+        checkpointing). When the file is absent the user has chosen a
+        combination we have no workload definition for — surface this with
+        an explicit "not supported" message and (for training) point at
+        the v3.0 submittable combinations.
+        """
+        model = getattr(self.args, 'model', None)
+        accel = getattr(self.args, 'accelerator_type', None)
+
+        if self.BENCHMARK_TYPE == BENCHMARK_TYPES.training:
+            message = (
+                f"The combination --model={model} --accelerator-type={accel} "
+                f"is not supported."
+            )
+            suggestion = (
+                f"Missing workload definition: {workload_abs}\n"
+                "  v3.0 submittable combinations (CLOSED or OPEN):\n"
+                "    --model unet3d    --accelerator-type b200\n"
+                "    --model retinanet --accelerator-type b200\n"
+                "    --model retinanet --accelerator-type mi355\n"
+                "  Other (model, accelerator) pairs work under `whatif` if a "
+                "workload definition file exists for them; this combination "
+                "has none."
+            )
+            parameter = "model+accelerator-type"
+            actual = f"{model} + {accel}"
+        else:
+            message = f"The model --model={model} is not supported."
+            suggestion = (
+                f"Missing workload definition: {workload_abs}\n"
+                "  Pass a --model value that has a matching "
+                "configs/dlio/workload/<model>.yaml file."
+            )
+            parameter = "model"
+            actual = str(model)
+
+        raise ConfigurationError(
+            message=message,
+            parameter=parameter,
+            actual=actual,
+            suggestion=suggestion,
+            code=ErrorCode.CONFIG_FILE_NOT_FOUND,
+        )
+
     def process_dlio_params(self, config_file):
         params_dict = dict() if not self.args.params else {k: v for k, v in (item.split("=") for item in self.args.params)}
 
@@ -337,7 +386,11 @@ def process_dlio_params(self, config_file):
                 )
                 params_dict['storage.storage_root'] = normalized
 
-        yaml_params = read_config_from_file(os.path.join(self.DLIO_CONFIG_PATH, "workload", config_file))
+        workload_rel = os.path.join(self.DLIO_CONFIG_PATH, "workload", config_file)
+        workload_abs = os.path.join(CONFIGS_ROOT_DIR, workload_rel)
+        if not os.path.isfile(workload_abs):
+            self._raise_unsupported_workload(workload_abs)
+        yaml_params = read_config_from_file(workload_rel)
         combined_params = update_nested_dict(yaml_params, create_nested_dict(params_dict))
 
         self.logger.debug(f'yaml params: \n{pprint.pformat(yaml_params)}')

@@ -594,5 +594,64 @@ def test_datasize_does_not_require_pymilvus(self, mock_logger):
                 # _validate_vdb_dependencies should NOT have been called
                 mock_dep.assert_not_called()
 
+# =============================================================================
+# Tests for the "unsupported (model, accelerator) combination" error path
+# (DLIO workload YAML lookup)
+# =============================================================================
+
+class TestUnsupportedWorkloadCombination:
+    """When the resolved configs/dlio/workload/<model>_<accel>.yaml does not
+    exist, process_dlio_params must raise ConfigurationError(CONFIG_FILE_NOT_FOUND)
+    with a specific "combination not supported" message rather than a generic
+    FileNotFoundError."""
+
+    def _make_stub_training_benchmark(self, model, accelerator_type, mock_logger):
+        from mlpstorage_py.benchmarks.dlio import TrainingBenchmark
+        args = SimpleNamespace(
+            model=model,
+            accelerator_type=accelerator_type,
+            command='datasize',
+            params=None,
+        )
+        with patch.object(TrainingBenchmark, '__init__', lambda *a, **kw: None):
+            bench = TrainingBenchmark.__new__(TrainingBenchmark)
+            bench.args = args
+            bench.logger = mock_logger
+            return bench
+
+    def test_training_missing_yaml_raises_specific_error(self, mock_logger):
+        from mlpstorage_py.errors import ConfigurationError, ErrorCode
+        bench = self._make_stub_training_benchmark('unet3d', 'mi355', mock_logger)
+        with pytest.raises(ConfigurationError) as exc_info:
+            bench.process_dlio_params('unet3d_mi355.yaml')
+        err = exc_info.value
+        assert err.code == ErrorCode.CONFIG_FILE_NOT_FOUND
+        assert 'unet3d' in str(err)
+        assert 'mi355' in str(err)
+        assert 'not supported' in str(err)
+        # The v3.0 advisory must be present; spot-check that it names a submittable combination.
+        rendered = str(err) + "\n" + err.suggestion
+        assert 'retinanet' in rendered
+        assert 'b200' in rendered
+
+    def test_training_existing_yaml_does_not_raise_unsupported(self, mock_logger):
+        """unet3d_b200.yaml DOES exist — process_dlio_params must get past the
+        existence check (it may raise later for unrelated reasons since we
+        don't fully wire the benchmark, but NOT with our specific error)."""
+        from mlpstorage_py.errors import ConfigurationError, ErrorCode
+        bench = self._make_stub_training_benchmark('unet3d', 'b200', mock_logger)
+        try:
+            bench.process_dlio_params('unet3d_b200.yaml')
+        except ConfigurationError as e:
+            assert e.code != ErrorCode.CONFIG_FILE_NOT_FOUND, (
+                f"unet3d_b200.yaml exists; the unsupported-combination guard "
+                f"must not fire. Got: {e}"
+            )
+        except Exception:
+            # Unrelated failures (e.g., logger/attribute access downstream)
+            # are fine — the guard already passed.
+            pass
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])
@@ -368,8 +368,26 @@ View Only:
 Note: The `reportgen` script must be run on the launcher client host.
 
 ## Training Models
-Currently, the storage benchmark suite supports benchmarking of 3 deep learning workloads
-- Image classification using a Unet3D model 
+
+### Supported (model, accelerator) combinations
+
+| Model     | a100   | h100   | b200       | mi355      |
+|-----------|:------:|:------:|:----------:|:----------:|
+| unet3d    | whatif | —      | **v3.0**   | —          |
+| retinanet | —      | —      | **v3.0**   | **v3.0**   |
+| cosmoflow | whatif | whatif | —          | —          |
+| resnet50  | whatif | whatif | —          | —          |
+| dlrm      | —      | —      | whatif     | whatif     |
+| flux      | —      | —      | whatif     | whatif     |
+
+- **v3.0** — submittable in CLOSED or OPEN.
+- **whatif** — available only via `mlpstorage whatif …` for planning. Not submittable.
+- **—** — no workload definition file exists. `mlpstorage` fails with a "The combination … is not supported" error that names the missing workload YAML.
+
+Any (model, accelerator) combination not marked **v3.0** can be used under `whatif` for planning purposes, but only if a workload definition file (`<model>_<accelerator>.yaml`) exists for it; combinations marked **—** have none.
+
+The v3.0 submittable training models (CLOSED or OPEN) are Unet3D and RetinaNet:
+- Image segmentation using a Unet3D model
 - Image recognition using a RetinaNet model
 
 ### unet3d