Skip to content

Commit 4cd2f80

Browse files
r4victor and jvstme authored
Drop AWS P3 support and use DLAMI for all AWS GPU instances (#3903)
* Discover and use the latest AWS Ubuntu 22.04 DLAMI

  Automatically use the latest build, so that customers can benefit from the latest security patches without setting `os_images` or waiting for the dstack team to update the default AMI. Still, only use Ubuntu 22.04 to avoid breaking changes.

  Implementation: fetch the details of all Ubuntu 22.04 DLAMI builds (119 as of today) and pick the most recent one at runtime.

* Use DLAMI for all AWS GPU instances

* Drop p3 mentions

---------

Co-authored-by: Jvst Me <git@jvst.me>
1 parent 60d212a commit 4cd2f80

6 files changed

Lines changed: 23 additions & 38 deletions

File tree

src/dstack/_internal/core/backends/aws/compute.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1244,7 +1244,6 @@ def _supported_instances(offer: InstanceOffer) -> bool:
12441244
"p5e.",
12451245
"p4d.",
12461246
"p4de.",
1247-
"p3.",
12481247
"g7e.",
12491248
"g6.",
12501249
"g6e.",

src/dstack/_internal/core/backends/aws/resources.py

Lines changed: 5 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@
66

77
import dstack.version as version
88
from dstack._internal.core.backends.aws.models import AWSOSImageConfig
9-
from dstack._internal.core.backends.base.compute import requires_nvidia_proprietary_kernel_modules
10-
from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
119
from dstack._internal.core.errors import BackendError, ComputeError, ComputeResourceNotFoundError
1210
from dstack._internal.utils.logging import get_logger
1311

@@ -31,17 +29,15 @@ def get_image_id_and_username(
3129
image_name = image.name
3230
image_owner = image.owner
3331
username = image.user
34-
elif _supported_by_dlami(instance_type):
32+
elif gpu_name is not None:
33+
# AWS Deep Learning AMIs (DLAMI) support all GPU instance types currently supported by dstack.
34+
# dstack's cuda AMI is still built but not used.
35+
# It may be used again in case some instance types are not supported by DLAMI.
3536
image_name = "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04) *"
3637
image_owner = DLAMI_OWNER_ACCOUNT_ID
3738
username = "ubuntu"
3839
else:
39-
if gpu_name is None:
40-
image_name = f"dstack-{version.base_image}"
41-
elif not requires_nvidia_proprietary_kernel_modules(gpu_name):
42-
image_name = f"dstack-cuda-{version.base_image}"
43-
else:
44-
image_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
40+
image_name = f"dstack-{version.base_image}"
4541
image_owner = DSTACK_ACCOUNT_ID
4642
username = "ubuntu"
4743
response = ec2_client.describe_images(
@@ -636,25 +632,6 @@ def _is_private_subnet_with_internet_egress(
636632
return False
637633

638634

639-
def _supported_by_dlami(instance_type: str) -> bool:
640-
# Currently only p3. instances are not supported by DLAMI among GPU instances.
641-
return any(
642-
instance_type.startswith(family)
643-
for family in [
644-
"g4dn.",
645-
"g5.",
646-
"g6.",
647-
"gr6.",
648-
"g6e.",
649-
"p4d.",
650-
"p4de.",
651-
"p5.",
652-
"p5e.",
653-
"p6-b200.",
654-
]
655-
)
656-
657-
658635
def get_reservation(
659636
ec2_client: botocore.client.BaseClient,
660637
reservation_id: str,

src/dstack/_internal/core/models/fleets.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ class BackendFleetConfiguraionProps(CoreModel):
261261
instance_types: Annotated[
262262
Optional[List[str]],
263263
Field(
264-
description="The cloud-specific instance types to consider for provisioning (e.g., `[p3.8xlarge, n1-standard-4]`)"
264+
description="The cloud-specific instance types to consider for provisioning (e.g., `[g6e.24xlarge, n1-standard-4]`)"
265265
),
266266
] = None
267267
spot_policy: Annotated[

src/dstack/_internal/core/models/profiles.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ class ProfileParams(CoreModel):
257257
instance_types: Annotated[
258258
Optional[List[str]],
259259
Field(
260-
description="The cloud-specific instance types to consider for provisioning (e.g., `[p3.8xlarge, n1-standard-4]`)"
260+
description="The cloud-specific instance types to consider for provisioning (e.g., `[g6e.24xlarge, n1-standard-4]`)"
261261
),
262262
] = None
263263
reservation: Annotated[

src/tests/_internal/core/backends/aws/test_resources.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -150,14 +150,23 @@ def test_raises_resource_not_found_if_none_available(
150150
assert "image 'dstack-0.0' not found" in caplog.text
151151

152152
@pytest.mark.parametrize(
153-
["cuda", "expected"],
153+
["cuda", "expected_name", "expected_owner"],
154154
[
155-
[False, "dstack-0.0"],
156-
[True, "dstack-cuda-0.0"],
155+
[False, "dstack-0.0", "142421590066"],
156+
[
157+
True,
158+
"Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04) *",
159+
"898082745236",
160+
],
157161
],
158162
)
159-
def test_uses_dstack_image_name_and_account_id_if_image_config_not_provided(
160-
self, monkeypatch: pytest.MonkeyPatch, ec2_client_mock: Mock, cuda: bool, expected: str
163+
def test_uses_default_image_name_and_account_id_if_image_config_not_provided(
164+
self,
165+
monkeypatch: pytest.MonkeyPatch,
166+
ec2_client_mock: Mock,
167+
cuda: bool,
168+
expected_name: str,
169+
expected_owner: str,
161170
):
162171
monkeypatch.setattr("dstack.version.base_image", "0.0")
163172
_, username = get_image_id_and_username(
@@ -167,7 +176,7 @@ def test_uses_dstack_image_name_and_account_id_if_image_config_not_provided(
167176
)
168177
assert username == "ubuntu"
169178
ec2_client_mock.describe_images.assert_called_once_with(
170-
Filters=[{"Name": "name", "Values": [expected]}], Owners=["142421590066"]
179+
Filters=[{"Name": "name", "Values": [expected_name]}], Owners=[expected_owner]
171180
)
172181

173182
@pytest.mark.parametrize(

src/tests/_internal/server/routers/test_fleets.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1534,7 +1534,7 @@ async def test_errors_if_ssh_key_is_bad(
15341534
[
15351535
pytest.param("backends", [BackendType.AWS], id="backends"),
15361536
pytest.param("regions", ["eu-west-1"], id="regions"),
1537-
pytest.param("instance_types", ["p3.8xlarge"], id="instance_types"),
1537+
pytest.param("instance_types", ["g6e.24xlarge"], id="instance_types"),
15381538
pytest.param("idle_duration", 60, id="idle_duration"),
15391539
pytest.param("tags", {}, id="tags"), # falsy value
15401540
],

0 commit comments

Comments
 (0)