diff --git a/cmake/FbossImageDistroCliTests.cmake b/cmake/FbossImageDistroCliTests.cmake index 855999bcc1152..8efe8ff0b1b56 100644 --- a/cmake/FbossImageDistroCliTests.cmake +++ b/cmake/FbossImageDistroCliTests.cmake @@ -17,7 +17,6 @@ find_package(Python3 3.10 COMPONENTS Interpreter REQUIRED) message(STATUS "Using Python ${Python3_VERSION} (${Python3_EXECUTABLE}) for distro_cli tests") include(FBPythonBinary) - file(GLOB DISTRO_CLI_TEST_SOURCES "fboss-image/distro_cli/tests/*_test.py" ) @@ -86,6 +85,19 @@ add_custom_command( COMMENT "Copying test data files for distro_cli_tests" ) +# Copy scripts directory used in unit tests +set(SCRIPTS_DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}/distro_cli_tests/distro_cli/scripts") +set(SCRIPTS_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/fboss-image/distro_cli/scripts") + +add_custom_command( + TARGET distro_cli_tests.GEN_PY_EXE + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_directory + "${SCRIPTS_SOURCE_DIR}" + "${SCRIPTS_DEST_DIR}" + COMMENT "Copying scripts for distro_cli_tests" +) + install_fb_python_executable(distro_cli_tests) # Restore the original Python3_EXECUTABLE if it was set diff --git a/fboss-image/distro_cli/cmds/device.py b/fboss-image/distro_cli/cmds/device.py index 4cc084d4ae97d..536a2f93ed229 100644 --- a/fboss-image/distro_cli/cmds/device.py +++ b/fboss-image/distro_cli/cmds/device.py @@ -7,13 +7,32 @@ """Device command implementation.""" +import json import logging +import os +import sys +from pathlib import Path -from lib.cli import validate_path +from distro_cli.lib.cli import validate_path +from distro_cli.lib.device_update import DeviceUpdateError, DeviceUpdater +from distro_cli.lib.distro_infra import ( + DISTRO_INFRA_CONTAINER, + GETIP_SCRIPT_CONTAINER_PATH, + deploy_image_to_device, + get_interface_name, +) +from distro_cli.lib.docker import container +from distro_cli.lib.exceptions import DistroInfraError +from distro_cli.lib.manifest import ImageManifest logger = logging.getLogger(__name__) +def 
print_to_console(message: str) -> None: + """Print message to console""" + print(message) # noqa: T201 + + def image_upstream_command(args): """Download full image from upstream repository and set it to be loaded onto device""" logger.info(f"Setting upstream image for device {args.mac}") @@ -21,15 +40,57 @@ def image_upstream_command(args): def image_command(args): - """Set device image from file""" + """Set device image from file and configure PXE boot""" logger.info(f"Setting image for device {args.mac}: {args.image_path}") - logger.info("Device image command (stub)") + + try: + deploy_image_to_device(args.mac, args.image_path) + logger.info( + f"Successfully configured device {args.mac} with image {args.image_path}" + ) + logger.info("Device is ready for PXE boot") + + except DistroInfraError as e: + logger.error(f"Failed to configure device: {e}") + sys.exit(1) + except Exception as e: + logger.error(f"Unexpected error: {e}") + sys.exit(1) def reprovision_command(args): """Reprovision device""" - logger.info(f"Reprovisioning device {args.mac}") - logger.info("Device reprovision command (stub)") + ip_address = get_device_ip(args.mac) + + if not ip_address: + logger.error("No IP address found for device") + return + + # devpart -> /dev/nvme0n1p3 + # dev -> /dev/nvme0n3 + # part -> 3 + cmd = r""" + if [ ! -d /opt/fboss ]; then echo "Not an FBOSS device. 
Aborting"; exit 1; fi; \ + rm -rf /boot/efi/EFI/*; + root_devpart=$(mount | awk '/\/ type/ { print $1 }'); + root_dev=$(mount | awk -F 'p' '/\/ type/ { print $1 }'); + root_part=$(mount | awk -F '[[:space:]p]' '/\/ type/ { print $2 }'); + dd if=/dev/zero of=${root_devpart} bs=1M count=50; + (sleep 1; echo yes; sleep 1; echo ignore) | parted ---pretend-input-tty ${root_dev} rm ${root_part}; + reboot --force + """ + os.execvp( + "ssh", + [ + "ssh", + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/dev/null", + f"root@{ip_address}", + cmd, + ], + ) def update_command(args): @@ -37,19 +98,116 @@ def update_command(args): logger.info(f"Updating device {args.mac}") logger.info(f"Manifest: {args.manifest}") logger.info(f"Components: {' '.join(args.components)}") - logger.info("Device update command (stub)") + + manifest = ImageManifest(Path(args.manifest)) + + # Get device IP once for all components + device_ip = get_device_ip(args.mac) + if not device_ip: + logger.error("Cannot update: device IP not found") + sys.exit(1) + + for component in args.components: + try: + updater = DeviceUpdater( + mac=args.mac, + manifest=manifest, + component=component, + device_ip=device_ip, + ) + updater.update() + logger.info(f"Successfully updated {component}") + except DeviceUpdateError as e: + logger.error(f"Failed to update {component}: {e}") + sys.exit(1) + + +def get_device_ip(mac: str) -> str | None: + """Get device IP address by querying the distro-infra container. 
+ + Args: + mac: Device MAC address + + Returns: + IP address string (IPv4 preferred, IPv6 fallback), or None if not found + """ + if not container.container_is_running(DISTRO_INFRA_CONTAINER): + logger.error(f"Container '{DISTRO_INFRA_CONTAINER}' is not running") + logger.error("Please start the distro-infra container first") + return None + + try: + interface = get_interface_name() + except DistroInfraError as e: + logger.error(f"Failed to get interface name: {e}") + return None + + cmd = [GETIP_SCRIPT_CONTAINER_PATH, mac, interface] + + # Execute in container + exit_code, stdout, stderr = container.exec_in_container(DISTRO_INFRA_CONTAINER, cmd) + + if exit_code != 0: + logger.error(f"getip.sh failed with exit code {exit_code}") + if stderr: + logger.error(f"stderr: {stderr}") + if stdout: + logger.error(f"stdout: {stdout}") + return None + + try: + result = json.loads(stdout) + + if "error_code" in result: + logger.error(f"Error: {result.get('error', 'Unknown error')}") + logger.error(f"Error code: {result['error_code']}") + return None + + ipv4 = result.get("ipv4") + ipv6 = result.get("ipv6") + + return ipv4 if ipv4 else ipv6 + + except json.JSONDecodeError as e: + logger.error(f"Failed to parse JSON output: {e}") + logger.error(f"Output was: {stdout}") + return None def getip_command(args): """Get device IP address""" logger.info(f"Getting IP for device {args.mac}") - logger.info("Device getip command (stub)") + + ip_address = get_device_ip(args.mac) + + if ip_address: + print_to_console(ip_address) + else: + logger.error("No IP address found in response") def ssh_command(args): """SSH to device""" logger.info(f"SSH to device {args.mac}") - logger.info("Device ssh command (stub)") + + ip_address = get_device_ip(args.mac) + + if not ip_address: + logger.error("No IP address found for device") + return + + logger.info(f"Connecting to {ip_address}") + os.execvp( + "ssh", + [ + "ssh", + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/dev/null", 
+ f"root@{ip_address}", + ], + ) def setup_device_commands(cli): @@ -103,7 +261,13 @@ def setup_device_commands(cli): ) device.add_command( - "getip", getip_command, help_text="Get device IP address", arguments=[] + "getip", + getip_command, + help_text="Get device IP address", ) - device.add_command("ssh", ssh_command, help_text="SSH to device", arguments=[]) + device.add_command( + "ssh", + ssh_command, + help_text="SSH to device", + ) diff --git a/fboss-image/distro_cli/lib/device_update.py b/fboss-image/distro_cli/lib/device_update.py new file mode 100644 index 0000000000000..5d086c79c697a --- /dev/null +++ b/fboss-image/distro_cli/lib/device_update.py @@ -0,0 +1,252 @@ +# Copyright (c) 2004-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. + +"""Device update logic for updating FBOSS services on devices.""" + +import logging +import subprocess +import uuid +from pathlib import Path + +from distro_cli.builder.image_builder import ImageBuilder +from distro_cli.lib.exceptions import DistroInfraError +from distro_cli.lib.manifest import ImageManifest + +logger = logging.getLogger(__name__) + +# Component to systemd services mapping. +COMPONENT_SERVICES: dict[str, list[str]] = { + "fboss-forwarding-stack": ["wedge_agent", "fsdb", "qsfp_service"], + "fboss-platform-stack": [ + "platform_manager", + "sensor_service", + "fan_service", + "data_corral_service", + ], +} + +# Path to the update script that runs on the device +UPDATE_SCRIPT_PATH = Path(__file__).parent.parent / "scripts" / "update_service.sh" + + +class DeviceUpdateError(DistroInfraError): + """Error during device update.""" + + +class DeviceUpdater: + """Handles updating FBOSS services on a device. + + Workflow: + 1. Validate component is supported for update + 2. 
Acquire artifacts (build OR download) + 3. SCP artifact and update_service.sh to device + 4. SSH: run update_service.sh + """ + + def __init__( + self, + mac: str, + manifest: ImageManifest, + component: str, + device_ip: str | None = None, + ): + """Initialize the DeviceUpdater. + + Args: + mac: Device MAC address + manifest: Parsed image manifest + component: Component name to update + device_ip: Optional device IP (if already known) + """ + self.mac = mac + self.manifest = manifest + self.component = component + self.device_ip = device_ip + + def _get_services(self) -> list[str]: + """Get systemd services for the component.""" + return COMPONENT_SERVICES.get(self.component, []) + + def validate(self) -> None: + """Validate the update request. + + Raises: + DeviceUpdateError: If validation fails + """ + if self.component not in COMPONENT_SERVICES: + raise DeviceUpdateError( + f"Component '{self.component}' is not updatable. " + f"Updatable components: {', '.join(COMPONENT_SERVICES.keys())}" + ) + + if not self.manifest.has_component(self.component): + raise DeviceUpdateError( + f"Component '{self.component}' not found in manifest" + ) + + services = self._get_services() + if not services: + raise DeviceUpdateError( + f"Component '{self.component}' has no services defined in COMPONENT_SERVICES" + ) + + component_data = self.manifest.get_component(self.component) + has_download = "download" in component_data + has_execute = "execute" in component_data + + if not has_download and not has_execute: + raise DeviceUpdateError( + f"Component '{self.component}' has neither 'download' nor 'execute'" + ) + + def _acquire_artifacts(self) -> Path: + """Acquire component artifacts via build or download. + + Uses ImageBuilder to handle both execute (build) and download modes. + Dependencies are automatically built if needed. 
+ + Returns: + Path to the component artifact (tarball) + + Raises: + DeviceUpdateError: If artifact acquisition fails + """ + logger.info(f"Acquiring artifacts for {self.component}") + + builder = ImageBuilder(self.manifest) + builder.build_components([self.component]) + + artifact_path = builder.component_artifacts.get(self.component) + if not artifact_path: + raise DeviceUpdateError( + f"No artifact produced for component '{self.component}'" + ) + + logger.info(f"Artifact acquired: {artifact_path}") + return artifact_path + + def _transfer_and_execute(self, artifact_path: Path, services: list[str]) -> None: + """Transfer artifact and update script to device and execute. + + Args: + artifact_path: Path to the component artifact tarball + services: List of services to update + + Raises: + DeviceUpdateError: If transfer or execution fails + """ + if not self.device_ip: + raise DeviceUpdateError("Device IP not set") + + if not UPDATE_SCRIPT_PATH.exists(): + raise DeviceUpdateError(f"Update script not found: {UPDATE_SCRIPT_PATH}") + + device_user = "root" + remote_dir = f"/tmp/fboss-update-{uuid.uuid4().hex[:8]}" + ssh_opts = [ + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/dev/null", + ] + + logger.info(f"Transferring files to {self.device_ip}:{remote_dir}") + + try: + # Create remote directory + result = subprocess.run( + [ + "ssh", + *ssh_opts, + f"{device_user}@{self.device_ip}", + f"mkdir -p {remote_dir}", + ], + capture_output=True, + check=False, + ) + if result.returncode != 0: + raise DeviceUpdateError( + f"Failed to create remote directory: {result.stderr.decode()}" + ) + + # SCP artifact(s) and update script to device + # Handle both single artifact (Path) and multiple artifacts (list of Paths) + artifacts = ( + artifact_path if isinstance(artifact_path, list) else [artifact_path] + ) + scp_files = [str(a) for a in artifacts] + [str(UPDATE_SCRIPT_PATH)] + + result = subprocess.run( + [ + "scp", + *ssh_opts, + *scp_files, + 
f"{device_user}@{self.device_ip}:{remote_dir}/", + ], + capture_output=True, + check=False, + ) + if result.returncode != 0: + raise DeviceUpdateError( + f"Failed to transfer files: {result.stderr.decode()}" + ) + + services_arg = " ".join(services) + remote_cmd = ( + f"cd {remote_dir} && " + f"chmod +x update_service.sh && " + f"./update_service.sh {self.component} {services_arg}" + ) + + logger.info("Executing update on device...") + result = subprocess.run( + ["ssh", *ssh_opts, f"{device_user}@{self.device_ip}", remote_cmd], + capture_output=True, + check=False, + ) + if result.returncode != 0: + raise DeviceUpdateError( + f"Failed to execute update: {result.stderr.decode()}" + ) + + logger.info(f"Update output:\n{result.stdout.decode()}") + finally: + # Cleanup remote directory if present + subprocess.run( + [ + "ssh", + *ssh_opts, + f"{device_user}@{self.device_ip}", + f"rm -rf {remote_dir}", + ], + capture_output=True, + check=False, + ) + + def update(self) -> bool: + """Execute the update workflow. + + Returns: + True if update succeeded + + Raises: + DeviceUpdateError: If update fails + """ + if not self.device_ip: + raise DeviceUpdateError("Device IP not set") + + self.validate() + + services = self._get_services() + logger.info(f"Updating {self.component} on device {self.mac}") + logger.info(f"Services to restart: {', '.join(services)}") + + artifact_path = self._acquire_artifacts() + self._transfer_and_execute(artifact_path, services) + + logger.info(f"Successfully updated {self.component} on device {self.mac}") + return True diff --git a/fboss-image/distro_cli/lib/distro_infra.py b/fboss-image/distro_cli/lib/distro_infra.py new file mode 100644 index 0000000000000..e585822d1408c --- /dev/null +++ b/fboss-image/distro_cli/lib/distro_infra.py @@ -0,0 +1,201 @@ +# Copyright (c) 2004-present, Facebook, Inc. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. + +"""Distro Infrastructure helper functions.""" + +import json +import logging +import re +import subprocess +import tarfile +from pathlib import Path + +from distro_cli.lib.docker import container +from distro_cli.lib.exceptions import DistroInfraError + +logger = logging.getLogger("fboss-image") + +# This should match DISTRO_CONTAINER_NAME in distro_infra/distro_infra.sh +DISTRO_INFRA_CONTAINER = "fboss-distro-infra" + +GETIP_SCRIPT_CONTAINER_PATH = "/distro_infra/getip.sh" + + +def normalize_mac_address(mac: str) -> tuple[str, str]: + """Normalize MAC address to both dash and colon formats. + + Args: + mac: MAC address in any format + + Returns: + Tuple of (dash_format, colon_format) + e.g., ("aa-bb-cc-dd-ee-ff", "aa:bb:cc:dd:ee:ff") + + Raises: + DistroInfraError: If MAC address is invalid + """ + # Remove all separators and convert to lowercase + mac_clean = re.sub(r"[:\-]", "", mac.lower()) + + # Validate MAC address format (12 hex characters) + if not re.match(r"^[0-9a-f]{12}$", mac_clean): + raise DistroInfraError( + f"Invalid MAC address: {mac}. Expected 12 hex characters with optional colons or dashes." + ) + + # Convert to dash and colon formats + dash_mac = "-".join([mac_clean[i : i + 2] for i in range(0, 12, 2)]) + colon_mac = ":".join([mac_clean[i : i + 2] for i in range(0, 12, 2)]) + + return dash_mac, colon_mac + + +def get_interface_name() -> str: + """Get the network interface name of the distro-infra container from the persistent directory. 
+ + Returns: + Network interface name + + Raises: + DistroInfraError: If interface_name.txt not found or empty + """ + persistent_dir = find_persistent_dir() + interface_file = persistent_dir / "interface_name.txt" + + if not interface_file.exists(): + raise DistroInfraError( + f"Interface name file not found: {interface_file}. " + "The distro-infra container may not have started properly." + ) + + interface = interface_file.read_text().strip() + if not interface: + raise DistroInfraError(f"Interface name file is empty: {interface_file}") + + return interface + + +def find_persistent_dir() -> Path: + """Find the persistent directory mounted in the distro_infra container. + + Returns: + Path to the persistent directory on the host + + Raises: + DistroInfraError: If container is not running or persistent dir not found + """ + # Check if container is running + if not container.container_is_running(DISTRO_INFRA_CONTAINER): + raise DistroInfraError( + f"Container '{DISTRO_INFRA_CONTAINER}' is not running. " + "Please start it first with distro_infra.sh" + ) + + try: + result = subprocess.run( + ["docker", "inspect", DISTRO_INFRA_CONTAINER], + capture_output=True, + text=True, + check=True, + ) + inspect_data = json.loads(result.stdout) + + if not inspect_data: + raise DistroInfraError( + f"Container {DISTRO_INFRA_CONTAINER} is not running. " + "Please start it first with distro_infra.sh" + ) + + # Find the volume mount for /distro_infra/persistent + mounts = inspect_data[0].get("Mounts", []) + for mount in mounts: + if mount.get("Destination") == "/distro_infra/persistent": + return Path(mount["Source"]) + + raise DistroInfraError( + f"Could not find persistent directory mount in container {DISTRO_INFRA_CONTAINER}" + ) + + except subprocess.CalledProcessError as e: + raise DistroInfraError( + f"Container {DISTRO_INFRA_CONTAINER} is not running. 
" + "Please start it first with distro_infra.sh" + ) from e + except (json.JSONDecodeError, KeyError, IndexError) as e: + raise DistroInfraError(f"Failed to parse container inspect data: {e}") from e + + +def enable_pxe_boot(mac: str) -> None: + """Enable PXE boot for a device MAC address. + + This creates the necessary directory structure and configuration files + to enable PXE boot for the specified MAC address by calling the + enable_pxeboot.sh script inside the container. + + Args: + mac: MAC address of the device + + Raises: + DistroInfraError: If operation fails + """ + dash_mac, _ = normalize_mac_address(mac) + logger.info(f"Enabling PXE boot for MAC address: {dash_mac}") + + # Call the enable_pxeboot.sh script inside the container + exit_code, stdout, stderr = container.exec_in_container( + DISTRO_INFRA_CONTAINER, + ["/distro_infra/enable_pxeboot.sh", dash_mac], + ) + + if exit_code != 0: + raise DistroInfraError(f"Failed to enable PXE boot for {dash_mac}: {stderr}") + + # Log the output from the script + if stdout: + for line in stdout.strip().split("\n"): + logger.debug(f"enable_pxeboot.sh: {line}") + + +def deploy_image_to_device(mac: str, image_path: str) -> None: + """Deploy an image to a device for PXE boot. + + This function only supports tarball images (.tar, .tar.gz, .tar.zst, etc.) + as produced by the image builder. + + Args: + mac: MAC address of the device + image_path: Path to the image tarball + + Raises: + DistroInfraError: If operation fails or image format is unsupported + """ + image_path_obj = Path(image_path) + + if not image_path_obj.exists(): + raise DistroInfraError(f"Image path not found: {image_path}") + + if not tarfile.is_tarfile(image_path_obj): + raise DistroInfraError( + f"Unsupported image format: {image_path}. " + "Only tarball images (.tar, .tar.gz, .tar.zst) are supported." 
+ ) + + persistent_dir = find_persistent_dir() + logger.info(f"Using persistent directory: {persistent_dir}") + + dash_mac, _ = normalize_mac_address(mac) + mac_dir = persistent_dir / dash_mac + + logger.info(f"Extracting image tarball to {mac_dir}...") + try: + with tarfile.open(image_path_obj, "r") as tar: + tar.extractall(path=mac_dir, filter="data") + logger.info(f"Image extracted successfully to {mac_dir}") + except tarfile.TarError as e: + raise DistroInfraError(f"Failed to extract tarball: {e}") from e + + enable_pxe_boot(mac) diff --git a/fboss-image/distro_cli/lib/docker/container.py b/fboss-image/distro_cli/lib/docker/container.py index 8eedac3ca2a21..0e02ef6610b0a 100644 --- a/fboss-image/distro_cli/lib/docker/container.py +++ b/fboss-image/distro_cli/lib/docker/container.py @@ -22,6 +22,7 @@ def run_container( # noqa: PLR0913 privileged: bool = False, interactive: bool = False, ephemeral: bool = True, + detach: bool = False, working_dir: str | None = None, name: str | None = None, ) -> int: @@ -60,6 +61,9 @@ def run_container( # noqa: PLR0913 if interactive: cmd.extend(["-i", "-t"]) + if detach: + cmd.append("-d") + if privileged: cmd.append("--privileged") @@ -95,3 +99,158 @@ def run_container( # noqa: PLR0913 raise RuntimeError( "Docker command not found. Is Docker installed and in PATH?" ) from e + except subprocess.CalledProcessError as e: + logger.error(f"Command failed: {e}") + return e.returncode + + +def exec_in_container( + name: str, + command: list[str], +) -> tuple[int, str, str]: + """Execute a command in a running Docker container. 
+ + Args: + name: Name of the container + command: Command to execute in container (as list) + + Returns: + Tuple of exit code, stdout, and stderr from the command execution + + Raises: + RuntimeError: If docker command fails + """ + logger.info(f"Executing command in container {name}: {command}") + + cmd = ["docker", "exec", name] + cmd.extend(command) + + logger.debug(f"Running: {' '.join(str(c) for c in cmd)}") + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + check=True, + ) + return result.returncode, result.stdout, result.stderr + except FileNotFoundError: + raise RuntimeError("Docker command not found. Is Docker installed and in PATH?") + except subprocess.CalledProcessError as e: + logger.error(f"Command failed: {e}") + return e.returncode, e.stdout, e.stderr + + +def container_is_running(name: str) -> bool: + """Check if a Docker container is running. + + Args: + name: Name of the container + + Returns: + True if container is running, False otherwise + + Raises: + RuntimeError: If docker command fails + """ + logger.info(f"Checking if container is running: {name}") + + cmd = ["docker", "ps", "-aq", "--filter", f"name={name}"] + + logger.debug(f"Running: {' '.join(str(c) for c in cmd)}") + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + check=True, + ) + return result.returncode == 0 and bool(result.stdout.strip()) + except FileNotFoundError: + raise RuntimeError("Docker command not found. Is Docker installed and in PATH?") + except subprocess.CalledProcessError as e: + logger.error(f"Check command failed: {e}") + return False + + +def stop_container(name: str) -> int: + """Stop a Docker container. 
+ + Args: + name: Name of the container + + Returns: + Exit code from the stop command + + Raises: + RuntimeError: If docker command fails + """ + logger.info(f"Stopping container: {name}") + + cmd = ["docker", "stop", name] + + logger.debug(f"Running: {' '.join(str(c) for c in cmd)}") + + try: + result = subprocess.run(cmd, check=True) + logger.info(f"Stop command exited with code: {result.returncode}") + return result.returncode + except FileNotFoundError as e: + raise RuntimeError( + "Docker command not found. Is Docker installed and in PATH?" + ) from e + except subprocess.CalledProcessError as e: + logger.error(f"Stop command failed: {e}") + return e.returncode + + +def remove_container(name: str) -> int: + """Remove a Docker container. + + Args: + name: Name of the container + + Returns: + Exit code from the remove command + + Raises: + RuntimeError: If docker command fails + """ + logger.info(f"Removing container: {name}") + + cmd = ["docker", "rm", "-f", name] + + logger.debug(f"Running: {' '.join(str(c) for c in cmd)}") + + try: + result = subprocess.run(cmd, check=True) + logger.info(f"Remove command exited with code: {result.returncode}") + return result.returncode + except FileNotFoundError as e: + raise RuntimeError( + "Docker command not found. Is Docker installed and in PATH?" + ) from e + except subprocess.CalledProcessError as e: + logger.error(f"Remove command failed: {e}") + return e.returncode + + +def stop_and_remove_container(name: str) -> int: + """Stop and remove a Docker container. 
+ + Args: + name: Name of the container + + Returns: + Exit code from the remove command + + Raises: + RuntimeError: If docker command fails + """ + logger.info(f"Stopping and removing container: {name}") + stop_exit_code = stop_container(name) + if stop_exit_code != 0: + return stop_exit_code + return remove_container(name) diff --git a/fboss-image/distro_cli/lib/exceptions.py b/fboss-image/distro_cli/lib/exceptions.py index d61d23efbaa38..05a3ce45b3871 100644 --- a/fboss-image/distro_cli/lib/exceptions.py +++ b/fboss-image/distro_cli/lib/exceptions.py @@ -55,3 +55,14 @@ class ComponentError(FbossImageError): - Component configuration invalid - Component builder not implemented """ + + +class DistroInfraError(FbossImageError): + """Distro Infrastructure operation failed. + + Raised when: + - Container operations fail + - PXE boot configuration fails + - Device IP retrieval fails + - MAC address validation fails + """ diff --git a/fboss-image/distro_cli/scripts/update_service.sh b/fboss-image/distro_cli/scripts/update_service.sh new file mode 100755 index 0000000000000..fd8df5ce57462 --- /dev/null +++ b/fboss-image/distro_cli/scripts/update_service.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# Copyright (c) 2004-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Update script that runs on the FBOSS device to update services. +# Installs whatever artifact exists in the script's directory. +# Usage: ./update_service.sh [service2] ... + +set -eou pipefail + +if [ $# -lt 1 ]; then + echo "Usage: $0 [service1] [service2] ..." >&2 + exit 1 +fi + +COMPONENT="$1" +shift + +SCRIPT_DIR=$(dirname "$(readlink -f "$0")") + +SERVICES="$*" +if [ -z "${SERVICES}" ]; then + echo "Error: Component '${COMPONENT}' requires at least one service" >&2 + echo "Usage: $0 [service2] ..." 
>&2 + exit 1 +fi + +TIMESTAMP=$(date +%s) +# Resolve symlinks to real paths (needed for systemd RootDirectory which doesn't follow symlinks) +BASE_SNAPSHOT="$(readlink -f /distro-base)" +UPDATES_DIR="$(readlink -f /updates)" + +echo "Updating component: ${COMPONENT}" +echo "Services to restart: ${SERVICES}" + +# Extract all artifacts from script's directory to staging +mkdir -p "${UPDATES_DIR}" +STAGING_DIR=$(mktemp -d -p "${UPDATES_DIR}") +trap 'rm -rf "${STAGING_DIR}"' EXIT + +FOUND_ARTIFACT=false +shopt -s nullglob +for f in "${SCRIPT_DIR}"/*.tar.zst "${SCRIPT_DIR}"/*.tar; do + FOUND_ARTIFACT=true + echo "Extracting ${f}..." + tar -xf "$f" -C "${STAGING_DIR}" +done +shopt -u nullglob + +if [ "${FOUND_ARTIFACT}" = false ]; then + echo "Error: No artifact found in ${SCRIPT_DIR}" >&2 + exit 1 +fi + +for svc in ${SERVICES}; do + SNAPSHOT_PATH="${UPDATES_DIR}/${svc}-${TIMESTAMP}" + echo "Creating snapshot for ${svc}: ${SNAPSHOT_PATH}" + + btrfs subvolume snapshot "${BASE_SNAPSHOT}" "${SNAPSHOT_PATH}" + cp -a "${STAGING_DIR}"/* "${SNAPSHOT_PATH}/opt/fboss/" + + mkdir -p "/etc/systemd/system/${svc}.service.d/" + cat >"/etc/systemd/system/${svc}.service.d/root-override.conf" </dev/null || true + fi + done +done + +echo "Update complete for component: ${COMPONENT}" diff --git a/fboss-image/distro_cli/tests/data/artifacts/platform_stack.tar.zst b/fboss-image/distro_cli/tests/data/artifacts/platform_stack.tar.zst new file mode 100644 index 0000000000000..df052bc3c377f Binary files /dev/null and b/fboss-image/distro_cli/tests/data/artifacts/platform_stack.tar.zst differ diff --git a/fboss-image/distro_cli/tests/data/update_manifest.json b/fboss-image/distro_cli/tests/data/update_manifest.json new file mode 100644 index 0000000000000..1c9c39066cd32 --- /dev/null +++ b/fboss-image/distro_cli/tests/data/update_manifest.json @@ -0,0 +1,14 @@ +{ + "distribution_formats": { + "onie": "fboss-onie.bin" + }, + "kernel": { + "download": "https://example.com/kernel.tar" + }, + 
"fboss-forwarding-stack": { + "download": "file:///opt/fboss/fboss-forwarding-stack.tar" + }, + "fboss-platform-stack": { + "execute": "/opt/fboss/build-platform-stack.sh" + } +} diff --git a/fboss-image/distro_cli/tests/device_test.py b/fboss-image/distro_cli/tests/device_test.py index 08895f5e3bb3e..68ce481b5e960 100644 --- a/fboss-image/distro_cli/tests/device_test.py +++ b/fboss-image/distro_cli/tests/device_test.py @@ -7,30 +7,20 @@ # LICENSE file in the root directory of this source tree. An additional grant # of patent rights can be found in the PATENTS file in the same directory. -""" -Unit tests for device commands +"""Unit tests for device commands.""" -NOTE: These are skeleton tests for stub implementations. -When device commands are fully implemented, these tests will be expanded -to verify actual functionality. - -These tests verify that: -1. Device command group exists and has expected subcommands -2. Commands can be called without crashing (stub behavior) -3. Context passing works correctly -""" - -import sys +import argparse +import json +import shutil +import subprocess +import tarfile import tempfile import unittest from pathlib import Path +from unittest.mock import patch -# Add parent directory to path for imports -sys.path.insert(0, str(Path(__file__).parent.parent)) - -import argparse - -from cmds.device import ( +from distro_cli.cmds.device import ( + get_device_ip, getip_command, image_command, image_upstream_command, @@ -39,29 +29,153 @@ ssh_command, update_command, ) +from distro_cli.lib.cli import CLI +from distro_cli.lib.device_update import DeviceUpdateError, DeviceUpdater +from distro_cli.lib.distro_infra import DISTRO_INFRA_CONTAINER +from distro_cli.lib.docker import container +from distro_cli.lib.manifest import ImageManifest +from distro_cli.tests.test_helpers import waitfor class TestDeviceCommands(unittest.TestCase): - """Test device command group and subcommands (stubs)""" + """Test device command group and subcommands""" + + 
IPXE_FILES = ("ipxev4.efi", "ipxev6.efi", "autoexec.ipxe") + + @classmethod + def setUpClass(cls): + """Set up test container before all tests""" + try: + result = subprocess.run( + ["docker", "images", "-q", "fboss_distro_infra"], + capture_output=True, + text=True, + check=True, + ) + if not result.stdout.strip(): + raise unittest.SkipTest( + "fboss_distro_infra Docker image not found. " + "Please build it with: cd fboss-image/distro_infra && ./build.sh" + ) + except (subprocess.CalledProcessError, FileNotFoundError): + raise unittest.SkipTest("Docker not available or image not built") + + cwd = Path.cwd() + cls.container_temp_dir = Path( + tempfile.mkdtemp(prefix="distro_infra_test_", dir=cwd) + ) + cls.container_persistent_dir = cls.container_temp_dir / "persistent" + cls.container_persistent_dir.mkdir(parents=True, exist_ok=True) + + # Write interface name file (normally done by distro_infra.sh) + interface_file = cls.container_persistent_dir / "interface_name.txt" + interface_file.write_text("lo") + + # Clean up any existing container with the same name + if container.container_is_running(DISTRO_INFRA_CONTAINER): + container.stop_and_remove_container(DISTRO_INFRA_CONTAINER) + + # Start the fboss-distro-infra container in background + volumes = {cls.container_persistent_dir: Path("/distro_infra/persistent")} + + exit_code = container.run_container( + image="fboss_distro_infra", + command=["/distro_infra/run_distro_infra.sh", "--intf", "lo", "--nodhcpv6"], + volumes=volumes, + ephemeral=False, + detach=True, + name=DISTRO_INFRA_CONTAINER, + privileged=True, # Required for network operations + ) + + if exit_code != 0: + raise RuntimeError(f"Failed to start {DISTRO_INFRA_CONTAINER} container") + + @classmethod + def tearDownClass(cls): + """Clean up test container after all tests""" + if container.container_is_running(DISTRO_INFRA_CONTAINER): + container.stop_and_remove_container(DISTRO_INFRA_CONTAINER) + + shutil.rmtree(cls.container_temp_dir, 
ignore_errors=True) + + def setup_image_command_test(self): + """Wait for container to create PXE boot infrastructure.""" + cache_dir = self.container_persistent_dir / "cache" + + waitfor( + cache_dir.exists, + lambda: self.fail("Timed out waiting for cache directory to be created"), + ) + + for filename in self.IPXE_FILES: + cache_file = cache_dir / filename + waitfor( + cache_file.exists, + lambda f=filename: self.fail( + f"Timed out waiting for {f} to be created" + ), + ) + + def verify_image_command_common(self, mac): + """Verify common PXE boot infrastructure created by image command""" + dash_mac = mac.replace(":", "-") + mac_dir = self.container_persistent_dir / dash_mac + + self.assertTrue(mac_dir.exists()) + self.assertTrue(mac_dir.is_dir()) + + for ipxe_file in self.IPXE_FILES: + ipxe_path = mac_dir / ipxe_file + self.assertTrue(ipxe_path.exists()) + + pxeboot_marker = mac_dir / "pxeboot_complete" + self.assertTrue(pxeboot_marker.exists()) + + ipxev6_serverip = mac_dir / "ipxev6.efi-serverip" + if ipxev6_serverip.exists(): + content = ipxev6_serverip.read_text() + self.assertIn("#!ipxe", content) + self.assertIn("set server_ip", content) + + result = subprocess.run( + [ + "docker", + "exec", + DISTRO_INFRA_CONTAINER, + "cat", + f"/distro_infra/dnsmasq_conf.d/{dash_mac}", + ], + capture_output=True, + check=False, + text=True, + ) + self.assertEqual(result.returncode, 0) + self.assertIn(mac, result.stdout) + + return mac_dir def setUp(self): """Set up test fixtures""" self.test_mac = "aa:bb:cc:dd:ee:ff" - # Create a temporary manifest file for tests that need it with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: f.write('{"test": "manifest"}') self.manifest_path = Path(f.name) - # Create a temporary image file for tests that need it - with tempfile.NamedTemporaryFile(mode="w", suffix=".bin", delete=False) as f: - f.write("fake image data") - self.image_path = Path(f.name) + self.temp_dir = tempfile.mkdtemp() + self.image_path = 
Path(self.temp_dir) / "test_image.tar" + + test_file = Path(self.temp_dir) / "test_file.txt" + test_file.write_text("test content") + + with tarfile.open(self.image_path, "w") as tar: + tar.add(test_file, arcname="test_file.txt") def tearDown(self): """Clean up test fixtures""" self.manifest_path.unlink() - self.image_path.unlink() + shutil.rmtree(self.temp_dir, ignore_errors=True) def test_device_commands_exist(self): """Test that device commands exist""" @@ -75,44 +189,349 @@ def test_device_commands_exist(self): def test_image_upstream_stub(self): """Test image-upstream command (stub)""" - args = argparse.Namespace(mac=self.test_mac, components=["kernel", "sai"]) + args = argparse.Namespace( + mac=self.test_mac, components=["kernel", "hw_agent_sai"] + ) # Call command - just verify it doesn't crash image_upstream_command(args) - def test_image_stub(self): - """Test image command (stub)""" - args = argparse.Namespace(mac=self.test_mac, image_path=str(self.image_path)) - # Call command - just verify it doesn't crash + def test_image_command_with_tarball(self): + """Test image command with tarball extraction""" + self.setup_image_command_test() + + temp_dir = tempfile.mkdtemp() + test_file = Path(temp_dir) / "test_file.txt" + test_file.write_text("test content") + tarball_path = Path(temp_dir) / "test_image.tar" + with tarfile.open(tarball_path, "w") as tar: + tar.add(test_file, arcname="test_file.txt") + + args = argparse.Namespace(mac=self.test_mac, image_path=str(tarball_path)) image_command(args) + mac_dir = self.verify_image_command_common(self.test_mac) + + extracted_file = mac_dir / "test_file.txt" + self.assertTrue(extracted_file.exists()) + self.assertEqual(extracted_file.read_text(), "test content") + + shutil.rmtree(temp_dir, ignore_errors=True) + + def test_image_command_with_directory(self): + """Test image command failure with directory (only tarballs supported)""" + self.setup_image_command_test() + + temp_dir = tempfile.mkdtemp() + dir_path = 
Path(temp_dir) / "test_image_dir" + dir_path.mkdir() + file1 = dir_path / "file1.txt" + file2 = dir_path / "file2.txt" + file1.write_text("content1") + file2.write_text("content2") + + args = argparse.Namespace(mac=self.test_mac, image_path=str(dir_path)) + with self.assertRaises(SystemExit) as excinfo: + image_command(args) + self.assertEqual(excinfo.exception.code, 1) + + shutil.rmtree(temp_dir, ignore_errors=True) + + def test_image_command_with_single_file(self): + """Test image command failure with non-tarball file""" + self.setup_image_command_test() + + temp_dir = tempfile.mkdtemp() + single_file_path = Path(temp_dir) / "single_file.bin" + single_file_path.write_text("single file content") + + args = argparse.Namespace(mac=self.test_mac, image_path=str(single_file_path)) + with self.assertRaises(SystemExit) as excinfo: + image_command(args) + self.assertEqual(excinfo.exception.code, 1) + + shutil.rmtree(temp_dir, ignore_errors=True) + def test_reprovision_stub(self): """Test reprovision command (stub)""" args = argparse.Namespace(mac=self.test_mac) # Call command - just verify it doesn't crash reprovision_command(args) - def test_update_stub(self): - """Test update command (stub)""" - args = argparse.Namespace( - mac=self.test_mac, - manifest=str(self.manifest_path), - components=["kernel", "sai"], + @patch("distro_cli.cmds.device.container.exec_in_container") + @patch("distro_cli.cmds.device.container.container_is_running") + def test_get_device_ip_ipv4(self, mock_is_running, mock_exec): + """Test get_device_ip returns IPv4 when available""" + mock_is_running.return_value = True + mock_exec.return_value = ( + 0, + '{"mac": "aa:bb:cc:dd:ee:ff", "ipv4": "192.168.1.100", "ipv6": "fe80::1"}', + "", ) - # Call command - just verify it doesn't crash - update_command(args) - def test_getip_stub(self): - """Test getip command (stub)""" - args = argparse.Namespace(mac=self.test_mac) - # Call command - just verify it doesn't crash - getip_command(args) + ip = 
get_device_ip(self.test_mac) + self.assertEqual(ip, "192.168.1.100") - def test_ssh_stub(self): - """Test ssh command (stub)""" - args = argparse.Namespace(mac=self.test_mac) - # Call command - just verify it doesn't crash + @patch("distro_cli.cmds.device.container.exec_in_container") + @patch("distro_cli.cmds.device.container.container_is_running") + def test_get_device_ip_ipv6_fallback(self, mock_is_running, mock_exec): + """Test get_device_ip returns IPv6 when IPv4 not available""" + mock_is_running.return_value = True + mock_exec.return_value = ( + 0, + '{"mac": "aa:bb:cc:dd:ee:ff", "ipv6": "fe80::1"}', + "", + ) + + ip = get_device_ip(self.test_mac) + self.assertEqual(ip, "fe80::1") + + @patch("distro_cli.cmds.device.container.exec_in_container") + @patch("distro_cli.cmds.device.container.container_is_running") + @patch("distro_cli.cmds.device.os.execvp") + def test_ssh_command_calls_execvp_correctly( + self, mock_execvp, mock_is_running, mock_exec + ): + """Test ssh command calls os.execvp with correct arguments""" + mock_is_running.return_value = True + mock_exec.return_value = ( + 0, + '{"mac": "aa:bb:cc:dd:ee:ff", "ipv4": "192.168.1.100"}', + "", + ) + + args = argparse.Namespace(mac=self.test_mac, interface=None) ssh_command(args) + mock_execvp.assert_called_once_with( + "ssh", + [ + "ssh", + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/dev/null", + "root@192.168.1.100", + ], + ) + + +class TestDeviceCLIIntegration(unittest.TestCase): + """Test device command CLI integration for image command""" + + def setUp(self): + """Set up CLI for testing""" + self.cli = CLI(description="Test CLI") + setup_device_commands(self.cli) + + def test_image_command_argument_parsing(self): + """Test that image command arguments are parsed correctly""" + with tempfile.NamedTemporaryFile(suffix=".tar", delete=False) as f: + temp_image = f.name + + try: + args = self.cli.parser.parse_args( + ["device", "aa:bb:cc:dd:ee:ff", "image", temp_image] + ) + 
self.assertEqual(args.mac, "aa:bb:cc:dd:ee:ff") + self.assertEqual(str(args.image_path), temp_image) + self.assertTrue(callable(args.func)) + self.assertEqual(args.func, image_command) + finally: + Path(temp_image).unlink() + + +class TestDeviceUpdater(unittest.TestCase): + """Unit tests for DeviceUpdater class""" + + def setUp(self): + """Set up test fixtures""" + self.test_data_dir = Path(__file__).parent / "data" + self.update_manifest_path = self.test_data_dir / "update_manifest.json" + + def test_validate_non_updatable_component(self): + """Test that non-updatable components raise error""" + manifest = ImageManifest(self.update_manifest_path) + updater = DeviceUpdater( + mac="aa:bb:cc:dd:ee:ff", + manifest=manifest, + component="kernel", + ) + with self.assertRaises(DeviceUpdateError) as ctx: + updater.validate() + self.assertIn("is not updatable", str(ctx.exception)) + self.assertIn("Updatable components:", str(ctx.exception)) + + def test_validate_component_not_in_manifest(self): + """Test that component in COMPONENT_SERVICES but missing from manifest raises error""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump( + { + "distribution_formats": {"onie": "test.bin"}, + "kernel": {"download": "https://example.com/kernel.tar"}, + }, + f, + ) + temp_manifest_path = Path(f.name) + self.addCleanup(temp_manifest_path.unlink) + + manifest = ImageManifest(temp_manifest_path) + updater = DeviceUpdater( + mac="aa:bb:cc:dd:ee:ff", + manifest=manifest, + component="fboss-forwarding-stack", + ) + with self.assertRaises(DeviceUpdateError) as ctx: + updater.validate() + self.assertIn("not found in manifest", str(ctx.exception)) + + def test_validate_component_with_no_services(self): + """Test that component with empty services list raises error""" + manifest = ImageManifest(self.update_manifest_path) + updater = DeviceUpdater( + mac="aa:bb:cc:dd:ee:ff", + manifest=manifest, + component="fboss-platform-stack", + ) + # Patch 
COMPONENT_SERVICES to have empty list for platform-stack + with patch( + "distro_cli.lib.device_update.COMPONENT_SERVICES", + {"fboss-platform-stack": [], "fboss-forwarding-stack": ["wedge_agent"]}, + ): + with self.assertRaises(DeviceUpdateError) as ctx: + updater.validate() + self.assertIn("has no services defined", str(ctx.exception)) + + def test_validate_component_without_download_or_execute(self): + """Test that component without download or execute raises error""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump( + { + "distribution_formats": {"onie": "test.bin"}, + "kernel": {"download": "https://example.com/kernel.tar"}, + "fboss-forwarding-stack": {}, + }, + f, + ) + temp_manifest_path = Path(f.name) + self.addCleanup(temp_manifest_path.unlink) + + manifest = ImageManifest(temp_manifest_path) + updater = DeviceUpdater( + mac="aa:bb:cc:dd:ee:ff", + manifest=manifest, + component="fboss-forwarding-stack", + ) + with self.assertRaises(DeviceUpdateError) as ctx: + updater.validate() + self.assertIn("neither 'download' nor 'execute'", str(ctx.exception)) + + def test_validate_success_forwarding_stack(self): + """Test successful validation for fboss-forwarding-stack""" + manifest = ImageManifest(self.update_manifest_path) + updater = DeviceUpdater( + mac="aa:bb:cc:dd:ee:ff", + manifest=manifest, + component="fboss-forwarding-stack", + ) + # Should not raise + updater.validate() + + def test_validate_success_platform_stack(self): + """Test successful validation for fboss-platform-stack""" + manifest = ImageManifest(self.update_manifest_path) + updater = DeviceUpdater( + mac="aa:bb:cc:dd:ee:ff", + manifest=manifest, + component="fboss-platform-stack", + ) + # Should not raise + updater.validate() + + def test_get_services_from_component_services(self): + """Test that services are correctly read from COMPONENT_SERVICES dict""" + manifest = ImageManifest(self.update_manifest_path) + + updater1 = DeviceUpdater( + 
mac="aa:bb:cc:dd:ee:ff", + manifest=manifest, + component="fboss-forwarding-stack", + ) + self.assertEqual( + updater1._get_services(), + ["wedge_agent", "fsdb", "qsfp_service"], + ) + + updater2 = DeviceUpdater( + mac="aa:bb:cc:dd:ee:ff", + manifest=manifest, + component="fboss-platform-stack", + ) + self.assertEqual( + updater2._get_services(), + [ + "platform_manager", + "sensor_service", + "fan_service", + "data_corral_service", + ], + ) + + def test_update_requires_device_ip(self): + """Test that update() raises DeviceUpdateError when device_ip is not set""" + manifest = ImageManifest(self.update_manifest_path) + updater = DeviceUpdater( + mac="aa:bb:cc:dd:ee:ff", + manifest=manifest, + component="fboss-forwarding-stack", + device_ip=None, + ) + with self.assertRaises(DeviceUpdateError) as ctx: + updater.update() + self.assertIn("Device IP not set", str(ctx.exception)) + + def test_acquire_artifacts_and_services(self): + """Test services are available for platform-stack component""" + manifest = ImageManifest(self.update_manifest_path) + updater = DeviceUpdater( + mac="aa:bb:cc:dd:ee:ff", + manifest=manifest, + component="fboss-platform-stack", + ) + + # Validate first + updater.validate() + + # Verify services come from COMPONENT_SERVICES dict + services = updater._get_services() + self.assertEqual( + services, + [ + "platform_manager", + "sensor_service", + "fan_service", + "data_corral_service", + ], + ) + + def test_acquire_artifacts_forwarding_stack(self): + """Test services are available for forwarding-stack component""" + manifest = ImageManifest(self.update_manifest_path) + updater = DeviceUpdater( + mac="aa:bb:cc:dd:ee:ff", + manifest=manifest, + component="fboss-forwarding-stack", + ) + + # Validate first + updater.validate() + + # Verify services come from COMPONENT_SERVICES dict + services = updater._get_services() + self.assertEqual( + services, + ["wedge_agent", "fsdb", "qsfp_service"], + ) + if __name__ == "__main__": unittest.main() diff --git 
a/fboss-image/distro_cli/tests/docker_test.py b/fboss-image/distro_cli/tests/docker_test.py index 3b713f388a322..ca8d2cb74a4b6 100644 --- a/fboss-image/distro_cli/tests/docker_test.py +++ b/fboss-image/distro_cli/tests/docker_test.py @@ -3,7 +3,7 @@ import unittest from distro_cli.lib.constants import FBOSS_BUILDER_IMAGE -from distro_cli.lib.docker.container import run_container +from distro_cli.lib.docker import container from distro_cli.tests.test_helpers import ensure_test_docker_image @@ -15,15 +15,98 @@ def setUpClass(cls): """Ensure fboss_builder image exists before running tests.""" ensure_test_docker_image() - def test_run_simple_container(self): + def test_run_container(self): """Test running a simple container command.""" - exit_code = run_container( + exit_code = container.run_container( image=FBOSS_BUILDER_IMAGE, - command=["echo", "hello from container"], + command=["echo", "hello from run_container"], ephemeral=True, ) self.assertEqual(exit_code, 0) + def test_exec_in_container(self): + """Test executing a command in a running container.""" + exit_code = container.run_container( + image=FBOSS_BUILDER_IMAGE, + command=["sleep", "inf"], + ephemeral=False, + detach=True, + name="test_exec_container", + ) + self.assertEqual(exit_code, 0) + + exec_exit_code, stdout, stderr = container.exec_in_container( + name="test_exec_container", + command=["echo", "hello from exec_in_container"], + ) + self.assertEqual(exec_exit_code, 0) + self.assertEqual(stdout.strip(), "hello from exec_in_container") + + # Clean up the container + container.stop_and_remove_container(name="test_exec_container") + + # Check if container is stopped and removed + is_running = container.container_is_running("test_exec_container") + self.assertFalse(is_running) + + # Try to exec in the removed container - should fail with non-zero exit code + exec_exit_code, stdout, stderr = container.exec_in_container( + name="test_exec_container", + command=["echo", "should not work"], + ) + 
self.assertNotEqual(exec_exit_code, 0) + + def test_container_is_running(self): + """Test checking if a container is running.""" + # Check non-existent container + is_running = container.container_is_running("non_existent_container") + self.assertFalse(is_running) + + # Start a container + exit_code = container.run_container( + image=FBOSS_BUILDER_IMAGE, + command=["sleep", "1"], + ephemeral=False, + name="test_running_container", + ) + self.assertEqual(exit_code, 0) + + # Check if container is running + is_running = container.container_is_running("test_running_container") + self.assertTrue(is_running) + + # Stop and remove the container + container.stop_and_remove_container(name="test_running_container") + + # Check if container is stopped and removed + is_running = container.container_is_running("test_running_container") + self.assertFalse(is_running) + + def test_stop_and_remove_container(self): + """Test stopping and removing a container.""" + # Start a container + exit_code = container.run_container( + image=FBOSS_BUILDER_IMAGE, + command=["sleep", "1"], + ephemeral=False, + name="test_stop_and_remove_container", + ) + self.assertEqual(exit_code, 0) + + # Check if container is running + is_running = container.container_is_running("test_stop_and_remove_container") + self.assertTrue(is_running) + + # Stop and remove the container + exit_code = container.stop_and_remove_container( + name="test_stop_and_remove_container" + ) + self.assertEqual(exit_code, 0) + + # Check if container is stopped and removed + is_running = container.container_is_running("test_stop_and_remove_container") + self.assertFalse(is_running) + if __name__ == "__main__": unittest.main() diff --git a/fboss-image/distro_cli/tests/proxy_device/Dockerfile b/fboss-image/distro_cli/tests/proxy_device/Dockerfile new file mode 100644 index 0000000000000..2f96a4e905804 --- /dev/null +++ b/fboss-image/distro_cli/tests/proxy_device/Dockerfile @@ -0,0 +1,71 @@ +FROM quay.io/centos/centos:stream9 + +# 
Install systemd and clean up +RUN dnf install -y systemd systemd-libs && \ + dnf clean all && \ + rm -rf /var/cache/dnf && \ + # Remove unnecessary systemd units + rm -f /etc/systemd/system/*.wants/* \ + /lib/systemd/system/multi-user.target.wants/* \ + /lib/systemd/system/local-fs.target.wants/* \ + /lib/systemd/system/sockets.target.wants/*udev* \ + /lib/systemd/system/sockets.target.wants/*initctl* \ + /lib/systemd/system/basic.target.wants/* \ + /lib/systemd/system/anaconda.target.wants/* + +# Install EPEL for btrfs-progs +RUN dnf install -y epel-release && dnf clean all && rm -rf /var/cache/dnf + +# Install SSH server and btrfs tools +RUN dnf install -y \ + openssh-server openssh-clients \ + btrfs-progs \ + tar zstd \ + procps-ng \ + rsync \ + && dnf clean all && rm -rf /var/cache/dnf + +# Configure SSH +RUN ssh-keygen -A && \ + mkdir -p /root/.ssh && \ + chmod 700 /root/.ssh + +# Allow passwordless root login since it is only used in integration tests +RUN sed -i 's/^root:[^:]*:/root::/' /etc/shadow && \ + sed -i 's/#PermitRootLogin.*/PermitRootLogin yes/' /etc/ssh/sshd_config && \ + sed -i 's/#PermitEmptyPasswords.*/PermitEmptyPasswords yes/' /etc/ssh/sshd_config && \ + sed -i 's/#PasswordAuthentication.*/PasswordAuthentication yes/' /etc/ssh/sshd_config && \ + sed -i 's/^UsePAM yes/UsePAM no/' /etc/ssh/sshd_config.d/50-redhat.conf + +# Create FBOSS directory structure +RUN mkdir -p /opt/fboss/bin /opt/fboss/lib /updates + +# Copy service template and create all service scripts from it +COPY parts/services/service_template.sh /tmp/service_template.sh +RUN for svc in wedge_agent fsdb qsfp_service platform_manager sensor_service fan_service data_corral_service; do \ + cp /tmp/service_template.sh /opt/fboss/bin/$svc; \ + done && \ + rm /tmp/service_template.sh + +# Copy setup scripts +COPY parts/systemd/ /etc/systemd/system/ +COPY parts/setup_btrfs.sh /usr/local/bin/ +COPY parts/entrypoint.sh /usr/local/bin/ + +RUN chmod +x /opt/fboss/bin/* /usr/local/bin/*.sh 
+ +# Enable services, SSH, and device init +RUN systemctl enable sshd && \ + systemctl enable device-init && \ + systemctl enable wedge_agent && \ + systemctl enable fsdb && \ + systemctl enable qsfp_service && \ + systemctl enable platform_manager && \ + systemctl enable sensor_service && \ + systemctl enable fan_service && \ + systemctl enable data_corral_service + +EXPOSE 22 + +# Use systemd as init +CMD ["/sbin/init"] diff --git a/fboss-image/distro_cli/tests/proxy_device/README.md b/fboss-image/distro_cli/tests/proxy_device/README.md new file mode 100644 index 0000000000000..e40508a62e0f6 --- /dev/null +++ b/fboss-image/distro_cli/tests/proxy_device/README.md @@ -0,0 +1,150 @@ +# Proxy Device + +A Docker container that simulates a FBOSS device for testing the `fboss-image` CLI commands, particularly the `device update` functionality. + +## Purpose + +The `device update` command creates btrfs subvolumes, installs artifacts, and restarts services. Testing this against real hardware is slow and impractical during development. 
+ +This container provides a lightweight simulation that: +- Runs systemd as init (like a real device) +- Creates a btrfs filesystem on a loopback file +- Runs proxy FBOSS services in per-service btrfs subvolumes +- Accepts SSH connections for CLI commands + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────┐ +│ proxy_device │ +│ │ +│ ┌─────────────────────────────────────────────────────┐ │ +│ │ systemd (PID 1) │ │ +│ │ │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ +│ │ │ wedge_agent │ │ fsdb │ │ qsfp_service│ │ │ +│ │ │ (subvol) │ │ (subvol) │ │ (subvol) │ │ │ +│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │ +│ │ │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ +│ │ │platform_mgr │ │ sensor_svc │ │ fan_service │ │ │ +│ │ │ (subvol) │ │ (subvol) │ │ (subvol) │ │ │ +│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │ +│ │ │ │ +│ │ ┌─────────────┐ │ │ +│ │ │data_corral │ sshd (port 22) │ │ +│ │ │ (subvol) │ │ │ +│ │ └─────────────┘ │ │ +│ └─────────────────────────────────────────────────────┘ │ +│ │ +│ /mnt/btrfs/ │ +│ ├── distro-base/ (base subvolume) │ +│ └── updates/ │ +│ ├── wedge_agent-/ (service subvolume) │ +│ ├── fsdb-/ │ +│ ├── qsfp_service-/ │ +│ └── ... │ +└─────────────────────────────────────────────────────────────┘ +``` + +### Directory Structure Within Subvolumes + +Each btrfs subvolume contains the FBOSS directory structure: + +``` +/opt/fboss/ +├── bin/ +│ └── # Service script (replaced by update) +└── lib/ # Shared libraries +``` + +- **Service scripts** (`/opt/fboss/bin/`) are stub scripts that write their version to `/var/run/.version` and loop forever. +- Updates replace these scripts with new versions. +- Tests verify updates by checking the version file contents. 
+ +## Key Components + +### Dockerfile +Builds a CentOS Stream 9 image with: +- systemd as init +- btrfs-progs for filesystem operations +- SSH server with passwordless root access +- FBOSS service scripts at `/opt/fboss/bin/` + +### parts/setup_btrfs.sh +Runs at first boot via `device-init.service`: +1. Creates a 1GB loopback file at `/var/btrfs.img` +2. Formats it as btrfs and mounts at `/mnt/btrfs` +3. Creates `distro-base` subvolume with FBOSS directory structure +4. Snapshots base into per-service subvolumes under `/updates/` +5. Creates systemd drop-ins setting `RootDirectory=` for each service + +### parts/services/ +Each service script (wedge_agent, fsdb, etc.) is a simple stub that: +1. Writes its VERSION to `/var/run/<service>.version` +2. Logs startup to `/var/log/<service>.log` +3. Loops forever with `sleep 60` + +The initial version is "1.0.0". Updates replace the script with a new version. + +### Version Verification + +Each service writes its version to `/var/run/<service>.version` **inside its btrfs subvolume**. + +To verify from outside the service: +```bash +# Find the service's subvolume and check version +SUBVOL=$(ls -d /mnt/btrfs/updates/wedge_agent-* | head -1) +cat $SUBVOL/var/run/wedge_agent.version +# Output: 1.0.0 (before update) or 2.0.0 (after update) +``` + +Integration tests use this flow: +1. Start container → services run with VERSION="1.0.0" +2. Deploy update with VERSION="2.0.0" artifacts +3. Restart service +4.
Verify version file changed to "2.0.0" + +## Usage + +```bash +# Build the container image +./build.sh + +# Run standalone (for debugging) +docker run -d --privileged --cgroupns=host --name proxy-device \ + fboss_proxy_device /sbin/init + +# SSH into it (passwordless root login) +ssh root@<container-ip> + +# Check services +docker exec proxy-device systemctl status wedge_agent + +# Check subvolumes +docker exec proxy-device ls /mnt/btrfs/updates/ + +# Check service version +docker exec proxy-device bash -c 'cat /mnt/btrfs/updates/wedge_agent-*/var/run/wedge_agent.version' +``` + +## Testing Updates + +The `update` command replaces service scripts and restarts services. + +**How it works:** + +1. Service scripts start with VERSION="1.0.0" at `/opt/fboss/bin/` +2. Systemd runs these scripts: `ExecStart=/opt/fboss/bin/<service>` +3. Updates replace scripts with new versions (e.g., VERSION="2.0.0") +4. Service restarts and writes new version to `/var/run/<service>.version` + +Tests verify updates by checking: +- Version file contains expected version +- Service is running (via systemctl status) +- Log file shows new startup entry + +## Requirements + +- Docker with `--privileged` support (for systemd and loopback mounts) +- `--cgroupns=host` for proper cgroup management diff --git a/fboss-image/distro_cli/tests/proxy_device/build.sh b/fboss-image/distro_cli/tests/proxy_device/build.sh new file mode 100755 index 0000000000000..f4194a2cfe94c --- /dev/null +++ b/fboss-image/distro_cli/tests/proxy_device/build.sh @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +cd "$(dirname "$0")" + +DOCKER_BUILDKIT=1 docker build .
-t fboss_proxy_device diff --git a/fboss-image/distro_cli/tests/proxy_device/parts/entrypoint.sh b/fboss-image/distro_cli/tests/proxy_device/parts/entrypoint.sh new file mode 100644 index 0000000000000..72ec0abbff713 --- /dev/null +++ b/fboss-image/distro_cli/tests/proxy_device/parts/entrypoint.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# Entrypoint script for device container +# This is called by systemd after boot + +set -e + +# Setup btrfs loopback filesystem if not already done +if [ ! -f /var/btrfs.img ]; then + /usr/local/bin/setup_btrfs.sh +fi + +echo "Device container ready" diff --git a/fboss-image/distro_cli/tests/proxy_device/parts/services/service_template.sh b/fboss-image/distro_cli/tests/proxy_device/parts/services/service_template.sh new file mode 100644 index 0000000000000..907cc45701bc1 --- /dev/null +++ b/fboss-image/distro_cli/tests/proxy_device/parts/services/service_template.sh @@ -0,0 +1,11 @@ +#!/bin/bash +VERSION="1.0.0" +SERVICE_NAME=$(basename "$0") +VERSION_FILE="/var/run/${SERVICE_NAME}.version" +LOG_FILE="/var/log/${SERVICE_NAME}.log" + +mkdir -p /var/run /var/log +echo "$VERSION" >"$VERSION_FILE" +echo "$(date): $SERVICE_NAME v$VERSION started (pid $$)" >>"$LOG_FILE" + +while true; do sleep 60; done diff --git a/fboss-image/distro_cli/tests/proxy_device/parts/setup_btrfs.sh b/fboss-image/distro_cli/tests/proxy_device/parts/setup_btrfs.sh new file mode 100644 index 0000000000000..6ffc83414cb61 --- /dev/null +++ b/fboss-image/distro_cli/tests/proxy_device/parts/setup_btrfs.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# Setup btrfs loopback filesystem with base snapshot and per-stack volumes +set -euo pipefail + +# Remove nologin file to allow SSH login (systemd creates this during boot) +rm -f /run/nologin /var/run/nologin + +BTRFS_IMG=/var/btrfs.img +BTRFS_MOUNT=/mnt/btrfs + +# Skip btrfs setup if already mounted (but nologin removal above always runs) +if mountpoint -q "$BTRFS_MOUNT" 2>/dev/null; then + echo "btrfs already mounted, skipping" + exit 0 +fi 
+DISTRO_BASE=/distro-base + +# Clean up any stale loop devices from previous container runs +# (loop devices are shared with host and persist after container exit) +for dev in $(losetup -j "$BTRFS_IMG" 2>/dev/null | cut -d: -f1); do + losetup -d "$dev" 2>/dev/null || true +done + +# Clean up any existing file from previous failed runs +rm -f $BTRFS_IMG + +# Create loopback file (1GB to accommodate full root filesystem copy) +dd if=/dev/zero of=$BTRFS_IMG bs=1M count=1024 + +# Set up loop device first, then format and mount +mkdir -p $BTRFS_MOUNT +LOOP_DEV=$(losetup -f --show $BTRFS_IMG) +mkfs.btrfs $LOOP_DEV +mount $LOOP_DEV $BTRFS_MOUNT + +# Copy the container's root filesystem to btrfs (mimics ONIE extracting rootfs) +# Exclude special filesystems and the btrfs image itself to avoid recursion +echo "Copying root filesystem to btrfs..." +rsync -aAX \ + --exclude=/dev/* \ + --exclude=/proc/* \ + --exclude=/sys/* \ + --exclude=/run/* \ + --exclude=/mnt/* \ + --exclude=/tmp/* \ + --exclude=/var/btrfs.img \ + / $BTRFS_MOUNT/ + +# Create base snapshot from the root copy (mimics real device installation) +# This is what install.sh.tmpl does: btrfs subvolume snapshot ${demo_mnt} ${demo_mnt}/distro-base +echo "Creating base snapshot..." 
+btrfs subvolume snapshot $BTRFS_MOUNT $BTRFS_MOUNT/distro-base + +# Fix /var/run symlink: replace with real directory so services can write version files +# when running with RootDirectory isolation +if [ -L $BTRFS_MOUNT/distro-base/var/run ]; then + rm $BTRFS_MOUNT/distro-base/var/run + mkdir -p $BTRFS_MOUNT/distro-base/var/run +fi + +# Create symlinks for easy access (remove existing dirs first) +rm -rf $DISTRO_BASE /updates +ln -sf $BTRFS_MOUNT/distro-base $DISTRO_BASE + +# Create updates directory on btrfs +mkdir -p $BTRFS_MOUNT/updates +ln -sf $BTRFS_MOUNT/updates /updates + +echo "Base snapshot created at $DISTRO_BASE (snapshot of root filesystem)" + +echo "btrfs setup complete: Base=$DISTRO_BASE" diff --git a/fboss-image/distro_cli/tests/proxy_device/parts/systemd/data_corral_service.service b/fboss-image/distro_cli/tests/proxy_device/parts/systemd/data_corral_service.service new file mode 100644 index 0000000000000..47e4181ac85fc --- /dev/null +++ b/fboss-image/distro_cli/tests/proxy_device/parts/systemd/data_corral_service.service @@ -0,0 +1,13 @@ +[Unit] +Description=Data Corral Service +After=network.target device-init.service +Requires=device-init.service + +[Service] +Type=simple +ExecStart=/opt/fboss/bin/data_corral_service +Restart=always +RestartSec=5 + +[Install] +WantedBy=multi-user.target diff --git a/fboss-image/distro_cli/tests/proxy_device/parts/systemd/device-init.service b/fboss-image/distro_cli/tests/proxy_device/parts/systemd/device-init.service new file mode 100644 index 0000000000000..cbe7d44f12c2a --- /dev/null +++ b/fboss-image/distro_cli/tests/proxy_device/parts/systemd/device-init.service @@ -0,0 +1,13 @@ +[Unit] +Description=Device Container Initialization +After=local-fs.target +Before=wedge_agent.service fsdb.service qsfp_service.service platform_manager.service sensor_service.service fan_service.service +ConditionPathExists=!/var/btrfs.img + +[Service] +Type=oneshot +ExecStart=/usr/local/bin/setup_btrfs.sh +RemainAfterExit=yes + 
+[Install] +WantedBy=multi-user.target diff --git a/fboss-image/distro_cli/tests/proxy_device/parts/systemd/fan_service.service b/fboss-image/distro_cli/tests/proxy_device/parts/systemd/fan_service.service new file mode 100644 index 0000000000000..359a5e076bc18 --- /dev/null +++ b/fboss-image/distro_cli/tests/proxy_device/parts/systemd/fan_service.service @@ -0,0 +1,13 @@ +[Unit] +Description=Fan Service +After=network.target device-init.service +Requires=device-init.service + +[Service] +Type=simple +ExecStart=/opt/fboss/bin/fan_service +Restart=always +RestartSec=5 + +[Install] +WantedBy=multi-user.target diff --git a/fboss-image/distro_cli/tests/proxy_device/parts/systemd/fsdb.service b/fboss-image/distro_cli/tests/proxy_device/parts/systemd/fsdb.service new file mode 100644 index 0000000000000..2a364b1098065 --- /dev/null +++ b/fboss-image/distro_cli/tests/proxy_device/parts/systemd/fsdb.service @@ -0,0 +1,13 @@ +[Unit] +Description=FSDB +After=network.target device-init.service +Requires=device-init.service + +[Service] +Type=simple +ExecStart=/opt/fboss/bin/fsdb +Restart=always +RestartSec=5 + +[Install] +WantedBy=multi-user.target diff --git a/fboss-image/distro_cli/tests/proxy_device/parts/systemd/platform_manager.service b/fboss-image/distro_cli/tests/proxy_device/parts/systemd/platform_manager.service new file mode 100644 index 0000000000000..394a36f355cf3 --- /dev/null +++ b/fboss-image/distro_cli/tests/proxy_device/parts/systemd/platform_manager.service @@ -0,0 +1,13 @@ +[Unit] +Description=Platform Manager +After=network.target device-init.service +Requires=device-init.service + +[Service] +Type=simple +ExecStart=/opt/fboss/bin/platform_manager +Restart=always +RestartSec=5 + +[Install] +WantedBy=multi-user.target diff --git a/fboss-image/distro_cli/tests/proxy_device/parts/systemd/qsfp_service.service b/fboss-image/distro_cli/tests/proxy_device/parts/systemd/qsfp_service.service new file mode 100644 index 0000000000000..9e40f3cebe21f --- /dev/null +++ 
b/fboss-image/distro_cli/tests/proxy_device/parts/systemd/qsfp_service.service @@ -0,0 +1,13 @@ +[Unit] +Description=QSFP Service +After=network.target device-init.service +Requires=device-init.service + +[Service] +Type=simple +ExecStart=/opt/fboss/bin/qsfp_service +Restart=always +RestartSec=5 + +[Install] +WantedBy=multi-user.target diff --git a/fboss-image/distro_cli/tests/proxy_device/parts/systemd/sensor_service.service b/fboss-image/distro_cli/tests/proxy_device/parts/systemd/sensor_service.service new file mode 100644 index 0000000000000..0a86427314fed --- /dev/null +++ b/fboss-image/distro_cli/tests/proxy_device/parts/systemd/sensor_service.service @@ -0,0 +1,13 @@ +[Unit] +Description=Sensor Service +After=network.target device-init.service +Requires=device-init.service + +[Service] +Type=simple +ExecStart=/opt/fboss/bin/sensor_service +Restart=always +RestartSec=5 + +[Install] +WantedBy=multi-user.target diff --git a/fboss-image/distro_cli/tests/proxy_device/parts/systemd/wedge_agent.service b/fboss-image/distro_cli/tests/proxy_device/parts/systemd/wedge_agent.service new file mode 100644 index 0000000000000..2b251e3b6de63 --- /dev/null +++ b/fboss-image/distro_cli/tests/proxy_device/parts/systemd/wedge_agent.service @@ -0,0 +1,13 @@ +[Unit] +Description=Wedge Agent +After=network.target device-init.service +Requires=device-init.service + +[Service] +Type=simple +ExecStart=/opt/fboss/bin/wedge_agent +Restart=always +RestartSec=5 + +[Install] +WantedBy=multi-user.target diff --git a/fboss-image/distro_cli/tests/test_helpers.py b/fboss-image/distro_cli/tests/test_helpers.py index 3edcccbf81676..ea4ce2111dbe7 100644 --- a/fboss-image/distro_cli/tests/test_helpers.py +++ b/fboss-image/distro_cli/tests/test_helpers.py @@ -11,6 +11,7 @@ import os import shutil import tempfile +import time from collections.abc import Generator from contextlib import contextmanager from pathlib import Path @@ -97,3 +98,32 @@ def override_artifact_store_dir(store_dir: Path) -> 
def waitfor(
    condition_fn, assert_fn, timeout: float = 60.0, interval: float = 0.1
) -> None:
    """Poll a condition until it holds or the timeout expires.

    ``condition_fn`` is called first, then the deadline is checked, then the
    function sleeps for ``interval`` seconds and repeats. Because the condition
    is always evaluated before the deadline check, it gets one final chance
    even when the last sleep overshoots the deadline.

    Args:
        condition_fn: Zero-argument callable; a truthy return value ends the
            wait successfully. Note that it must be *called* inside the
            lambda (``path.exists()``, not ``path.exists``) - a bound method
            object is always truthy.
        assert_fn: Zero-argument callable invoked when the timeout expires.
            It is expected to raise (e.g. ``self.fail(...)``).
        timeout: Maximum time to wait in seconds (default: 60.0).
        interval: Time to sleep between checks in seconds (default: 0.1).

    Raises:
        AssertionError: If the timeout expires and ``assert_fn`` returns
            instead of raising. Whatever ``assert_fn`` raises propagates
            unchanged.

    Example:
        waitfor(
            lambda: cache_dir.exists(),
            lambda: self.fail("Cache directory not created"),
            timeout=30.0
        )
    """
    # Use the monotonic clock: wall-clock time can jump (NTP, manual changes),
    # which would make the wait end too early or never time out.
    deadline = time.monotonic() + timeout
    while not condition_fn():
        if time.monotonic() > deadline:
            assert_fn()
            # assert_fn is contractually required to raise; never let a
            # timed-out wait look like a success.
            raise AssertionError("assert_fn should not have returned!")
        time.sleep(interval)
#!/bin/bash
# Enable PXE boot for a device MAC address
# This script is run inside the distro_infra container
#
# Usage: enable_pxeboot.sh <mac_address>
#   mac_address: MAC address in lowercase dash format (aa-bb-cc-dd-ee-ff)
#
# Effects (all under /distro_infra):
#   - creates persistent/<mac>/ and hard-links the iPXE binaries and
#     autoexec.ipxe from persistent/cache into it
#   - touches persistent/<mac>/pxeboot_complete
#   - if the serving interface has a global IPv6 address, writes
#     persistent/<mac>/ipxev6.efi-serverip
#   - writes dnsmasq_conf.d/<mac> to tag the device for PXE boot
#
# Exits non-zero on invalid arguments, a missing/empty interface file, or any
# failing command (set -e).

set -euo pipefail

if [ $# -ne 1 ]; then
  echo "Usage: $0 <mac_address>" >&2
  echo "  mac_address: MAC address in dash format (aa-bb-cc-dd-ee-ff)" >&2
  exit 1
fi

DASH_MAC="$1"

# Validate dash format MAC address
if ! echo "$DASH_MAC" | grep -qE '^[0-9a-f]{2}(-[0-9a-f]{2}){5}$'; then
  echo "Error: Invalid MAC address: $DASH_MAC" >&2
  echo "Expected format: aa-bb-cc-dd-ee-ff" >&2
  exit 1
fi

# Resolve the serving interface before touching any state, so a broken setup
# fails without leaving a half-configured MAC directory behind.
INTERFACE_FILE="/distro_infra/persistent/interface_name.txt"
if [ ! -f "$INTERFACE_FILE" ]; then
  echo "Error: Interface file not found: $INTERFACE_FILE" >&2
  exit 1
fi

INTERFACE=$(cat "$INTERFACE_FILE")
if [ -z "$INTERFACE" ]; then
  echo "Error: Interface file is empty: $INTERFACE_FILE" >&2
  exit 1
fi

echo "Enabling PXE boot for MAC address: $DASH_MAC"

MAC_DIR="/distro_infra/persistent/$DASH_MAC"
mkdir -p "$MAC_DIR"
chmod 777 "$MAC_DIR"

CACHE_DIR="/distro_infra/persistent/cache"

for filename in ipxev4.efi ipxev6.efi autoexec.ipxe; do
  SRC="$CACHE_DIR/$filename"
  DST="$MAC_DIR/$filename"
  rm -f "$DST"
  ln "$SRC" "$DST"
done

touch "$MAC_DIR/pxeboot_complete"

# First global-scope IPv6 address on the interface. awk reads all input rather
# than exiting early so `ip` can never receive SIGPIPE, which pipefail would
# turn into a script failure.
IPV6=$(ip -6 addr show dev "$INTERFACE" scope global |
  awk -F '[[:space:]/]+' '/inet6/ && !found {print $3; found = 1}')

# When booting over IPv6, iPXE only receives a full bootfile-url and cannot
# extract the server IP from it, so generate a tiny iPXE script that sets
# server_ip and then chains to autoexec.ipxe.
if [ -n "$IPV6" ]; then
  cat >"$MAC_DIR/ipxev6.efi-serverip" <<EOF
#!ipxe
set server_ip [${IPV6}]
imgexec autoexec.ipxe
EOF
fi

# dnsmasq matches on the colon-separated form of the MAC address.
COLON_MAC="${DASH_MAC//-/:}"

# Activate IPv4 and IPv6 PXE service for this device. Written last so the
# device is only tagged once all of its boot files are in place.
echo "${COLON_MAC},id:*,set:fbossdut" >"/distro_infra/dnsmasq_conf.d/$DASH_MAC"

echo "PXE boot enabled for $DASH_MAC"
echo "MAC directory: $MAC_DIR"
#!/bin/bash

# getip.sh - MAC Address to IP Resolution Utility (JSON Output)
#
# Description:
#   Resolves IP addresses (IPv4/IPv6) from MAC addresses using the kernel's
#   neighbor table (ARP/NDP cache). Supports optional network interface filtering.
#   Returns results in JSON format.
#
# Usage:
#   getip.sh <MAC_ADDRESS> [INTERFACE]
#
# Algorithm:
#   1. Check neighbor table for existing MAC-to-IP mappings
#   2. If found: Ping specific IPs to verify and refresh the mapping
#   3. If not found: Ping broadcast (IPv4) and multicast (IPv6) to discover devices
#   4. Wait for neighbor table to update (1 second)
#   5. Query neighbor table again and return the IP addresses
#
# Output Format (JSON):
#   Success:
#     {
#       "mac": "aa:bb:cc:dd:ee:ff",
#       "interface": "eth0",           # Optional, if interface specified
#       "ipv4": "192.168.1.100",       # Optional, if IPv4 found
#       "ipv6": "fe80::1"              # Optional, if IPv6 found
#     }
#
#   Error (MAC not found):
#     {
#       "mac": "aa:bb:cc:dd:ee:ff",
#       "error_code": "MAC_NOT_FOUND",
#       "error": "MAC address not found in ip neighbor table."
#     }
#
#   Error (Invalid arguments):
#     {
#       "error_code": "INVALID_ARGUMENTS",
#       "error": "MAC address argument required. Use -h for help."
#     }
#
#   Error (Command failed):
#     {
#       "error_code": "COMMAND_FAILED",
#       "error": "Command 'ip -4 neighbor show dev eth99' failed: Device \"eth99\" does not exist."
#     }
#
# Error Codes:
#   MAC_NOT_FOUND      - The specified MAC address was not found in the neighbor table
#   INVALID_ARGUMENTS  - Missing or invalid command-line arguments
#   COMMAND_FAILED     - A system command (ip) failed to execute (command included in error message)
#
# Exit Codes:
#   0 - Success: IP address found and returned
#   1 - Error: MAC address not found in neighbor table or command failed
#   2 - Error: Invalid arguments or missing MAC address
#
# Dependencies:
#   - iproute (ip command)
#   - iputils (ping, ping6 commands)
#   - jq (JSON processor)
#
# Internal convention: helper functions print their result on stdout and
# return 0; on failure they print an error JSON object on stdout and return
# non-zero, so callers branch on the exit status rather than on the text.

print_usage() {
  cat <<EOF
Usage: $(basename "$0") <MAC_ADDRESS> [INTERFACE]

Get the IP address associated with a MAC address and an optional interface
from the ip neighbor table. Returns results in JSON format.

Arguments:
  MAC_ADDRESS   The MAC address to look up (e.g., aa:bb:cc:dd:ee:ff)
  INTERFACE     (Optional) The network interface to filter the search (e.g., eth0)

Options:
  -h            Show this help message and exit

Output:
  JSON object containing mac, ipv4, ipv6, and optional interface fields.
  On error, returns JSON with error_code and error fields. Possible error codes:
    - INVALID_ARGUMENTS
    - MAC_NOT_FOUND
    - COMMAND_FAILED

Examples:
  $(basename "$0") aa:bb:cc:dd:ee:ff eth0
  $(basename "$0") aa:bb:cc:dd:ee:ff
  $(basename "$0") -h

EOF
}

# Helper function to build JSON error output
# Args: $1=error_code, $2=error_message, $3=MAC address (optional)
build_error_json() {
  local error_code="$1"
  local error_msg="$2"
  local mac="${3:-}"

  # jq handles all quoting/escaping of the message text
  jq -n \
    --arg mac "$mac" \
    --arg error_code "$error_code" \
    --arg error "$error_msg" \
    '(if $mac != "" then {mac: $mac} else {} end) +
     {error_code: $error_code, error: $error}'
}

# Helper function to build JSON output for successful MAC-to-IP resolution
# Args: $1=MAC address, $2=interface (optional), $3=IPv4 address (optional), $4=IPv6 address (optional)
build_success_json() {
  local mac="$1"
  local interface="${2:-}"
  local ipv4="${3:-}"
  local ipv6="${4:-}"

  # Empty optional values are omitted from the object entirely
  jq -n \
    --arg mac "$mac" \
    --arg interface "$interface" \
    --arg ipv4 "$ipv4" \
    --arg ipv6 "$ipv6" \
    '{mac: $mac} +
     (if $interface != "" then {interface: $interface} else {} end) +
     (if $ipv4 != "" then {ipv4: $ipv4} else {} end) +
     (if $ipv6 != "" then {ipv6: $ipv6} else {} end)'
}

# Get IPv4 broadcast address from local interface configs or device (if specified)
# Args: $1=interface (optional)
# Prints the first broadcast address found (nothing if there is none).
# On failure of `ip`, prints a COMMAND_FAILED error JSON and returns 1.
get_ipv4_broadcast() {
  local target_intf="${1:-}"
  local -a dev_args=()
  local output=""

  if [ -n "$target_intf" ]; then
    dev_args=(dev "$target_intf")
  fi

  # Capture stdout+stderr in a variable instead of predictable /tmp files.
  # On failure the captured text is the error message.
  if ! output=$(ip -4 addr show "${dev_args[@]}" 2>&1); then
    build_error_json "COMMAND_FAILED" "Command 'ip -4 addr show ${dev_args[*]}' failed: $output"
    return 1
  fi

  grep -oP 'brd \K[\d.]+' <<<"$output" | head -n 1
}

# Get link-local multicast address for IPv6
get_ipv6_multicast() {
  # Use all-nodes multicast address
  echo "ff02::1"
}

# Helper function to get IP from neighbor table for a given MAC address
# Args: $1=IP version (4 or 6), $2=MAC address, $3=interface (optional)
# Prints the first IP whose link-layer address equals the MAC (nothing if none).
# On failure of `ip`, prints a COMMAND_FAILED error JSON and returns 1.
get_ip_from_neighbor() {
  local ip_version="$1"
  local target_mac="$2"
  local target_intf="${3:-}"
  local -a dev_args=()
  local output=""

  if [ -n "$target_intf" ]; then
    dev_args=(dev "$target_intf")
  fi

  if ! output=$(ip "-${ip_version}" neighbor show "${dev_args[@]}" 2>&1); then
    build_error_json "COMMAND_FAILED" "Command 'ip -${ip_version} neighbor show ${dev_args[*]}' failed: $output"
    return 1
  fi

  # Exact, case-insensitive comparison of the field following "lladdr".
  # The MAC is compared as a plain string, so a partial MAC or regex
  # metacharacters in the argument cannot match an unrelated entry.
  awk -v mac="$target_mac" '
    BEGIN { mac = tolower(mac) }
    {
      for (i = 2; i < NF; i++) {
        if ($i == "lladdr" && tolower($(i + 1)) == mac) {
          print $1
          exit
        }
      }
    }' <<<"$output"
}

# Check if an IP is IPv6
is_ipv6() {
  local ip="$1"
  [[ $ip =~ : ]]
}

# Helper function to ping an IP address with optional interface
# Args: $1=IP address, $2=interface (optional), $3=additional options (optional)
# Returns ping's exit status; all output is suppressed.
ping_ip() {
  local ip_addr="$1"
  local target_intf="${2:-}"
  local extra_options="${3:-}"
  local ping_cmd="ping"
  local -a ping_args=()
  local -a extra_args=()

  if is_ipv6 "$ip_addr"; then
    ping_cmd="ping6"
  fi

  # Extra options (e.g., -b for broadcast) go first, split on whitespace
  if [ -n "$extra_options" ]; then
    read -ra extra_args <<<"$extra_options"
    ping_args+=("${extra_args[@]}")
  fi

  # One probe, one second deadline, quiet
  ping_args+=(-c 1 -w 1 -q)

  if [ -n "$target_intf" ]; then
    ping_args+=(-I "$target_intf")
  fi

  "$ping_cmd" "${ping_args[@]}" "$ip_addr" >/dev/null 2>&1
}

# Resolve a MAC address to its IPv4/IPv6 addresses.
# Args: $1=MAC address, $2=interface (optional)
# Prints the success JSON (which may contain no ipv4/ipv6 key when nothing was
# found) and returns 0, or prints an error JSON and returns 1.
get_ip_from_mac() {
  local target_mac="$1"
  local target_intf="${2:-}"
  local existing_ipv4=""
  local existing_ipv6=""
  local broadcast_ipv4=""
  local ipv4_addr=""
  local ipv6_addr=""

  # Step 1: Check the neighbor table for existing entries (both IPv4 and IPv6)
  if ! existing_ipv4=$(get_ip_from_neighbor 4 "$target_mac" "$target_intf"); then
    echo "$existing_ipv4"
    return 1
  fi

  if ! existing_ipv6=$(get_ip_from_neighbor 6 "$target_mac" "$target_intf"); then
    echo "$existing_ipv6"
    return 1
  fi

  if [ -n "$existing_ipv4" ] || [ -n "$existing_ipv6" ]; then
    # Entry exists, ping the specific IP(s) to verify the MAC-IP mapping.
    # Ping failures are deliberately ignored: the re-query below decides.
    [ -n "$existing_ipv4" ] && ping_ip "$existing_ipv4" "$target_intf"
    [ -n "$existing_ipv6" ] && ping_ip "$existing_ipv6" "$target_intf"
  else
    # Entry doesn't exist, ping the broadcast/multicast addresses so the
    # device answers and populates the neighbor table
    if ! broadcast_ipv4=$(get_ipv4_broadcast "$target_intf"); then
      echo "$broadcast_ipv4"
      return 1
    fi

    [ -n "$broadcast_ipv4" ] && ping_ip "$broadcast_ipv4" "$target_intf" "-b"

    ping_ip "$(get_ipv6_multicast)" "$target_intf"
  fi

  # Wait a moment for the neighbor table to update
  sleep 1

  # Step 2: Check the neighbor table again and return all IPs which match the MAC
  if ! ipv4_addr=$(get_ip_from_neighbor 4 "$target_mac" "$target_intf"); then
    echo "$ipv4_addr"
    return 1
  fi

  if ! ipv6_addr=$(get_ip_from_neighbor 6 "$target_mac" "$target_intf"); then
    echo "$ipv6_addr"
    return 1
  fi

  build_success_json "$target_mac" "$target_intf" "$ipv4_addr" "$ipv6_addr"
}

# Parse arguments
if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
  print_usage
  exit 0
fi

if [ -z "${1:-}" ]; then
  build_error_json "INVALID_ARGUMENTS" "MAC address argument required. Use -h for help."
  exit 2
fi

# Get the IP address for the provided MAC address (and optional interface)
if ! result_json=$(get_ip_from_mac "$@"); then
  # Already an error object (e.g., COMMAND_FAILED), pass it through
  echo "$result_json"
  exit 1
fi

# Success only if at least one address key is present in the object
if jq -e 'has("ipv4") or has("ipv6")' <<<"$result_json" >/dev/null 2>&1; then
  echo "$result_json"
  exit 0
fi

# No IP found and no error - this is MAC_NOT_FOUND
if [ -n "${2:-}" ]; then
  error_msg="MAC address $1 not found in ip neighbor table on interface $2."
else
  error_msg="MAC address $1 not found in ip neighbor table."
fi

build_error_json "MAC_NOT_FOUND" "$error_msg" "$1"
exit 1
# Create the read-only btrfs snapshot of / at /distro-base that later service
# updates are based on.
#
# Does nothing (returns 0) when /distro-base already exists.
# Returns 1 when the snapshot cannot be created.
create_distro_base_snapshot() {
  local base_snapshot="/distro-base"

  if [[ -e $base_snapshot ]]; then
    log "Base snapshot already exists at $base_snapshot (skipping)"
    return 0
  fi

  log "Creating base snapshot for service updates..."

  # -r creates the snapshot read-only in a single step. Creating it writable
  # and flipping the property afterwards could leave a writable base snapshot
  # behind on failure, which every later boot would then skip as "already
  # exists".
  if ! btrfs subvolume snapshot -r / "$base_snapshot"; then
    error "Failed to create $base_snapshot snapshot"
    return 1
  fi

  log "Created $base_snapshot snapshot successfully"
  log "Set $base_snapshot to read-only"
}