Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions cmd/hostagent/subcmds/serve.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"github.com/nvidia/doca-platform/internal/provisioning/hostagent"
"github.com/nvidia/doca-platform/internal/provisioning/hostagent/networkmanager"
"github.com/nvidia/doca-platform/internal/provisioning/hostagent/nodemanager"
"github.com/nvidia/doca-platform/internal/provisioning/hostagent/phase/reboot"
"github.com/nvidia/doca-platform/internal/provisioning/hostagent/service"

"github.com/spf13/cobra"
Expand Down Expand Up @@ -103,11 +104,13 @@ var serveCmd = &cobra.Command{
os.Exit(1)
}

if err := service.NewInstallationService(unCachedClient, nm).Start(true); err != nil {
rh := reboot.NewHandler(mgr.GetClient(), dpuNodeManager.GetNodeName, nm.GetDevice)

if err := service.NewInstallationService(unCachedClient, nm, rh).Start(true); err != nil {
klog.Fatalf("failed to start installation service: %v", err)
}

reconciler := hostagent.NewHostAgentReconciler(mgr.GetClient(), opts.BFBRegistryAddress, dpuNodeManager, nm)
reconciler := hostagent.NewHostAgentReconciler(mgr.GetClient(), opts.BFBRegistryAddress, dpuNodeManager, nm, rh)
if err = reconciler.SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "DPU")
os.Exit(1)
Expand Down
13 changes: 13 additions & 0 deletions internal/provisioning/controllers/util/dms/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,10 @@ func CreateHostAgentPod(ctx context.Context, client client.Client, node *corev1.
Name: "run-udev",
MountPath: "/run/udev",
},
{
Name: "etc-udev-rules",
MountPath: "/etc/udev/rules.d",
},
{
Name: "systemd-network",
MountPath: "/usr/lib/systemd/network",
Expand Down Expand Up @@ -307,6 +311,15 @@ func CreateHostAgentPod(ctx context.Context, client client.Client, node *corev1.
},
},
},
{
Name: "etc-udev-rules",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: "/etc/udev/rules.d",
Type: ptr.To(corev1.HostPathDirectoryOrCreate),
},
},
},
{
Name: "systemd-network",
VolumeSource: corev1.VolumeSource{
Expand Down
5 changes: 3 additions & 2 deletions internal/provisioning/hostagent/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ type HostAgentReconciler struct {
func NewHostAgentReconciler(client client.Client,
bfbRegistryAddress string,
nodeManager nodemanager.Interface,
networkManager networkmanager.Interface) *HostAgentReconciler {
networkManager networkmanager.Interface,
rebootHandler *reboot.Handler) *HostAgentReconciler {
r := &HostAgentReconciler{
Client: client,
NodeManager: nodeManager,
Expand All @@ -70,7 +71,7 @@ func NewHostAgentReconciler(client client.Client,
provisioningv1.DPUInitializeInterface: interfaceinit.NewHandler(client, r.NetworkManager.GetDevice),
provisioningv1.DPUConfigFWParameters: configfw.NewHandler(client, r.NetworkManager.GetDevice),
provisioningv1.DPUOSInstalling: install.NewHandler(client, bfbRegistry, r.NetworkManager.GetDevice),
provisioningv1.DPURebooting: reboot.NewHandler(client, r.NodeManager.GetNodeName, r.NetworkManager.GetDevice),
provisioningv1.DPURebooting: rebootHandler,
provisioningv1.DPUHostNetworkConfiguration: network.NewHandler(r.NetworkManager.AddNetworkRequest),
}
return r
Expand Down
35 changes: 28 additions & 7 deletions internal/provisioning/hostagent/networkmanager/network_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,9 @@ type Interface interface {
Start() error
// GetDevice returns the PCI device by serial number
GetDevice(serialNumber string) (hostutil.Device, bool)
// AddNetworkRequest adds a network request for a DPU
AddNetworkRequest(dpu *provisioningv1.DPU) error
// AddNetworkRequest adds a network request for a DPU.
// If vfCount is non-nil it overrides the value derived from the DPUFlavor.
AddNetworkRequest(dpu *provisioningv1.DPU, vfCount *int) error
}

type NetworkManager struct {
Expand Down Expand Up @@ -185,6 +186,12 @@ func (nm *NetworkManager) processNetworkRequest(nr NetworkRequest) error {
return nil
}
operations := []networkOperation{
{
name: "DisableNMForVFs",
f: func(nr NetworkRequest) error {
return nm.netBackend.EnsureVFsUnmanaged()
},
},
{
name: "CreateP0VF",
f: func(nr NetworkRequest) error {
Expand Down Expand Up @@ -247,7 +254,7 @@ func (nm *NetworkManager) processNetworkRequest(nr NetworkRequest) error {
return nil
}

func (nm *NetworkManager) AddNetworkRequest(dpu *provisioningv1.DPU) error {
func (nm *NetworkManager) AddNetworkRequest(dpu *provisioningv1.DPU, vfCount *int) error {
nm.Lock()
defer nm.Unlock()
if !nm.initialized {
Expand All @@ -256,7 +263,15 @@ func (nm *NetworkManager) AddNetworkRequest(dpu *provisioningv1.DPU) error {
return fmt.Errorf("DPU is nil")
}

if _, ok := nm.reqs[string(dpu.UID)]; ok {
if existing, ok := nm.reqs[string(dpu.UID)]; ok {
if vfCount != nil && *vfCount != 0 && existing.NumOfVFs != *vfCount {
existing.NumOfVFs = *vfCount
if err := writeNetworkRequestFile(&existing); err != nil {
return fmt.Errorf("failed to update network request file: %w", err)
}
nm.reqs[existing.UID] = existing
klog.Infof("Updated VF count to %d for DPU %s/%s", *vfCount, existing.DPUNamespace, existing.DpuName)
}
return nil
}

Expand All @@ -272,9 +287,15 @@ func (nm *NetworkManager) AddNetworkRequest(dpu *provisioningv1.DPU) error {
}
nr.PCIAddress = dev.Address

numOfVFs, err := nm.getNumOfVFs(dpu)
if err != nil {
return fmt.Errorf("failed to get number of VFs: %w", err)
var numOfVFs int
if vfCount != nil && *vfCount != 0 {
numOfVFs = *vfCount
} else {
var err error
numOfVFs, err = nm.getNumOfVFs(dpu)
if err != nil {
return fmt.Errorf("failed to get number of VFs: %w", err)
}
}
nr.NumOfVFs = numOfVFs

Expand Down
6 changes: 3 additions & 3 deletions internal/provisioning/hostagent/phase/network/hostnetwork.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,18 @@ const (
)

type Handler struct {
AddNetworkRequest func(dpu *provisioningv1.DPU) error
AddNetworkRequest func(dpu *provisioningv1.DPU, vfCount *int) error
}

func NewHandler(addNetworkRequest func(dpu *provisioningv1.DPU) error) *Handler {
func NewHandler(addNetworkRequest func(dpu *provisioningv1.DPU, vfCount *int) error) *Handler {
return &Handler{
AddNetworkRequest: addNetworkRequest,
}
}

func (h *Handler) Handle(ctx context.Context, dpu *provisioningv1.DPU) (provisioningv1.DPUStatus, ctrl.Result, error) {
log := log.FromContext(ctx)
if err := h.AddNetworkRequest(dpu); err != nil {
if err := h.AddNetworkRequest(dpu, nil); err != nil {
log.Error(err, "Failed to add network request")
hostutil.NewCondition(condition).Failure(err, "FailedToSetupHostNetwork").Set(&dpu.Status.Conditions)
return dpu.Status, ctrl.Result{}, err
Expand Down
8 changes: 4 additions & 4 deletions internal/provisioning/hostagent/phase/reboot/sync.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,18 +130,18 @@ func (r *Handler) reboot(ctx context.Context, dpuNode *provisioningv1.DPUNode, d
}
}
if runPowerCycle {
if err := r.runPowerCycle(dpuNode, rebootNow); err != nil {
if err := r.RunPowerCycle(dpuNode, rebootNow); err != nil {
return rebootNow, err
}
return nil, nil
}
if err := r.runSLR(ctx, rebootNow); err != nil {
if err := r.RunSLR(ctx, rebootNow); err != nil {
return rebootNow, err
}
return nil, nil
}

func (r *Handler) runPowerCycle(dpuNode *provisioningv1.DPUNode, dpus []provisioningv1.DPU) error {
func (r *Handler) RunPowerCycle(dpuNode *provisioningv1.DPUNode, dpus []provisioningv1.DPU) error {
powerCycleCommand, err := reboot.PowerCycleCommand(dpuNode)
if err != nil {
return fmt.Errorf("failed to get power cycle command: %w", err)
Expand All @@ -157,7 +157,7 @@ func (r *Handler) runPowerCycle(dpuNode *provisioningv1.DPUNode, dpus []provisio
return nil
}

func (r *Handler) runSLR(ctx context.Context, toBeRebooted []provisioningv1.DPU) error {
func (r *Handler) RunSLR(ctx context.Context, toBeRebooted []provisioningv1.DPU) error {
devs := make([]hostutil.Device, len(toBeRebooted))
for i, dpu := range toBeRebooted {
dev, ok := r.getDeviceBySerialNumberFunc(dpu.Spec.SerialNumber)
Expand Down
77 changes: 74 additions & 3 deletions internal/provisioning/hostagent/service/installation_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"time"

provisioningv1 "github.com/nvidia/doca-platform/api/provisioning/v1alpha1"
"github.com/nvidia/doca-platform/internal/provisioning/hostagent/phase/reboot"
"github.com/nvidia/doca-platform/internal/provisioning/hostagent/service/types"

restful "github.com/emicklei/go-restful/v3"
Expand Down Expand Up @@ -67,7 +68,7 @@ const (
// NetworkConfigurator is an interface for triggering host network configuration.
// It is satisfied by networkmanager.NetworkManager.
type NetworkConfigurator interface {
AddNetworkRequest(dpu *provisioningv1.DPU) error
AddNetworkRequest(dpu *provisioningv1.DPU, vfCount *int) error
}

type InstallationService struct {
Expand All @@ -78,16 +79,18 @@ type InstallationService struct {
// listeners maps interface names to their listeners
listeners map[string]net.Listener
networkManager NetworkConfigurator
rebootHandler *reboot.Handler
// stopCh is closed by Stop() to terminate background goroutines
stopCh chan struct{}
stopOnce sync.Once
}

func NewInstallationService(client client.Client, nm NetworkConfigurator) *InstallationService {
func NewInstallationService(client client.Client, nm NetworkConfigurator, rh *reboot.Handler) *InstallationService {
s := &InstallationService{
Client: client,
listeners: make(map[string]net.Listener),
networkManager: nm,
rebootHandler: rh,
stopCh: make(chan struct{}),
}
ws := new(restful.WebService).Path("/")
Expand All @@ -110,6 +113,11 @@ func NewInstallationService(client client.Client, nm NetworkConfigurator) *Insta
Consumes(restful.MIME_JSON).
Produces(restful.MIME_JSON).
To(s.ConfigureHostVFs))
ws.Route(
ws.POST("/trigger-reboot").
Consumes(restful.MIME_JSON).
Produces(restful.MIME_JSON).
To(s.TriggerReboot))
ws.Route(ws.GET("/healthz").To(s.HealthCheck))
// Package repositories: serve .deb and .rpm packages for DPU provisioning.
ws.Route(ws.GET("/deb/{subpath:*}").To(serveRepoFile(debRepoDir)))
Expand Down Expand Up @@ -329,7 +337,7 @@ func (s *InstallationService) ConfigureHostVFs(req *restful.Request, resp *restf
return
}

if err := s.networkManager.AddNetworkRequest(dpu); err != nil {
if err := s.networkManager.AddNetworkRequest(dpu, &request.VFCount); err != nil {
klog.Errorf("failed to add network request for DPU %s/%s: %v", request.DPUNamespace, request.DPUName, err)
_ = resp.WriteError(http.StatusInternalServerError, err)
return
Expand All @@ -339,6 +347,69 @@ func (s *InstallationService) ConfigureHostVFs(req *restful.Request, resp *restf
resp.WriteHeader(http.StatusOK)
}

// TriggerReboot handles POST /trigger-reboot.
//
// The request body is decoded into a types.TriggerRebootRequest. The DPU it
// names is fetched and its UID is compared against the UID in the request so
// that a request issued for a previous incarnation of the DPU object is
// rejected. The requested reboot method is then dispatched to the reboot
// handler: system-level reset, firmware reset and system reboot all go through
// RunSLR, while power cycle goes through RunPowerCycle.
//
// Responses:
//   - 200 when the reboot was triggered successfully
//   - 400 when the body cannot be decoded or the reboot method is unsupported
//   - 404 when the DPU does not exist
//   - 409 when the request UID does not match the current DPU UID
//   - 500 when no reboot handler is configured, an API read fails, or the
//     reboot itself fails
func (s *InstallationService) TriggerReboot(req *restful.Request, resp *restful.Response) {
	// NewInstallationService accepts a nil reboot handler. Without this guard
	// the RunSLR/RunPowerCycle calls below would dereference a nil *reboot.Handler.
	if s.rebootHandler == nil {
		klog.Error("rejecting trigger reboot request: reboot handler is not configured")
		_ = resp.WriteError(http.StatusInternalServerError, fmt.Errorf("reboot handler is not configured"))
		return
	}

	var request types.TriggerRebootRequest
	if err := req.ReadEntity(&request); err != nil {
		klog.Errorf("failed to read trigger reboot request: %v", err)
		_ = resp.WriteError(http.StatusBadRequest, err)
		return
	}
	klog.Infof("Received trigger reboot request: %#v", request)

	ctx := req.Request.Context()

	dpu := &provisioningv1.DPU{}
	if err := s.Get(ctx, client.ObjectKey{Namespace: request.DPUNamespace, Name: request.DPUName}, dpu); err != nil {
		klog.Errorf("failed to get DPU %s/%s: %v", request.DPUNamespace, request.DPUName, err)
		status := http.StatusInternalServerError
		if apierrors.IsNotFound(err) {
			status = http.StatusNotFound
		}
		_ = resp.WriteError(status, err)
		return
	}

	// Guard against acting on a DPU that was deleted and recreated under the
	// same name after the caller captured its UID.
	if string(dpu.UID) != request.DPUUID {
		klog.Warningf("Rejecting trigger reboot request for DPU %s/%s: request UID %q does not match current DPU UID %q",
			request.DPUNamespace, request.DPUName, request.DPUUID, dpu.UID)
		_ = resp.WriteError(http.StatusConflict, fmt.Errorf("stale DPU object: expected UID %q but got %q", request.DPUUID, dpu.UID))
		return
	}

	// Detach from the HTTP request context: the request arrives over tmfifo,
	// and shutting down the ARM severs that connection.
	rebootCtx := context.WithoutCancel(ctx)

	switch request.RebootMethod {
	case provisioningv1.RebootMethodSystemLevelReset,
		provisioningv1.RebootMethodFirmwareReset,
		provisioningv1.RebootMethodSystemReboot:
		if err := s.rebootHandler.RunSLR(rebootCtx, []provisioningv1.DPU{*dpu}); err != nil {
			klog.Errorf("SLR failed for DPU %s/%s: %v", request.DPUNamespace, request.DPUName, err)
			_ = resp.WriteError(http.StatusInternalServerError, err)
			return
		}
	case provisioningv1.RebootMethodPowerCycle:
		dpuNode := &provisioningv1.DPUNode{}
		// NOTE(review): the lookup key carries no namespace. If DPUNode is a
		// namespaced resource this should presumably use dpu.Namespace — confirm.
		if err := s.Get(rebootCtx, client.ObjectKey{Name: dpu.Spec.DPUNodeName}, dpuNode); err != nil {
			klog.Errorf("failed to get DPUNode %s: %v", dpu.Spec.DPUNodeName, err)
			_ = resp.WriteError(http.StatusInternalServerError, err)
			return
		}
		if err := s.rebootHandler.RunPowerCycle(dpuNode, []provisioningv1.DPU{*dpu}); err != nil {
			klog.Errorf("PowerCycle failed for DPU %s/%s: %v", request.DPUNamespace, request.DPUName, err)
			_ = resp.WriteError(http.StatusInternalServerError, err)
			return
		}
	default:
		_ = resp.WriteError(http.StatusBadRequest, fmt.Errorf("unsupported reboot method: %q", request.RebootMethod))
		return
	}

	klog.Infof("Successfully triggered reboot (%s) for DPU %s/%s", request.RebootMethod, request.DPUNamespace, request.DPUName)
	resp.WriteHeader(http.StatusOK)
}

func (s *InstallationService) UpdateStatus(req *restful.Request, resp *restful.Response) {
var request types.UpdateStatusRequest
if err := req.ReadEntity(&request); err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ var _ = Describe("InstallationService", func() {
testNS = &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{GenerateName: "installation-service-testns-"}}
Expect(k8sClient.Create(ctx, testNS)).To(Succeed())

installationService = NewInstallationService(k8sClient, nil)
installationService = NewInstallationService(k8sClient, nil, nil)
Expect(installationService.Start(false)).To(Succeed())
// Start() runs the server in a goroutine; wait until it is listening to avoid connection refused.
Eventually(func() error {
Expand Down
8 changes: 8 additions & 0 deletions internal/provisioning/hostagent/service/types/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,12 @@ type UpdateStatusRequest struct {
// ConfigureHostVFsRequest is the JSON body of a host VF configuration request.
type ConfigureHostVFsRequest struct {
// DPUName is the name of the DPU object the request refers to.
DPUName string `json:"dpuName"`
// DPUNamespace is the namespace of the DPU object the request refers to.
DPUNamespace string `json:"dpuNamespace"`
// VFCount is the requested number of VFs. The handler passes it to
// AddNetworkRequest as the override value; a value of 0 is treated there as
// "no override", in which case the count is derived from the DPU instead.
VFCount int `json:"vfCount"`
}

// TriggerRebootRequest is the JSON body of a POST /trigger-reboot request.
type TriggerRebootRequest struct {
// DPUName is the name of the DPU object to reboot.
DPUName string `json:"dpuName"`
// DPUNamespace is the namespace of the DPU object to reboot.
DPUNamespace string `json:"dpuNamespace"`
// DPUUID is the UID the caller expects the DPU object to have. The handler
// rejects the request with a conflict when it differs from the current UID.
DPUUID string `json:"dpuUID"`
// RebootMethod selects how the reboot is performed; the handler rejects
// methods it does not recognize.
RebootMethod provisioningv1.RebootMethodType `json:"rebootMethod"`
}
5 changes: 5 additions & 0 deletions internal/provisioning/hostagent/util/netconfig/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ type Backend interface {

// IsDHCPConfigured checks if DHCP is enabled for an interface.
IsDHCPConfigured(interfaceName string) (bool, error)

// EnsureVFsUnmanaged ensures that VF interfaces will not be managed by the
// network configuration backend. For NetworkManager this writes a udev rule;
// other backends may no-op.
EnsureVFsUnmanaged() error
}

// ConfigureNetwork orchestrates PF interface and bridge MTU configuration
Expand Down
Loading