Skip to content

Commit

Permalink
Add waitForDevicesInitialization to systemd service
Browse files Browse the repository at this point in the history
This function ensures that the network devices specified in the configuration are registered and handled by UDEV. Sometimes, the initialization of network devices might take a significant amount of time, and the sriov-config systemd service may start before the devices are fully processed, leading to failure.
  • Loading branch information
ykulazhenkov committed Nov 21, 2024
1 parent 5fce462 commit 8d4ae20
Show file tree
Hide file tree
Showing 7 changed files with 131 additions and 0 deletions.
57 changes: 57 additions & 0 deletions cmd/sriov-network-config-daemon/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"errors"
"fmt"
"os"
"time"

"github.com/go-logr/logr"
"github.com/spf13/cobra"
Expand All @@ -40,6 +41,12 @@ import (
const (
PhasePre = "pre"
PhasePost = "post"

// InitializationDeviceDiscoveryTimeoutSec constant defines the number of
// seconds to wait for devices to be registered in the system with the expected name.
InitializationDeviceDiscoveryTimeoutSec = 60
// InitializationDeviceUdevProcessingTimeoutSec constant defines the number of seconds to wait for udev rules to process
InitializationDeviceUdevProcessingTimeoutSec = 60
)

var (
Expand Down Expand Up @@ -104,6 +111,8 @@ func runServiceCmd(cmd *cobra.Command, args []string) error {
return updateSriovResultErr(setupLog, phaseArg, fmt.Errorf("failed to create hostHelpers: %v", err))
}

waitForDevicesInitialization(setupLog, sriovConf, hostHelpers)

if phaseArg == PhasePre {
err = phasePre(setupLog, sriovConf, hostHelpers)
} else {
Expand Down Expand Up @@ -303,3 +312,51 @@ func updateResult(setupLog logr.Logger, result, msg string) error {
setupLog.V(0).Info("result file updated", "SyncStatus", sriovResult.SyncStatus, "LastSyncError", msg)
return nil
}

// waitForDevicesInitialization should be executed in both the pre and post-networking stages.
// This function ensures that the network devices specified in the configuration are registered
// and handled by UDEV. Sometimes, the initialization of network devices might take a significant
// amount of time, and the sriov-config systemd service may start before the devices are fully
// processed, leading to failure.
//
// To address this, we not only check if the devices are registered with the correct name but also
// wait for the udev event queue to empty. This increases the likelihood that the service will start
// only when the devices are fully initialized. It is required to call this function in the
// "post-networking" phase as well because the OS network manager might change device configurations,
// and we need to ensure these changes are fully processed before starting the post-networking part.
//
// The timeouts used in this function are intentionally kept low to avoid blocking the OS loading
// process for too long in case of any issues.
//
// Note: Currently, this function handles only Baremetal clusters. We do not have evidence that
// this logic is required for virtual clusters.
func waitForDevicesInitialization(setupLog logr.Logger, conf *systemd.SriovConfig, hostHelpers helper.HostHelpersInterface) {
if conf.PlatformType != consts.Baremetal {
// skip waiting on virtual cluster
return
}
// wait for devices from the spec to be registered in the system with expected names
devicesToWait := make(map[string]string, len(conf.Spec.Interfaces))
for _, d := range conf.Spec.Interfaces {
devicesToWait[d.PciAddress] = d.Name
}
deadline := time.Now().Add(time.Second * time.Duration(InitializationDeviceDiscoveryTimeoutSec))
for time.Now().Before(deadline) {
for pciAddr, name := range devicesToWait {
if hostHelpers.TryGetInterfaceName(pciAddr) == name {
delete(devicesToWait, pciAddr)
}
}
if len(devicesToWait) == 0 {
break
}
time.Sleep(time.Second)
}
if len(devicesToWait) != 0 {
setupLog.Info("WARNING: some devices were not initialized", "devices", devicesToWait, "timeout", InitializationDeviceDiscoveryTimeoutSec)
}
if err := hostHelpers.WaitUdevEventsProcessed(InitializationDeviceUdevProcessingTimeoutSec); err != nil {
setupLog.Info("WARNING: failed to wait for udev events processing", "reason", err.Error(),
"timeout", InitializationDeviceUdevProcessingTimeoutSec)
}
}
20 changes: 20 additions & 0 deletions cmd/sriov-network-config-daemon/service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package main
import (
"fmt"

"github.com/go-logr/logr"
"github.com/golang/mock/gomock"
"github.com/spf13/cobra"
"gopkg.in/yaml.v3"
Expand Down Expand Up @@ -158,6 +159,8 @@ var _ = Describe("Service", func() {
"/etc/sriov-operator/sriov-interface-result.yaml": []byte("something"),
},
})
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("enp216s0f0np0")
hostHelpers.EXPECT().WaitUdevEventsProcessed(60).Return(nil)
hostHelpers.EXPECT().CheckRDMAEnabled().Return(true, nil)
hostHelpers.EXPECT().TryEnableTun().Return()
hostHelpers.EXPECT().TryEnableVhostNet().Return()
Expand Down Expand Up @@ -211,6 +214,8 @@ var _ = Describe("Service", func() {
"/etc/sriov-operator/sriov-interface-result.yaml": []byte("something"),
},
})
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("enp216s0f0np0")
hostHelpers.EXPECT().WaitUdevEventsProcessed(60).Return(nil)
hostHelpers.EXPECT().CheckRDMAEnabled().Return(true, nil)
hostHelpers.EXPECT().TryEnableTun().Return()
hostHelpers.EXPECT().TryEnableVhostNet().Return()
Expand All @@ -236,6 +241,8 @@ var _ = Describe("Service", func() {
"/etc/sriov-operator/sriov-interface-result.yaml": getTestResultFileContent("InProgress", ""),
},
})
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("enp216s0f0np0")
hostHelpers.EXPECT().WaitUdevEventsProcessed(60).Return(nil)
hostHelpers.EXPECT().DiscoverSriovDevices(hostHelpers).Return([]sriovnetworkv1.InterfaceExt{{
Name: "enp216s0f0np0",
}}, nil)
Expand Down Expand Up @@ -276,4 +283,17 @@ var _ = Describe("Service", func() {
testHelpers.GinkgoAssertFileContentsEquals("/etc/sriov-operator/sriov-interface-result.yaml",
string(getTestResultFileContent("Failed", "post: unexpected result of the pre phase: Failed, syncError: pretest")))
})
It("waitForDevicesInitialization", func() {
cfg := &systemd.SriovConfig{Spec: sriovnetworkv1.SriovNetworkNodeStateSpec{
Interfaces: []sriovnetworkv1.Interface{
{Name: "name1", PciAddress: "0000:d8:00.0"},
{Name: "name2", PciAddress: "0000:d8:00.1"}}}}
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("other")
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.1").Return("")
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("name1")
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.1").Return("")
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.1").Return("name2")
hostHelpers.EXPECT().WaitUdevEventsProcessed(60).Return(nil)
waitForDevicesInitialization(logr.Discard(), cfg, hostHelpers)
})
})
14 changes: 14 additions & 0 deletions pkg/helper/mock/mock_helper.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions pkg/host/internal/udev/udev.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"os"
"path"
"path/filepath"
"strconv"
"strings"

"sigs.k8s.io/controller-runtime/pkg/log"
Expand Down Expand Up @@ -126,6 +127,18 @@ func (u *udev) LoadUdevRules() error {
return nil
}

// WaitUdevEventsProcessed calls `udevadm settle“ with provided timeout
// The command watches the udev event queue, and exits if all current events are handled.
func (u *udev) WaitUdevEventsProcessed(timeout int) error {
log.Log.V(2).Info("WaitUdevEventsProcessed()")
_, stderr, err := u.utilsHelper.RunCommand("udevadm", "settle", "-t", strconv.Itoa(timeout))
if err != nil {
log.Log.Error(err, "WaitUdevEventsProcessed(): failed to wait for udev rules to process", "error", stderr, "timeout", timeout)
return err
}
return nil
}

func (u *udev) addUdevRule(pfPciAddress, ruleName, ruleContent string) error {
log.Log.V(2).Info("addUdevRule()", "device", pfPciAddress, "rule", ruleName)
rulePath := u.getRuleFolderPath()
Expand Down
10 changes: 10 additions & 0 deletions pkg/host/internal/udev/udev_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,4 +210,14 @@ var _ = Describe("UDEV", func() {
Expect(s.LoadUdevRules()).To(MatchError(testError))
})
})
Context("WaitUdevEventsProcessed", func() {
It("Succeed", func() {
utilsMock.EXPECT().RunCommand("udevadm", "settle", "-t", "10").Return("", "", nil)
Expect(s.WaitUdevEventsProcessed(10)).NotTo(HaveOccurred())
})
It("Command Failed", func() {
utilsMock.EXPECT().RunCommand("udevadm", "settle", "-t", "20").Return("", "", testError)
Expect(s.WaitUdevEventsProcessed(20)).To(MatchError(testError))
})
})
})
14 changes: 14 additions & 0 deletions pkg/host/mock/mock_host.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions pkg/host/types/interfaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,9 @@ type UdevInterface interface {
RemoveVfRepresentorUdevRule(pfPciAddress string) error
// LoadUdevRules triggers udev rules for network subsystem
LoadUdevRules() error
// WaitUdevEventsProcessed calls `udevadm settle“ with provided timeout
// The command watches the udev event queue, and exits if all current events are handled.
WaitUdevEventsProcessed(timeout int) error
}

type VdpaInterface interface {
Expand Down

0 comments on commit 8d4ae20

Please sign in to comment.