Skip to content

Commit

Permalink
Merge pull request #783 from zeeke/us/multiple-device-plugins
Browse files Browse the repository at this point in the history
config-daemon: Restart all instances of device-plugin
  • Loading branch information
zeeke authored Oct 9, 2024
2 parents 92cf81c + f286a04 commit a85ab70
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 40 deletions.
53 changes: 27 additions & 26 deletions pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ func New(
eventRecorder: er,
featureGate: featureGates,
disabledPlugins: disabledPlugins,
mu: &sync.Mutex{},
}
}

Expand Down Expand Up @@ -159,7 +160,6 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error {

var timeout int64 = 5
var metadataKey = "metadata.name"
dn.mu = &sync.Mutex{}
informerFactory := sninformer.NewFilteredSharedInformerFactory(dn.sriovClient,
time.Second*15,
vars.Namespace,
Expand Down Expand Up @@ -683,7 +683,6 @@ func (dn *Daemon) restartDevicePluginPod() error {
defer dn.mu.Unlock()
log.Log.V(2).Info("restartDevicePluginPod(): try to restart device plugin pod")

var podToDelete string
pods, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{
LabelSelector: "app=sriov-device-plugin",
FieldSelector: "spec.nodeName=" + vars.NodeName,
Expand All @@ -702,35 +701,37 @@ func (dn *Daemon) restartDevicePluginPod() error {
log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
return nil
}
podToDelete = pods.Items[0].Name

log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete)
err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{})
if errors.IsNotFound(err) {
log.Log.Info("restartDevicePluginPod(): pod to delete not found")
return nil
}
if err != nil {
log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying")
return err
}

if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) {
_, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{})
for _, pod := range pods.Items {
podToDelete := pod.Name
log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete)
err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{})
if errors.IsNotFound(err) {
log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
return true, nil
log.Log.Info("restartDevicePluginPod(): pod to delete not found")
continue
}

if err != nil {
log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying")
} else {
log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete)
log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying")
return err
}

if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) {
_, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{})
if errors.IsNotFound(err) {
log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
return true, nil
}

if err != nil {
log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying")
} else {
log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete)
}
return false, nil
}, dn.stopCh); err != nil {
log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion")
return err
}
return false, nil
}, dn.stopCh); err != nil {
log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion")
return err
}

return nil
Expand Down
61 changes: 47 additions & 14 deletions pkg/daemon/daemon_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ import (
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/fakefilesystem"
)

var SriovDevicePluginPod corev1.Pod

func TestConfigDaemon(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "Config Daemon Suite")
Expand Down Expand Up @@ -107,19 +109,6 @@ var _ = Describe("Config Daemon", func() {
},
}

SriovDevicePluginPod := corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "sriov-device-plugin-xxxx",
Namespace: vars.Namespace,
Labels: map[string]string{
"app": "sriov-device-plugin",
},
},
Spec: corev1.PodSpec{
NodeName: "test-node",
},
}

err = sriovnetworkv1.AddToScheme(scheme.Scheme)
Expect(err).ToNot(HaveOccurred())
kClient := kclient.NewClientBuilder().WithScheme(scheme.Scheme).WithRuntimeObjects(&corev1.Node{
Expand All @@ -130,7 +119,7 @@ var _ = Describe("Config Daemon", func() {
Namespace: vars.Namespace,
}}).Build()

kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs, &SriovDevicePluginPod)
kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs)
snclient := snclientset.NewSimpleClientset()
err = sriovnetworkv1.InitNicIDMapFromConfigMap(kubeClient, vars.Namespace)
Expect(err).ToNot(HaveOccurred())
Expand Down Expand Up @@ -175,6 +164,22 @@ var _ = Describe("Config Daemon", func() {
err := sut.Run(stopCh, exitCh)
Expect(err).ToNot(HaveOccurred())
}()

SriovDevicePluginPod = corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "sriov-device-plugin-xxxx",
Namespace: vars.Namespace,
Labels: map[string]string{
"app": "sriov-device-plugin",
},
},
Spec: corev1.PodSpec{
NodeName: "test-node",
},
}
_, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), &SriovDevicePluginPod, metav1.CreateOptions{})
Expect(err).ToNot(HaveOccurred())

})

AfterEach(func() {
Expand Down Expand Up @@ -286,6 +291,34 @@ var _ = Describe("Config Daemon", func() {

Expect(sut.desiredNodeState.GetGeneration()).To(BeNumerically("==", 777))
})

It("restart all the sriov-device-plugin pods present on the node", func() {
otherPod1 := SriovDevicePluginPod.DeepCopy()
otherPod1.Name = "sriov-device-plugin-xxxa"
_, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), otherPod1, metav1.CreateOptions{})
Expect(err).ToNot(HaveOccurred())

otherPod2 := SriovDevicePluginPod.DeepCopy()
otherPod2.Name = "sriov-device-plugin-xxxz"
_, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), otherPod2, metav1.CreateOptions{})
Expect(err).ToNot(HaveOccurred())

err = sut.restartDevicePluginPod()
Expect(err).ToNot(HaveOccurred())

Eventually(func() (int, error) {
podList, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{
LabelSelector: "app=sriov-device-plugin",
FieldSelector: "spec.nodeName=test-node",
})

if err != nil {
return 0, err
}

return len(podList.Items), nil
}, "1s").Should(BeZero())
})
})
})

Expand Down

0 comments on commit a85ab70

Please sign in to comment.