Skip to content

Commit

Permalink
redesign device plugin
Browse files Browse the repository at this point in the history
always deploy sriov network device plugin and use a label to enable or
disable it on the nodes

Signed-off-by: Sebastian Sch <[email protected]>
  • Loading branch information
SchSeba committed Nov 6, 2024
1 parent 823b4d4 commit 3d20dff
Show file tree
Hide file tree
Showing 14 changed files with 340 additions and 122 deletions.
2 changes: 1 addition & 1 deletion bindata/manifests/plugins/sriov-device-plugin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ spec:
hostNetwork: true
nodeSelector:
{{- range $key, $value := .NodeSelectorField }}
{{ $key }}: {{ $value }}
{{ $key }}: "{{ $value }}"
{{- end }}
tolerations:
- operator: Exists
Expand Down
11 changes: 0 additions & 11 deletions controllers/drain_controller_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,17 +109,6 @@ func (dr *DrainReconcile) handleNodeDrainOrReboot(ctx context.Context,
return reconcile.Result{RequeueAfter: 5 * time.Second}, nil
}

reqLogger.Info("remove Device plugin from node")
err = utils.LabelNode(ctx, node.Name, constants.SriovDevicePluginLabel, constants.SriovDevicePluginLabelDisabled, dr.Client)
if err != nil {
reqLogger.Error(err, "failed to label node for device plugin label",
"labelKey",
constants.SriovDevicePluginLabel,
"labelValue",
constants.SriovDevicePluginLabelDisabled)
return reconcile.Result{}, err
}

// if we manage to drain we label the node state with drain completed and finish
err = utils.AnnotateObject(ctx, nodeNetworkState, constants.NodeStateDrainAnnotationCurrent, constants.DrainComplete, dr.Client)
if err != nil {
Expand Down
16 changes: 11 additions & 5 deletions controllers/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,11 +152,17 @@ func GetDefaultNodeSelector() map[string]string {

// GetDefaultNodeSelectorForDevicePlugin return a nodeSelector with worker linux os
// and the enabled sriov device plugin
func GetDefaultNodeSelectorForDevicePlugin() map[string]string {
return map[string]string{
"kubernetes.io/os": "linux",
constants.SriovDevicePluginLabel: constants.SriovDevicePluginLabelEnabled,
func GetNodeSelectorForDevicePlugin(dc *sriovnetworkv1.SriovOperatorConfig) map[string]string {
if len(dc.Spec.ConfigDaemonNodeSelector) == 0 {
return map[string]string{
"kubernetes.io/os": "linux",
constants.SriovDevicePluginLabel: constants.SriovDevicePluginLabelEnabled,
}
}

tmp := dc.Spec.DeepCopy()
tmp.ConfigDaemonNodeSelector[constants.SriovDevicePluginLabel] = constants.SriovDevicePluginLabelEnabled
return tmp.ConfigDaemonNodeSelector
}

func syncPluginDaemonObjs(ctx context.Context,
Expand All @@ -173,7 +179,7 @@ func syncPluginDaemonObjs(ctx context.Context,
data.Data["ReleaseVersion"] = os.Getenv("RELEASEVERSION")
data.Data["ResourcePrefix"] = vars.ResourcePrefix
data.Data["ImagePullSecrets"] = GetImagePullSecrets()
data.Data["NodeSelectorField"] = GetDefaultNodeSelectorForDevicePlugin()
data.Data["NodeSelectorField"] = GetNodeSelectorForDevicePlugin(dc)
data.Data["UseCDI"] = dc.Spec.UseCDI
objs, err := renderDsForCR(constants.PluginPath, &data)
if err != nil {
Expand Down
50 changes: 41 additions & 9 deletions controllers/sriovnetworknodepolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ import (
sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
constants "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/utils"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars"
)

Expand Down Expand Up @@ -133,10 +134,6 @@ func (r *SriovNetworkNodePolicyReconciler) Reconcile(ctx context.Context, req ct
if err = r.syncDevicePluginConfigMap(ctx, defaultOpConf, policyList, nodeList); err != nil {
return reconcile.Result{}, err
}
// Render and sync Daemon objects
if err = syncPluginDaemonObjs(ctx, r.Client, r.Scheme, defaultOpConf, policyList); err != nil {
return reconcile.Result{}, err
}

// All was successful. Request that this be re-triggered after ResyncPeriod,
// so we can reconcile state again.
Expand Down Expand Up @@ -182,6 +179,12 @@ func (r *SriovNetworkNodePolicyReconciler) SetupWithManager(mgr ctrl.Manager) er
Info("Enqueuing sync for create event", "resource", e.Object.GetName())
qHandler(q)
},
UpdateFunc: func(ctx context.Context, e event.UpdateEvent, q workqueue.RateLimitingInterface) {
reflect.DeepEqual(e.ObjectOld.GetLabels(), e.ObjectNew.GetLabels())
log.Log.WithName("SriovNetworkNodePolicy").
Info("Enqueuing sync for create event", "resource", e.ObjectNew.GetName())
qHandler(q)
},
DeleteFunc: func(ctx context.Context, e event.DeleteEvent, q workqueue.RateLimitingInterface) {
log.Log.WithName("SriovNetworkNodePolicy").
Info("Enqueuing sync for delete event", "resource", e.Object.GetName())
Expand Down Expand Up @@ -219,6 +222,30 @@ func (r *SriovNetworkNodePolicyReconciler) syncDevicePluginConfigMap(ctx context
return err
}
configData[node.Name] = string(config)

if data.ResourceList == nil || len(data.ResourceList) == 0 {
// if we don't have policies we should add the disabled label for the device plugin
err = utils.LabelNode(ctx, node.Name, constants.SriovDevicePluginLabel, constants.SriovDevicePluginLabelDisabled, r.Client)
if err != nil {
logger.Error(err, "failed to label node for device plugin label",
"labelKey",
constants.SriovDevicePluginLabel,
"labelValue",
constants.SriovDevicePluginLabelDisabled)
return err
}
} else {
// if we have policies we should add the enabled label for the device plugin
err = utils.LabelNode(ctx, node.Name, constants.SriovDevicePluginLabel, constants.SriovDevicePluginLabelEnabled, r.Client)
if err != nil {
logger.Error(err, "failed to label node for device plugin label",
"labelKey",
constants.SriovDevicePluginLabel,
"labelValue",
constants.SriovDevicePluginLabelEnabled)
return err
}
}
}

cm := &corev1.ConfigMap{
Expand Down Expand Up @@ -296,8 +323,15 @@ func (r *SriovNetworkNodePolicyReconciler) syncAllSriovNetworkNodeStates(ctx con
}
}
if !found {
// remove device plugin labels
logger.Info("removing device plugin label from node as SriovNetworkNodeState doesn't exist", "nodeStateName", ns.Name)
err = utils.RemoveLabelFromNode(ctx, ns.Name, constants.SriovDevicePluginLabel, r.Client)
if err != nil {
logger.Error(err, "Fail to remove device plugin label from node", "node", ns.Name)
return err
}
logger.Info("Deleting SriovNetworkNodeState as node with that name doesn't exist", "nodeStateName", ns.Name)
err := r.Delete(ctx, &ns, &client.DeleteOptions{})
err = r.Delete(ctx, &ns, &client.DeleteOptions{})
if err != nil {
logger.Error(err, "Fail to Delete", "SriovNetworkNodeState CR:", ns.GetName())
return err
Expand Down Expand Up @@ -415,13 +449,13 @@ func (r *SriovNetworkNodePolicyReconciler) renderDevicePluginConfigData(ctx cont
found, i := resourceNameInList(p.Spec.ResourceName, &rcl)

if found {
err := updateDevicePluginResource(ctx, &rcl.ResourceList[i], &p, nodeState)
err := updateDevicePluginResource(&rcl.ResourceList[i], &p, nodeState)
if err != nil {
return rcl, err
}
logger.V(1).Info("Update resource", "Resource", rcl.ResourceList[i])
} else {
rc, err := createDevicePluginResource(ctx, &p, nodeState)
rc, err := createDevicePluginResource(&p, nodeState)
if err != nil {
return rcl, err
}
Expand All @@ -442,7 +476,6 @@ func resourceNameInList(name string, rcl *dptypes.ResourceConfList) (bool, int)
}

func createDevicePluginResource(
ctx context.Context,
p *sriovnetworkv1.SriovNetworkNodePolicy,
nodeState *sriovnetworkv1.SriovNetworkNodeState) (*dptypes.ResourceConfig, error) {
netDeviceSelectors := dptypes.NetDeviceSelectors{}
Expand Down Expand Up @@ -516,7 +549,6 @@ func createDevicePluginResource(
}

func updateDevicePluginResource(
ctx context.Context,
rc *dptypes.ResourceConfig,
p *sriovnetworkv1.SriovNetworkNodePolicy,
nodeState *sriovnetworkv1.SriovNetworkNodeState) error {
Expand Down
137 changes: 136 additions & 1 deletion controllers/sriovnetworknodepolicy_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,20 @@ package controllers
import (
"context"
"encoding/json"
"sync"
"testing"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

"github.com/google/go-cmp/cmp"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"

k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"

dptypes "github.com/k8snetworkplumbingwg/sriov-network-device-plugin/pkg/types"
Expand Down Expand Up @@ -126,3 +132,132 @@ func TestRenderDevicePluginConfigData(t *testing.T) {
})
}
}

var _ = Describe("SriovnetworkNodePolicy controller", Ordered, func() {
var cancel context.CancelFunc
var ctx context.Context

BeforeAll(func() {
By("Create SriovOperatorConfig controller k8s objs")
config := makeDefaultSriovOpConfig()
Expect(k8sClient.Create(context.Background(), config)).Should(Succeed())
DeferCleanup(func() {
err := k8sClient.Delete(context.Background(), config)
Expect(err).ToNot(HaveOccurred())
})

// setup controller manager
By("Setup controller manager")
k8sManager, err := setupK8sManagerForTest()
Expect(err).ToNot(HaveOccurred())

err = (&SriovNetworkNodePolicyReconciler{
Client: k8sManager.GetClient(),
Scheme: k8sManager.GetScheme(),
FeatureGate: featuregate.New(),
}).SetupWithManager(k8sManager)
Expect(err).ToNot(HaveOccurred())

ctx, cancel = context.WithCancel(context.Background())

wg := sync.WaitGroup{}
wg.Add(1)
go func() {
defer wg.Done()
defer GinkgoRecover()
By("Start controller manager")
err := k8sManager.Start(ctx)
Expect(err).ToNot(HaveOccurred())
}()

DeferCleanup(func() {
By("Shut down manager")
cancel()
wg.Wait()
})
})
AfterEach(func() {
err := k8sClient.DeleteAllOf(context.Background(), &corev1.Node{})
Expect(err).ToNot(HaveOccurred())

err = k8sClient.DeleteAllOf(context.Background(), &sriovnetworkv1.SriovNetworkNodePolicy{}, k8sclient.InNamespace(vars.Namespace))
Expect(err).ToNot(HaveOccurred())

err = k8sClient.DeleteAllOf(context.Background(), &sriovnetworkv1.SriovNetworkNodeState{}, k8sclient.InNamespace(vars.Namespace))
Expect(err).ToNot(HaveOccurred())
})
Context("device plugin labels", func() {
It("Should add the right labels to the nodes", func() {
node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{
Name: "node0",
Labels: map[string]string{"kubernetes.io/os": "linux",
"node-role.kubernetes.io/worker": ""},
}}
Expect(k8sClient.Create(ctx, node)).To(Succeed())

nodeState := &sriovnetworkv1.SriovNetworkNodeState{}
Eventually(func(g Gomega) {
err := k8sClient.Get(context.TODO(), k8sclient.ObjectKey{Name: "node0", Namespace: testNamespace}, nodeState)
g.Expect(err).ToNot(HaveOccurred())
}, time.Minute, time.Second).Should(Succeed())

Eventually(func(g Gomega) {
err := k8sClient.Get(context.Background(), k8sclient.ObjectKey{Name: node.Name}, node)
g.Expect(err).ToNot(HaveOccurred())
value, exist := node.Labels[consts.SriovDevicePluginLabel]
g.Expect(exist).To(BeTrue())
g.Expect(value).To(Equal(consts.SriovDevicePluginLabelDisabled))
}, time.Minute, time.Second).Should(Succeed())

nodeState.Status.Interfaces = sriovnetworkv1.InterfaceExts{
sriovnetworkv1.InterfaceExt{
Vendor: "8086",
Driver: "i40e",
Mtu: 1500,
Name: "ens803f0",
PciAddress: "0000:86:00.0",
NumVfs: 0,
TotalVfs: 64,
},
}
err := k8sClient.Status().Update(context.Background(), nodeState)
Expect(err).ToNot(HaveOccurred())

somePolicy := &sriovnetworkv1.SriovNetworkNodePolicy{}
somePolicy.SetNamespace(testNamespace)
somePolicy.SetName("some-policy")
somePolicy.Spec = sriovnetworkv1.SriovNetworkNodePolicySpec{
NumVfs: 5,
NodeSelector: map[string]string{"node-role.kubernetes.io/worker": ""},
NicSelector: sriovnetworkv1.SriovNetworkNicSelector{Vendor: "8086"},
Priority: 20,
}
Expect(k8sClient.Create(context.Background(), somePolicy)).ToNot(HaveOccurred())

Eventually(func(g Gomega) {
err := k8sClient.Get(context.Background(), k8sclient.ObjectKey{Name: node.Name}, node)
g.Expect(err).ToNot(HaveOccurred())
value, exist := node.Labels[consts.SriovDevicePluginLabel]
g.Expect(exist).To(BeTrue())
g.Expect(value).To(Equal(consts.SriovDevicePluginLabelEnabled))
}, time.Minute, time.Second).Should(Succeed())

delete(node.Labels, "node-role.kubernetes.io/worker")
err = k8sClient.Update(context.Background(), node)
Expect(err).ToNot(HaveOccurred())

Eventually(func(g Gomega) {
err := k8sClient.Get(context.Background(), k8sclient.ObjectKey{Name: node.Name}, node)
g.Expect(err).ToNot(HaveOccurred())
_, exist := node.Labels[consts.SriovDevicePluginLabel]
g.Expect(exist).To(BeFalse())
}, time.Minute, time.Second).Should(Succeed())

Eventually(func(g Gomega) {
err := k8sClient.Get(context.Background(), k8sclient.ObjectKey{Name: node.Name, Namespace: testNamespace}, nodeState)
Expect(err).To(HaveOccurred())
Expect(errors.IsNotFound(err)).To(BeTrue())
}, time.Minute, time.Second).Should(Succeed())
})
})
})
8 changes: 4 additions & 4 deletions controllers/sriovoperatorconfig_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,12 @@ import (
machinev1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1"

sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
apply "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/apply"
consts "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/apply"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate"
snolog "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/log"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms"
render "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/render"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/render"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars"
)

Expand Down Expand Up @@ -140,7 +140,7 @@ func (r *SriovOperatorConfigReconciler) Reconcile(ctx context.Context, req ctrl.
return reconcile.Result{}, err
}

if err = syncPluginDaemonObjs(ctx, r.Client, r.Scheme, defaultConfig, policyList); err != nil {
if err = syncPluginDaemonObjs(ctx, r.Client, r.Scheme, defaultConfig); err != nil {
return reconcile.Result{}, err
}

Expand Down
Loading

0 comments on commit 3d20dff

Please sign in to comment.