wip: handle driver migrations in nvidiadriver controller

Signed-off-by: Christopher Desiniotis <[email protected]>
NVIDIA · Sep 9, 2024 · ff977e9 · ff977e9
1 parent 81640fd
commit ff977e9
Show file tree

Hide file tree

Showing 20 changed files with 208 additions and 6 deletions.
diff --git a/controllers/nvidiadriver_controller.go b/controllers/nvidiadriver_controller.go
@@ -20,12 +20,14 @@ import (
 	"context"
 	"fmt"
 	"maps"
+	"os"
 	"time"
 
 	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/util/workqueue"
@@ -57,6 +59,7 @@ type NVIDIADriverReconciler struct {
 	stateManager          state.Manager
 	nodeSelectorValidator validator.Validator
 	conditionUpdater      conditions.Updater
+	operatorNamespace     string
 }
 
 //+kubebuilder:rbac:groups=nvidia.com,resources=nvidiadrivers,verbs=get;list;watch;create;update;patch;delete
@@ -124,6 +127,8 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
 	}
 	clusterPolicyInstance := clusterPolicyList.Items[0]
 
+	r.operatorNamespace = os.Getenv("OPERATOR_NAMESPACE")
+
 	// Create a new InfoCatalog which is a generic interface for passing information to state managers
 	infoCatalog := state.NewInfoCatalog()
 
@@ -168,6 +173,16 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
 		return reconcile.Result{}, nil
 	}
 
+	clusterpolicyDriverLabels, err := getClusterpolicyDriverLabels(r.ClusterInfo, clusterPolicyInstance)
+	if err != nil {
+		return reconcile.Result{}, fmt.Errorf("failed to get clusterpolicy driver labels: %w", err)
+	}
+
+	err = updateNodesManagedByDriver(ctx, r, instance, clusterpolicyDriverLabels)
+	if err != nil {
+		return reconcile.Result{}, fmt.Errorf("failed to update nodes managed by driver: %w", err)
+	}
+
 	// Sync state and update status
 	managerStatus := r.stateManager.SyncState(ctx, instance, infoCatalog)
 
@@ -191,6 +206,9 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
 			}
 		}
 		// if no errors are reported from any state, then we would be waiting on driver daemonset pods
+		// TODO: Avoid marking 'default' NVIDIADriver instances as NotReady if DesiredNumberScheduled == 0.
+		//       This will avoid unnecessary reconciliations when the 'default' instance is overriden on all
+		//       GPU nodes, and thus, DesiredNumberScheduled being 0 is valid.
 		if errorInfo == nil {
 			condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.DriverNotReady, "Waiting for driver pod to be ready")
 			if condErr != nil {
@@ -404,5 +422,154 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.
 		return fmt.Errorf("failed to add index key: %w", err)
 	}
 
+	if err := mgr.GetFieldIndexer().IndexField(ctx, &corev1.Pod{}, "spec.nodeName", func(rawObj client.Object) []string {
+		pod := rawObj.(*corev1.Pod)
+		return []string{pod.Spec.NodeName}
+	}); err != nil {
+		return err
+	}
+
 	return nil
 }
+
+func updateNodesManagedByDriver(ctx context.Context, r *NVIDIADriverReconciler, instance *nvidiav1alpha1.NVIDIADriver, clusterpolicyDriverLabels map[string]string) error {
+	nodes, err := getNVIDIADriverSelectedNodes(ctx, r.Client, instance)
+	if err != nil {
+		return fmt.Errorf("failed to get selected nodes for NVIDIADriver CR: %w", err)
+	}
+
+	// A map tracking which node objects need to be updated. E.g. updated label / annotations
+	// need to be applied.
+	nodesToUpdate := map[*corev1.Node]struct{}{}
+
+	for _, node := range nodes.Items {
+		labels := node.GetLabels()
+		annotations := node.GetAnnotations()
+
+		managedBy, exists := labels["nvidia.com/gpu.driver.managed-by"]
+		if !exists {
+			// if 'managed-by' label does not exist, label node with cr.Name
+			labels["nvidia.com/gpu.driver.managed-by"] = instance.Name
+			node.SetLabels(labels)
+			nodesToUpdate[&node] = struct{}{}
+			// If there is an orphan driver pod running on the node,
+			// indicate to the upgrade controller that an upgrade is required.
+			// This will occur when we are migrating from a Clusterpolicy owned
+			// daemonset to a NVIDIADriver owned daemonset.
+			podList := &corev1.PodList{}
+			err = r.Client.List(ctx, podList,
+				client.InNamespace(r.operatorNamespace),
+				client.MatchingLabels(clusterpolicyDriverLabels),
+				client.MatchingFields{"spec.nodeName": node.Name})
+			if err != nil {
+				return fmt.Errorf("failed to list driver pods: %w", err)
+			}
+			if len(podList.Items) == 0 {
+				continue
+			}
+			pod := podList.Items[0]
+			if pod.OwnerReferences == nil || len(pod.OwnerReferences) == 0 {
+				annotations["nvidia.com/gpu-driver-upgrade-requested"] = "true"
+				node.SetAnnotations(annotations)
+			}
+			continue
+		}
+
+		// do nothing if node is already being managed by this CR
+		if managedBy == instance.Name {
+			continue
+		}
+
+		// If node is 'managed-by' another CR, there are several scenarios
+		//    1) There is no driver pod running on the node. Therefore the label is stale.
+		//    2) There is a driver pod running on the node, and it is not an orphan. There are
+		//       two possible actions:
+		//           a) If the NVIDIADriver instance we are currently reconciling is the 'default'
+		//              instance (the node selector is empty), do nothing. All other NVIDIADriver
+		//              instances take precedence.
+		//           b) The pod running no longer falls into the node pool of the CR it is currently
+		//              being managed by. Thus, the NVIDIADriver instance we are currently reconciling
+		//              should take ownership of the node.
+		podList := &corev1.PodList{}
+		err = r.Client.List(ctx, podList,
+			client.InNamespace(r.operatorNamespace),
+			client.MatchingLabels(map[string]string{AppComponentLabelKey: AppComponentLabelValue}),
+			client.MatchingFields{"spec.nodeName": node.Name})
+		if err != nil {
+			return fmt.Errorf("failed to list driver pods: %w", err)
+		}
+		if len(podList.Items) == 0 {
+			labels["nvidia.com/gpu.driver.managed-by"] = instance.Name
+			node.SetLabels(labels)
+			nodesToUpdate[&node] = struct{}{}
+			continue
+		}
+		if instance.Spec.NodeSelector == nil || len(instance.Spec.NodeSelector) == 0 {
+			// If the nodeSelector for the NVIDIADriver instance is empty, then we
+			// treat it as the 'default' CR which has the lowest precedence. Allow
+			// the existing driver pod, owned by another NVIDIADriver CR, to continue
+			// to run.
+			continue
+		}
+		pod := podList.Items[0]
+		if pod.OwnerReferences != nil && len(pod.OwnerReferences) > 0 {
+			err := r.Client.Patch(ctx, &pod, client.RawPatch(types.MergePatchType, []byte(fmt.Sprintf(`{"metadata":{"labels":{"app": null}}}`))))
+			if err != nil {
+				return fmt.Errorf("failed to patch pod in order to make it an orphan: %w", err)
+			}
+		}
+		labels["nvidia.com/gpu.driver.managed-by"] = instance.Name
+		annotations["nvidia.com/gpu-driver-upgrade-requested"] = "true"
+		node.SetLabels(labels)
+		node.SetAnnotations(annotations)
+		nodesToUpdate[&node] = struct{}{}
+	}
+
+	// Apply updated labels / annotations on node objects
+	for node := range nodesToUpdate {
+		err = r.Client.Update(ctx, node)
+		if err != nil {
+			return fmt.Errorf("failed to update node %s: %w", node.Name, err)
+		}
+	}
+
+	return nil
+}
+
+// getNVIDIADriverSelectedNodes returns selected nodes based on the nodeselector labels set for a given NVIDIADriver instance
+func getNVIDIADriverSelectedNodes(ctx context.Context, k8sClient client.Client, cr *nvidiav1alpha1.NVIDIADriver) (*corev1.NodeList, error) {
+	nodeList := &corev1.NodeList{}
+
+	if cr.Spec.NodeSelector == nil {
+		cr.Spec.NodeSelector = cr.GetNodeSelector()
+	}
+
+	selector := labels.Set(cr.Spec.NodeSelector).AsSelector()
+
+	opts := []client.ListOption{
+		client.MatchingLabelsSelector{Selector: selector},
+	}
+	err := k8sClient.List(ctx, nodeList, opts...)
+
+	return nodeList, err
+}
+
+// getClusterpolicyDriverLabels returns a set of labels that can be used to identify driver pods running that are owned by Clusterpolicy
+func getClusterpolicyDriverLabels(clusterInfo clusterinfo.Interface, clusterpolicy gpuv1.ClusterPolicy) (map[string]string, error) {
+	// initialize with common app=nvidia-driver-daemonset label
+	driverLabelKey := DriverLabelKey
+	driverLabelValue := DriverLabelValue
+
+	ocpVer, err := clusterInfo.GetOpenshiftVersion()
+	if err != nil {
+		return nil, fmt.Errorf("failed to get the OpenShift version: %w", err)
+	}
+	if ocpVer != "" && clusterpolicy.Spec.Operator.UseOpenShiftDriverToolkit != nil && *clusterpolicy.Spec.Operator.UseOpenShiftDriverToolkit == true {
+		// For OCP, when DTK is enabled app=nvidia-driver-daemonset label is not
+		// constant and changes based on rhcos version. Hence use DTK label instead
+		driverLabelKey = ocpDriverToolkitIdentificationLabel
+		driverLabelValue = ocpDriverToolkitIdentificationValue
+	}
+
+	return map[string]string{driverLabelKey: driverLabelValue}, nil
+}
diff --git a/internal/state/driver.go b/internal/state/driver.go
@@ -548,6 +548,7 @@ func getDriverSpec(cr *nvidiav1alpha1.NVIDIADriver, nodePool nodePool) (*driverS
 
 	return &driverSpec{
 		Spec:             spec,
+		CRName:           cr.Name,
 		AppName:          nvidiaDriverAppName,
 		Name:             nvidiaDriverName,
 		ImagePath:        imagePath,

diff --git a/internal/state/driver_test.go b/internal/state/driver_test.go
@@ -624,6 +624,7 @@ func getMinimalDriverRenderData() *driverRenderData {
 				ReadinessProbe: getDefaultContainerProbeSpec(),
 				DriverType:     nvidiav1alpha1.GPU,
 			},
+			CRName:           "test-cr",
 			AppName:          "nvidia-gpu-driver-ubuntu22.04-7c6d7bd86b",
 			Name:             "nvidia-gpu-driver-ubuntu22.04",
 			ImagePath:        "nvcr.io/nvidia/driver:525.85.03-ubuntu22.04",

diff --git a/internal/state/testdata/golden/driver-additional-configs.yaml b/internal/state/testdata/golden/driver-additional-configs.yaml
@@ -240,6 +240,7 @@ spec:
           name: run-mellanox-drivers
       nodeSelector:
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-ubuntu22.04
       tolerations:

diff --git a/internal/state/testdata/golden/driver-full-spec.yaml b/internal/state/testdata/golden/driver-full-spec.yaml
@@ -254,9 +254,8 @@ spec:
           mountPropagation: HostToContainer
           name: run-mellanox-drivers
       nodeSelector:
-        example.com/bar: bar
-        example.com/foo: foo
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: custom-priority-class-name
       serviceAccountName: nvidia-gpu-driver-ubuntu22.04
       tolerations:

diff --git a/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml b/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml
@@ -407,6 +407,7 @@ spec:
       nodeSelector:
         feature.node.kubernetes.io/system-os_release.OSTREE_VERSION: 413.92.202304252344-0
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-openshift
       tolerations:

diff --git a/internal/state/testdata/golden/driver-gdrcopy.yaml b/internal/state/testdata/golden/driver-gdrcopy.yaml
@@ -296,6 +296,7 @@ spec:
           name: run-mellanox-drivers
       nodeSelector:
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-ubuntu22.04
       tolerations:

diff --git a/internal/state/testdata/golden/driver-gds.yaml b/internal/state/testdata/golden/driver-gds.yaml
@@ -296,6 +296,7 @@ spec:
           name: run-mellanox-drivers
       nodeSelector:
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-ubuntu22.04
       tolerations:

diff --git a/internal/state/testdata/golden/driver-minimal.yaml b/internal/state/testdata/golden/driver-minimal.yaml
@@ -231,6 +231,7 @@ spec:
           name: run-mellanox-drivers
       nodeSelector:
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-ubuntu22.04
       tolerations:

diff --git a/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml b/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml
@@ -341,6 +341,7 @@ spec:
       nodeSelector:
         feature.node.kubernetes.io/system-os_release.OSTREE_VERSION: 413.92.202304252344-0
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-openshift
       tolerations:

diff --git a/internal/state/testdata/golden/driver-precompiled.yaml b/internal/state/testdata/golden/driver-precompiled.yaml
@@ -234,6 +234,7 @@ spec:
       nodeSelector:
         feature.node.kubernetes.io/kernel-version.full: 5.4.0-150-generic
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-ubuntu22.04
       tolerations:

diff --git a/internal/state/testdata/golden/driver-rdma-hostmofed.yaml b/internal/state/testdata/golden/driver-rdma-hostmofed.yaml
@@ -307,6 +307,7 @@ spec:
           name: run-mellanox-drivers
       nodeSelector:
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-ubuntu22.04
       tolerations:

diff --git a/internal/state/testdata/golden/driver-rdma.yaml b/internal/state/testdata/golden/driver-rdma.yaml
@@ -301,6 +301,7 @@ spec:
           name: run-mellanox-drivers
       nodeSelector:
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-ubuntu22.04
       tolerations:

diff --git a/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml b/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml
@@ -302,6 +302,7 @@ spec:
       nodeSelector:
         feature.node.kubernetes.io/system-os_release.OSTREE_VERSION: 413.92.202304252344-0
         nvidia.com/gpu.deploy.vgpu-manager: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-vgpu-manager-openshift
       tolerations:

diff --git a/internal/state/testdata/golden/driver-vgpu-host-manager.yaml b/internal/state/testdata/golden/driver-vgpu-host-manager.yaml
@@ -217,6 +217,7 @@ spec:
           name: run-mellanox-drivers
       nodeSelector:
         nvidia.com/gpu.deploy.vgpu-manager: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-vgpu-manager-ubuntu22.04
       tolerations:

diff --git a/internal/state/testdata/golden/driver-vgpu-licensing.yaml b/internal/state/testdata/golden/driver-vgpu-licensing.yaml
@@ -237,6 +237,7 @@ spec:
           name: run-mellanox-drivers
       nodeSelector:
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-ubuntu22.04
       tolerations:

diff --git a/internal/state/types.go b/internal/state/types.go
@@ -30,6 +30,7 @@ type SyncingSource source.SyncingSource
 // which is to be populated with the fully-qualified image path.
 type driverSpec struct {
 	Spec              *nvidiav1alpha1.NVIDIADriverSpec
+	CRName            string
 	AppName           string
 	Name              string
 	ImagePath         string

diff --git a/internal/validator/validator.go b/internal/validator/validator.go
@@ -51,8 +51,30 @@ func (nsv *nodeSelectorValidator) Validate(ctx context.Context, cr *nvidiav1alph
 		return err
 	}
 
+	crHasEmptyNodeSelector := false
+	if cr.Spec.NodeSelector == nil || len(cr.Spec.NodeSelector) == 0 {
+		crHasEmptyNodeSelector = true
+	}
+
 	names := map[string]struct{}{}
-	for di := range drivers.Items {
+	for di, driver := range drivers.Items {
+		if driver.Name == cr.Name {
+			continue
+		}
+
+		if crHasEmptyNodeSelector {
+			// If the CR we are validating has an empty node selector, it does not conflict
+			// with any other CR unless there is another CR that also is configured with an
+			// empty node selector. Only one NVIDIADriver instance can be configured with an
+			// empty node selector at any point in time. We deem such an instance as the 'default'
+			// instance, as it will get deployed on all GPU nodes. Other CRs, with non-empty
+			// node selectors, can override the 'default' NVIDIADriver instance.
+			if driver.Spec.NodeSelector == nil || len(driver.Spec.NodeSelector) == 0 {
+				return fmt.Errorf("cannot have empty nodeSelector as another CR is configured with an empty nodeSelector: %s", driver.Name)
+			}
+			continue
+		}
+
 		nodeList, err := nsv.getNVIDIADriverSelectedNodes(ctx, &drivers.Items[di])
 		if err != nil {
 			return err

diff --git a/internal/validator/validator_test.go b/internal/validator/validator_test.go
@@ -93,6 +93,7 @@ func makeTestNode(opts ...nodeOptions) *corev1.Node {
 	return n
 }
 
+// TODO: update this test function
 func TestCheckNodeSelector(t *testing.T) {
 	node := makeTestNode(labelled(map[string]string{"os-version": "ubuntu20.04"}))
 	driver := makeTestDriver(nodeSelector(node.Labels))